odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +17 -17
- benchmarks/datasets.py +284 -284
- benchmarks/metrics.py +275 -275
- benchmarks/run_ablation.py +279 -279
- benchmarks/run_npll_benchmark.py +270 -270
- npll/__init__.py +10 -10
- npll/bootstrap.py +474 -474
- npll/core/__init__.py +33 -33
- npll/core/knowledge_graph.py +308 -308
- npll/core/logical_rules.py +496 -496
- npll/core/mln.py +474 -474
- npll/inference/__init__.py +40 -40
- npll/inference/e_step.py +419 -419
- npll/inference/elbo.py +434 -434
- npll/inference/m_step.py +576 -576
- npll/npll_model.py +631 -631
- npll/scoring/__init__.py +42 -42
- npll/scoring/embeddings.py +441 -441
- npll/scoring/probability.py +402 -402
- npll/scoring/scoring_module.py +369 -369
- npll/training/__init__.py +24 -24
- npll/training/evaluation.py +496 -496
- npll/training/npll_trainer.py +520 -520
- npll/utils/__init__.py +47 -47
- npll/utils/batch_utils.py +492 -492
- npll/utils/config.py +144 -144
- npll/utils/math_utils.py +338 -338
- odin/__init__.py +21 -20
- odin/engine.py +264 -264
- odin/schema.py +210 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
- odin_engine-0.2.0.dist-info/RECORD +63 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
- retrieval/__init__.py +50 -50
- retrieval/adapters.py +140 -140
- retrieval/adapters_arango.py +1418 -1418
- retrieval/aggregators.py +707 -707
- retrieval/beam.py +127 -127
- retrieval/budget.py +60 -60
- retrieval/cache.py +159 -159
- retrieval/confidence.py +88 -88
- retrieval/eval.py +49 -49
- retrieval/linker.py +87 -87
- retrieval/metrics.py +105 -105
- retrieval/metrics_motifs.py +36 -36
- retrieval/orchestrator.py +571 -571
- retrieval/ppr/__init__.py +12 -12
- retrieval/ppr/anchors.py +41 -41
- retrieval/ppr/bippr.py +61 -61
- retrieval/ppr/engines.py +257 -257
- retrieval/ppr/global_pr.py +76 -76
- retrieval/ppr/indexes.py +78 -78
- retrieval/ppr.py +156 -156
- retrieval/ppr_cache.py +25 -25
- retrieval/scoring.py +294 -294
- retrieval/utils/pii_redaction.py +36 -36
- retrieval/writers/__init__.py +9 -9
- retrieval/writers/arango_writer.py +28 -28
- retrieval/writers/base.py +21 -21
- retrieval/writers/janus_writer.py +36 -36
- odin_engine-0.1.0.dist-info/RECORD +0 -62
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
npll/utils/config.py
CHANGED
@@ -1,145 +1,145 @@
(Lines 1–144 are marked as removed and re-added, but the removed and re-added text is identical as rendered here; line 145 is unchanged. The file content is therefore listed once below.)

"""
Configuration for Neural Probabilistic Logic Learning (NPLL)
Hyperparameters and settings based on the paper specifications
"""

from dataclasses import dataclass
from typing import List, Optional
import torch


@dataclass
class NPLLConfig:
    """Configuration class for NPLL implementation following paper specifications"""

    # Entity and Relation Embedding Dimensions (Paper Section 4.1)
    entity_embedding_dim: int = 256       # d-dimensional entity embeddings
    relation_embedding_dim: int = 256     # d-dimensional relation embeddings
    rule_embedding_dim: int = 512         # k-dimensional rule embeddings

    # Scoring Module Parameters (Equation 7)
    # g(l, eh, et) = u^T_R f(e^T_h W_R et + V_R [eh; et] + b_R)
    scoring_hidden_dim: int = 512         # k dimension for scoring function
    scoring_activation: str = "relu"      # Non-linear activation function f

    # MLN Parameters (Equations 1-2)
    max_rule_length: int = 3              # Maximum atoms per rule premise
    max_ground_rules: int = 1000          # Maximum ground rules per batch
    temperature: float = 1.0              # Temperature scaling for calibration

    # Training Hyperparameters (Paper Section 5)
    learning_rate: float = 0.0005         # Initial learning rate from paper
    batch_size: int = 128                 # Batch size for ground rule sampling
    max_epochs: int = 100                 # Maximum training epochs
    patience: int = 20                    # Early stopping patience

    # E-M Algorithm Parameters (Sections 4.2-4.3)
    em_iterations: int = 10               # Number of E-M alternations per epoch
    convergence_threshold: float = 1e-4   # Convergence criterion for E-M
    # Extended convergence controls
    elbo_rel_tol: float = 1e-4            # relative ELBO tol
    weight_abs_tol: float = 1e-4          # weight change tol
    convergence_patience: int = 3         # number of consecutive hits required

    # Regularization and Optimization
    dropout: float = 0.1                  # Dropout rate
    weight_decay: float = 0.01            # L2 regularization
    grad_clip_norm: float = 1.0           # Gradient clipping

    # ELBO Optimization (Equation 5)
    elbo_weight: float = 1.0              # Weight for ELBO term
    kl_weight: float = 1.0                # Weight for KL divergence term

    # Mean-field Approximation (Equation 8)
    mean_field_iterations: int = 5        # Iterations for mean-field convergence

    # Pseudo-log-likelihood (Equation 13)
    pseudo_likelihood: bool = True        # Use pseudo-likelihood in M-step
    markov_blanket_size: int = 10         # Size of Markov blanket

    # Device and Performance
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 4                  # DataLoader workers
    pin_memory: bool = True               # Pin memory for GPU

    # Evaluation Settings (Paper Section 5.2)
    eval_batch_size: int = 256            # Evaluation batch size
    eval_metrics: List[str] = None        # MRR, Hit@1, Hit@3, Hit@10
    filtered_evaluation: bool = True      # Filtered setting from paper

    # Dataset-specific Settings (Paper datasets)
    dataset_name: str = "ArangoDB_Triples"  # Default dataset
    train_ratio: float = 1.0              # Training data ratio (for data efficiency experiments)

    # Logging and Checkpointing
    log_interval: int = 10                # Log every N epochs
    save_interval: int = 50               # Save model every N epochs
    checkpoint_dir: str = "checkpoints/"

    def __post_init__(self):
        """Initialize derived configurations"""
        if self.eval_metrics is None:
            self.eval_metrics = ["MRR", "Hit@1", "Hit@3", "Hit@10"]

        # Ensure scoring dimensions are consistent
        assert self.scoring_hidden_dim > 0, "Scoring hidden dimension must be positive"
        assert self.entity_embedding_dim == self.relation_embedding_dim, \
            "Entity and relation embedding dimensions must match (paper assumption)"


# Paper-specific configurations for different datasets
FB15K_237_CONFIG = NPLLConfig(
    dataset_name="FB15k-237",
    entity_embedding_dim=256,
    relation_embedding_dim=256,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=200
)

WN18RR_CONFIG = NPLLConfig(
    dataset_name="WN18RR",
    entity_embedding_dim=256,
    relation_embedding_dim=256,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=200
)

UMLS_CONFIG = NPLLConfig(
    dataset_name="UMLS",
    entity_embedding_dim=128,
    relation_embedding_dim=128,
    rule_embedding_dim=256,
    learning_rate=0.001,
    max_epochs=100
)

KINSHIP_CONFIG = NPLLConfig(
    dataset_name="Kinship",
    entity_embedding_dim=512,
    relation_embedding_dim=512,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=150
)


def get_config(dataset_name: str) -> NPLLConfig:
    """Get dataset-specific configuration"""
    configs = {
        "FB15k-237": FB15K_237_CONFIG,
        "WN18RR": WN18RR_CONFIG,
        "UMLS": UMLS_CONFIG,
        "Kinship": KINSHIP_CONFIG
    }

    if dataset_name in configs:
        return configs[dataset_name]
    else:
        print(f"Warning: Unknown dataset {dataset_name}, using default ArangoDB_Triples config")
        return FB15K_237_CONFIG


# Export default config
default_config = FB15K_237_CONFIG
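
For orientation, a minimal usage sketch of this configuration module follows. It assumes the package-relative import path npll.utils.config implied by the file layout above; the dataset name and override values are illustrative only, not recommendations from the package.

# Minimal sketch (not from the package docs): selecting and adapting an NPLL
# configuration. Import path npll.utils.config is assumed from the wheel layout.
from dataclasses import replace

from npll.utils.config import NPLLConfig, get_config

# Dataset-specific preset; unknown names fall back to the default with a warning.
cfg = get_config("WN18RR")

# NPLLConfig is a dataclass, so dataclasses.replace() returns a modified copy
# and re-runs __post_init__, which re-checks the embedding-dimension assertion.
small_cfg = replace(cfg, entity_embedding_dim=128, relation_embedding_dim=128,
                    batch_size=64)

# Building a configuration directly works the same way.
custom_cfg = NPLLConfig(dataset_name="UMLS", entity_embedding_dim=128,
                        relation_embedding_dim=128, rule_embedding_dim=256)

print(small_cfg.device, small_cfg.eval_metrics)   # e.g. cpu ['MRR', 'Hit@1', 'Hit@3', 'Hit@10']

Because the presets (FB15K_237_CONFIG, WN18RR_CONFIG, etc.) are module-level shared objects, deriving variants with dataclasses.replace() rather than mutating them in place keeps the defaults intact for other callers.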