odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (63)
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
npll/utils/config.py CHANGED
@@ -1,145 +1,145 @@
(All 144 lines are removed and re-added with identical visible content, suggesting a whitespace- or line-ending-only change; the source is shown once below.)
"""
Configuration for Neural Probabilistic Logic Learning (NPLL)
Hyperparameters and settings based on the paper specifications
"""

from dataclasses import dataclass
from typing import List, Optional
import torch


@dataclass
class NPLLConfig:
    """Configuration class for NPLL implementation following paper specifications"""

    # Entity and Relation Embedding Dimensions (Paper Section 4.1)
    entity_embedding_dim: int = 256       # d-dimensional entity embeddings
    relation_embedding_dim: int = 256     # d-dimensional relation embeddings
    rule_embedding_dim: int = 512         # k-dimensional rule embeddings

    # Scoring Module Parameters (Equation 7)
    # g(l, eh, et) = u^T_R f(e^T_h W_R et + V_R [eh; et] + b_R)
    scoring_hidden_dim: int = 512         # k dimension for scoring function
    scoring_activation: str = "relu"      # Non-linear activation function f

    # MLN Parameters (Equations 1-2)
    max_rule_length: int = 3              # Maximum atoms per rule premise
    max_ground_rules: int = 1000          # Maximum ground rules per batch
    temperature: float = 1.0              # Temperature scaling for calibration

    # Training Hyperparameters (Paper Section 5)
    learning_rate: float = 0.0005         # Initial learning rate from paper
    batch_size: int = 128                 # Batch size for ground rule sampling
    max_epochs: int = 100                 # Maximum training epochs
    patience: int = 20                    # Early stopping patience

    # E-M Algorithm Parameters (Sections 4.2-4.3)
    em_iterations: int = 10               # Number of E-M alternations per epoch
    convergence_threshold: float = 1e-4   # Convergence criterion for E-M
    # Extended convergence controls
    elbo_rel_tol: float = 1e-4            # relative ELBO tol
    weight_abs_tol: float = 1e-4          # weight change tol
    convergence_patience: int = 3         # number of consecutive hits required

    # Regularization and Optimization
    dropout: float = 0.1                  # Dropout rate
    weight_decay: float = 0.01            # L2 regularization
    grad_clip_norm: float = 1.0           # Gradient clipping

    # ELBO Optimization (Equation 5)
    elbo_weight: float = 1.0              # Weight for ELBO term
    kl_weight: float = 1.0                # Weight for KL divergence term

    # Mean-field Approximation (Equation 8)
    mean_field_iterations: int = 5        # Iterations for mean-field convergence

    # Pseudo-log-likelihood (Equation 13)
    pseudo_likelihood: bool = True        # Use pseudo-likelihood in M-step
    markov_blanket_size: int = 10         # Size of Markov blanket

    # Device and Performance
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 4                  # DataLoader workers
    pin_memory: bool = True               # Pin memory for GPU

    # Evaluation Settings (Paper Section 5.2)
    eval_batch_size: int = 256            # Evaluation batch size
    eval_metrics: List[str] = None        # MRR, Hit@1, Hit@3, Hit@10
    filtered_evaluation: bool = True      # Filtered setting from paper

    # Dataset-specific Settings (Paper datasets)
    dataset_name: str = "ArangoDB_Triples"  # Default dataset
    train_ratio: float = 1.0              # Training data ratio (for data efficiency experiments)

    # Logging and Checkpointing
    log_interval: int = 10                # Log every N epochs
    save_interval: int = 50               # Save model every N epochs
    checkpoint_dir: str = "checkpoints/"

    def __post_init__(self):
        """Initialize derived configurations"""
        if self.eval_metrics is None:
            self.eval_metrics = ["MRR", "Hit@1", "Hit@3", "Hit@10"]

        # Ensure scoring dimensions are consistent
        assert self.scoring_hidden_dim > 0, "Scoring hidden dimension must be positive"
        assert self.entity_embedding_dim == self.relation_embedding_dim, \
            "Entity and relation embedding dimensions must match (paper assumption)"


# Paper-specific configurations for different datasets
FB15K_237_CONFIG = NPLLConfig(
    dataset_name="FB15k-237",
    entity_embedding_dim=256,
    relation_embedding_dim=256,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=200
)

WN18RR_CONFIG = NPLLConfig(
    dataset_name="WN18RR",
    entity_embedding_dim=256,
    relation_embedding_dim=256,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=200
)

UMLS_CONFIG = NPLLConfig(
    dataset_name="UMLS",
    entity_embedding_dim=128,
    relation_embedding_dim=128,
    rule_embedding_dim=256,
    learning_rate=0.001,
    max_epochs=100
)

KINSHIP_CONFIG = NPLLConfig(
    dataset_name="Kinship",
    entity_embedding_dim=512,
    relation_embedding_dim=512,
    rule_embedding_dim=512,
    learning_rate=0.0005,
    max_epochs=150
)


def get_config(dataset_name: str) -> NPLLConfig:
    """Get dataset-specific configuration"""
    configs = {
        "FB15k-237": FB15K_237_CONFIG,
        "WN18RR": WN18RR_CONFIG,
        "UMLS": UMLS_CONFIG,
        "Kinship": KINSHIP_CONFIG
    }

    if dataset_name in configs:
        return configs[dataset_name]
    else:
        print(f"Warning: Unknown dataset {dataset_name}, using default ArangoDB_Triples config")
        return FB15K_237_CONFIG


# Export default config
  default_config = FB15K_237_CONFIG
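
For orientation, a minimal usage sketch of this module as published (it assumes only the names defined above; the npll.utils.config import path follows the file's location in the wheel):

    from npll.utils.config import NPLLConfig, default_config, get_config

    # Known dataset names return the matching preset; unknown names fall back
    # to FB15K_237_CONFIG (despite the warning message naming ArangoDB_Triples).
    cfg = get_config("WN18RR")
    print(cfg.max_epochs)                # 200
    print(default_config.dataset_name)   # "FB15k-237"

    # Custom configs must keep entity and relation dimensions equal,
    # or __post_init__ raises an AssertionError.
    custom = NPLLConfig(entity_embedding_dim=128, relation_embedding_dim=128)
    print(custom.eval_metrics)           # ["MRR", "Hit@1", "Hit@3", "Hit@10"], filled in __post_init__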