crca 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. CRCA.py +172 -7
  2. MODEL_CARD.md +53 -0
  3. PKG-INFO +8 -2
  4. RELEASE_NOTES.md +17 -0
  5. STABILITY.md +19 -0
  6. architecture/hybrid/consistency_engine.py +362 -0
  7. architecture/hybrid/conversation_manager.py +421 -0
  8. architecture/hybrid/explanation_generator.py +452 -0
  9. architecture/hybrid/few_shot_learner.py +533 -0
  10. architecture/hybrid/graph_compressor.py +286 -0
  11. architecture/hybrid/hybrid_agent.py +4398 -0
  12. architecture/hybrid/language_compiler.py +623 -0
  13. architecture/hybrid/main,py +0 -0
  14. architecture/hybrid/reasoning_tracker.py +322 -0
  15. architecture/hybrid/self_verifier.py +524 -0
  16. architecture/hybrid/task_decomposer.py +567 -0
  17. architecture/hybrid/text_corrector.py +341 -0
  18. benchmark_results/crca_core_benchmarks.json +178 -0
  19. branches/crca_sd/crca_sd_realtime.py +6 -2
  20. branches/general_agent/__init__.py +102 -0
  21. branches/general_agent/general_agent.py +1400 -0
  22. branches/general_agent/personality.py +169 -0
  23. branches/general_agent/utils/__init__.py +19 -0
  24. branches/general_agent/utils/prompt_builder.py +170 -0
  25. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/METADATA +8 -2
  26. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/RECORD +303 -20
  27. crca_core/__init__.py +35 -0
  28. crca_core/benchmarks/__init__.py +14 -0
  29. crca_core/benchmarks/synthetic_scm.py +103 -0
  30. crca_core/core/__init__.py +23 -0
  31. crca_core/core/api.py +120 -0
  32. crca_core/core/estimate.py +208 -0
  33. crca_core/core/godclass.py +72 -0
  34. crca_core/core/intervention_design.py +174 -0
  35. crca_core/core/lifecycle.py +48 -0
  36. crca_core/discovery/__init__.py +9 -0
  37. crca_core/discovery/tabular.py +193 -0
  38. crca_core/identify/__init__.py +171 -0
  39. crca_core/identify/backdoor.py +39 -0
  40. crca_core/identify/frontdoor.py +48 -0
  41. crca_core/identify/graph.py +106 -0
  42. crca_core/identify/id_algorithm.py +43 -0
  43. crca_core/identify/iv.py +48 -0
  44. crca_core/models/__init__.py +67 -0
  45. crca_core/models/provenance.py +56 -0
  46. crca_core/models/refusal.py +39 -0
  47. crca_core/models/result.py +83 -0
  48. crca_core/models/spec.py +151 -0
  49. crca_core/models/validation.py +68 -0
  50. crca_core/scm/__init__.py +9 -0
  51. crca_core/scm/linear_gaussian.py +198 -0
  52. crca_core/timeseries/__init__.py +6 -0
  53. crca_core/timeseries/pcmci.py +181 -0
  54. crca_llm/__init__.py +12 -0
  55. crca_llm/client.py +85 -0
  56. crca_llm/coauthor.py +118 -0
  57. crca_llm/orchestrator.py +289 -0
  58. crca_llm/types.py +21 -0
  59. crca_reasoning/__init__.py +16 -0
  60. crca_reasoning/critique.py +54 -0
  61. crca_reasoning/godclass.py +206 -0
  62. crca_reasoning/memory.py +24 -0
  63. crca_reasoning/rationale.py +10 -0
  64. crca_reasoning/react_controller.py +81 -0
  65. crca_reasoning/tool_router.py +97 -0
  66. crca_reasoning/types.py +40 -0
  67. crca_sd/__init__.py +15 -0
  68. crca_sd/crca_sd_core.py +2 -0
  69. crca_sd/crca_sd_governance.py +2 -0
  70. crca_sd/crca_sd_mpc.py +2 -0
  71. crca_sd/crca_sd_realtime.py +2 -0
  72. crca_sd/crca_sd_tui.py +2 -0
  73. cuda-keyring_1.1-1_all.deb +0 -0
  74. cuda-keyring_1.1-1_all.deb.1 +0 -0
  75. docs/IMAGE_ANNOTATION_USAGE.md +539 -0
  76. docs/INSTALL_DEEPSPEED.md +125 -0
  77. docs/api/branches/crca-cg.md +19 -0
  78. docs/api/branches/crca-q.md +27 -0
  79. docs/api/branches/crca-sd.md +37 -0
  80. docs/api/branches/general-agent.md +24 -0
  81. docs/api/branches/overview.md +19 -0
  82. docs/api/crca/agent-methods.md +62 -0
  83. docs/api/crca/operations.md +79 -0
  84. docs/api/crca/overview.md +32 -0
  85. docs/api/image-annotation/engine.md +52 -0
  86. docs/api/image-annotation/overview.md +17 -0
  87. docs/api/schemas/annotation.md +34 -0
  88. docs/api/schemas/core-schemas.md +82 -0
  89. docs/api/schemas/overview.md +32 -0
  90. docs/api/schemas/policy.md +30 -0
  91. docs/api/utils/conversation.md +22 -0
  92. docs/api/utils/graph-reasoner.md +32 -0
  93. docs/api/utils/overview.md +21 -0
  94. docs/api/utils/router.md +19 -0
  95. docs/api/utils/utilities.md +97 -0
  96. docs/architecture/causal-graphs.md +41 -0
  97. docs/architecture/data-flow.md +29 -0
  98. docs/architecture/design-principles.md +33 -0
  99. docs/architecture/hybrid-agent/components.md +38 -0
  100. docs/architecture/hybrid-agent/consistency.md +26 -0
  101. docs/architecture/hybrid-agent/overview.md +44 -0
  102. docs/architecture/hybrid-agent/reasoning.md +22 -0
  103. docs/architecture/llm-integration.md +26 -0
  104. docs/architecture/modular-structure.md +37 -0
  105. docs/architecture/overview.md +69 -0
  106. docs/architecture/policy-engine-arch.md +29 -0
  107. docs/branches/crca-cg/corposwarm.md +39 -0
  108. docs/branches/crca-cg/esg-scoring.md +30 -0
  109. docs/branches/crca-cg/multi-agent.md +35 -0
  110. docs/branches/crca-cg/overview.md +40 -0
  111. docs/branches/crca-q/alternative-data.md +55 -0
  112. docs/branches/crca-q/architecture.md +71 -0
  113. docs/branches/crca-q/backtesting.md +45 -0
  114. docs/branches/crca-q/causal-engine.md +33 -0
  115. docs/branches/crca-q/execution.md +39 -0
  116. docs/branches/crca-q/market-data.md +60 -0
  117. docs/branches/crca-q/overview.md +58 -0
  118. docs/branches/crca-q/philosophy.md +60 -0
  119. docs/branches/crca-q/portfolio-optimization.md +66 -0
  120. docs/branches/crca-q/risk-management.md +102 -0
  121. docs/branches/crca-q/setup.md +65 -0
  122. docs/branches/crca-q/signal-generation.md +61 -0
  123. docs/branches/crca-q/signal-validation.md +43 -0
  124. docs/branches/crca-sd/core.md +84 -0
  125. docs/branches/crca-sd/governance.md +53 -0
  126. docs/branches/crca-sd/mpc-solver.md +65 -0
  127. docs/branches/crca-sd/overview.md +59 -0
  128. docs/branches/crca-sd/realtime.md +28 -0
  129. docs/branches/crca-sd/tui.md +20 -0
  130. docs/branches/general-agent/overview.md +37 -0
  131. docs/branches/general-agent/personality.md +36 -0
  132. docs/branches/general-agent/prompt-builder.md +30 -0
  133. docs/changelog/index.md +79 -0
  134. docs/contributing/code-style.md +69 -0
  135. docs/contributing/documentation.md +43 -0
  136. docs/contributing/overview.md +29 -0
  137. docs/contributing/testing.md +29 -0
  138. docs/core/crcagent/async-operations.md +65 -0
  139. docs/core/crcagent/automatic-extraction.md +107 -0
  140. docs/core/crcagent/batch-prediction.md +80 -0
  141. docs/core/crcagent/bayesian-inference.md +60 -0
  142. docs/core/crcagent/causal-graph.md +92 -0
  143. docs/core/crcagent/counterfactuals.md +96 -0
  144. docs/core/crcagent/deterministic-simulation.md +78 -0
  145. docs/core/crcagent/dual-mode-operation.md +82 -0
  146. docs/core/crcagent/initialization.md +88 -0
  147. docs/core/crcagent/optimization.md +65 -0
  148. docs/core/crcagent/overview.md +63 -0
  149. docs/core/crcagent/time-series.md +57 -0
  150. docs/core/schemas/annotation.md +30 -0
  151. docs/core/schemas/core-schemas.md +82 -0
  152. docs/core/schemas/overview.md +30 -0
  153. docs/core/schemas/policy.md +41 -0
  154. docs/core/templates/base-agent.md +31 -0
  155. docs/core/templates/feature-mixins.md +31 -0
  156. docs/core/templates/overview.md +29 -0
  157. docs/core/templates/templates-guide.md +75 -0
  158. docs/core/tools/mcp-client.md +34 -0
  159. docs/core/tools/overview.md +24 -0
  160. docs/core/utils/conversation.md +27 -0
  161. docs/core/utils/graph-reasoner.md +29 -0
  162. docs/core/utils/overview.md +27 -0
  163. docs/core/utils/router.md +27 -0
  164. docs/core/utils/utilities.md +97 -0
  165. docs/css/custom.css +84 -0
  166. docs/examples/basic-usage.md +57 -0
  167. docs/examples/general-agent/general-agent-examples.md +50 -0
  168. docs/examples/hybrid-agent/hybrid-agent-examples.md +56 -0
  169. docs/examples/image-annotation/image-annotation-examples.md +54 -0
  170. docs/examples/integration/integration-examples.md +58 -0
  171. docs/examples/overview.md +37 -0
  172. docs/examples/trading/trading-examples.md +46 -0
  173. docs/features/causal-reasoning/advanced-topics.md +101 -0
  174. docs/features/causal-reasoning/counterfactuals.md +43 -0
  175. docs/features/causal-reasoning/do-calculus.md +50 -0
  176. docs/features/causal-reasoning/overview.md +47 -0
  177. docs/features/causal-reasoning/structural-models.md +52 -0
  178. docs/features/hybrid-agent/advanced-components.md +55 -0
  179. docs/features/hybrid-agent/core-components.md +64 -0
  180. docs/features/hybrid-agent/overview.md +34 -0
  181. docs/features/image-annotation/engine.md +82 -0
  182. docs/features/image-annotation/features.md +113 -0
  183. docs/features/image-annotation/integration.md +75 -0
  184. docs/features/image-annotation/overview.md +53 -0
  185. docs/features/image-annotation/quickstart.md +73 -0
  186. docs/features/policy-engine/doctrine-ledger.md +105 -0
  187. docs/features/policy-engine/monitoring.md +44 -0
  188. docs/features/policy-engine/mpc-control.md +89 -0
  189. docs/features/policy-engine/overview.md +46 -0
  190. docs/getting-started/configuration.md +225 -0
  191. docs/getting-started/first-agent.md +164 -0
  192. docs/getting-started/installation.md +144 -0
  193. docs/getting-started/quickstart.md +137 -0
  194. docs/index.md +118 -0
  195. docs/js/mathjax.js +13 -0
  196. docs/lrm/discovery_proof_notes.md +25 -0
  197. docs/lrm/finetune_full.md +83 -0
  198. docs/lrm/math_appendix.md +120 -0
  199. docs/lrm/overview.md +32 -0
  200. docs/mkdocs.yml +238 -0
  201. docs/stylesheets/extra.css +21 -0
  202. docs_generated/crca_core/CounterfactualResult.md +12 -0
  203. docs_generated/crca_core/DiscoveryHypothesisResult.md +13 -0
  204. docs_generated/crca_core/DraftSpec.md +13 -0
  205. docs_generated/crca_core/EstimateResult.md +13 -0
  206. docs_generated/crca_core/IdentificationResult.md +17 -0
  207. docs_generated/crca_core/InterventionDesignResult.md +12 -0
  208. docs_generated/crca_core/LockedSpec.md +15 -0
  209. docs_generated/crca_core/RefusalResult.md +12 -0
  210. docs_generated/crca_core/ValidationReport.md +9 -0
  211. docs_generated/crca_core/index.md +13 -0
  212. examples/general_agent_example.py +277 -0
  213. examples/general_agent_quickstart.py +202 -0
  214. examples/general_agent_simple.py +92 -0
  215. examples/hybrid_agent_auto_extraction.py +84 -0
  216. examples/hybrid_agent_dictionary_demo.py +104 -0
  217. examples/hybrid_agent_enhanced.py +179 -0
  218. examples/hybrid_agent_general_knowledge.py +107 -0
  219. examples/image_annotation_quickstart.py +328 -0
  220. examples/test_hybrid_fixes.py +77 -0
  221. image_annotation/__init__.py +27 -0
  222. image_annotation/annotation_engine.py +2593 -0
  223. install_cuda_wsl2.sh +59 -0
  224. install_deepspeed.sh +56 -0
  225. install_deepspeed_simple.sh +87 -0
  226. mkdocs.yml +252 -0
  227. ollama/Modelfile +8 -0
  228. prompts/__init__.py +2 -1
  229. prompts/default_crca.py +9 -1
  230. prompts/general_agent.py +227 -0
  231. prompts/image_annotation.py +56 -0
  232. pyproject.toml +17 -2
  233. requirements-docs.txt +10 -0
  234. requirements.txt +21 -2
  235. schemas/__init__.py +26 -1
  236. schemas/annotation.py +222 -0
  237. schemas/conversation.py +193 -0
  238. schemas/hybrid.py +211 -0
  239. schemas/reasoning.py +276 -0
  240. schemas_export/crca_core/CounterfactualResult.schema.json +108 -0
  241. schemas_export/crca_core/DiscoveryHypothesisResult.schema.json +113 -0
  242. schemas_export/crca_core/DraftSpec.schema.json +635 -0
  243. schemas_export/crca_core/EstimateResult.schema.json +113 -0
  244. schemas_export/crca_core/IdentificationResult.schema.json +145 -0
  245. schemas_export/crca_core/InterventionDesignResult.schema.json +111 -0
  246. schemas_export/crca_core/LockedSpec.schema.json +646 -0
  247. schemas_export/crca_core/RefusalResult.schema.json +90 -0
  248. schemas_export/crca_core/ValidationReport.schema.json +62 -0
  249. scripts/build_lrm_dataset.py +80 -0
  250. scripts/export_crca_core_schemas.py +54 -0
  251. scripts/export_hf_lrm.py +37 -0
  252. scripts/export_ollama_gguf.py +45 -0
  253. scripts/generate_changelog.py +157 -0
  254. scripts/generate_crca_core_docs_from_schemas.py +86 -0
  255. scripts/run_crca_core_benchmarks.py +163 -0
  256. scripts/run_full_finetune.py +198 -0
  257. scripts/run_lrm_eval.py +31 -0
  258. templates/graph_management.py +29 -0
  259. tests/conftest.py +9 -0
  260. tests/test_core.py +2 -3
  261. tests/test_crca_core_discovery_tabular.py +15 -0
  262. tests/test_crca_core_estimate_dowhy.py +36 -0
  263. tests/test_crca_core_identify.py +18 -0
  264. tests/test_crca_core_intervention_design.py +36 -0
  265. tests/test_crca_core_linear_gaussian_scm.py +69 -0
  266. tests/test_crca_core_spec.py +25 -0
  267. tests/test_crca_core_timeseries_pcmci.py +15 -0
  268. tests/test_crca_llm_coauthor.py +12 -0
  269. tests/test_crca_llm_orchestrator.py +80 -0
  270. tests/test_hybrid_agent_llm_enhanced.py +556 -0
  271. tests/test_image_annotation_demo.py +376 -0
  272. tests/test_image_annotation_operational.py +408 -0
  273. tests/test_image_annotation_unit.py +551 -0
  274. tests/test_training_moe.py +13 -0
  275. training/__init__.py +42 -0
  276. training/datasets.py +140 -0
  277. training/deepspeed_zero2_0_5b.json +22 -0
  278. training/deepspeed_zero2_1_5b.json +22 -0
  279. training/deepspeed_zero3_0_5b.json +28 -0
  280. training/deepspeed_zero3_14b.json +28 -0
  281. training/deepspeed_zero3_h100_3gpu.json +20 -0
  282. training/deepspeed_zero3_offload.json +28 -0
  283. training/eval.py +92 -0
  284. training/finetune.py +516 -0
  285. training/public_datasets.py +89 -0
  286. training_data/react_train.jsonl +7473 -0
  287. utils/agent_discovery.py +311 -0
  288. utils/batch_processor.py +317 -0
  289. utils/conversation.py +78 -0
  290. utils/edit_distance.py +118 -0
  291. utils/formatter.py +33 -0
  292. utils/graph_reasoner.py +530 -0
  293. utils/rate_limiter.py +283 -0
  294. utils/router.py +2 -2
  295. utils/tool_discovery.py +307 -0
  296. webui/__init__.py +10 -0
  297. webui/app.py +229 -0
  298. webui/config.py +104 -0
  299. webui/static/css/style.css +332 -0
  300. webui/static/js/main.js +284 -0
  301. webui/templates/index.html +42 -0
  302. tests/test_crca_excel.py +0 -166
  303. tests/test_data_broker.py +0 -424
  304. tests/test_palantir.py +0 -349
  305. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/WHEEL +0 -0
  306. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/licenses/LICENSE +0 -0
architecture/hybrid/few_shot_learner.py
@@ -0,0 +1,533 @@
"""
Few-Shot Learning System with Meta-Learning and Pattern Generalization.

Implements meta-learning with MDL-based pattern generalization,
gradient-free optimization, LSH indexing, and Bayesian pattern updating.

Theoretical Basis:
- Meta-Learning (Schmidhuber 1987, Thrun & Pratt 1998)
- Pattern Recognition (Duda & Hart 1973)
- Minimum Description Length (Rissanen 1978)
"""

from typing import Dict, List, Optional, Tuple, Any, Set
from collections import defaultdict
import logging
import re
import math
import hashlib

logger = logging.getLogger(__name__)

# Try to import numpy for numerical operations
try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    NUMPY_AVAILABLE = False
    np = None


class ExampleStore:
    """
    Stores example patterns with LSH indexing for O(1) approximate nearest neighbor search.

    Implements episodic memory with:
    - Input-output pairs: D = {(x₁, y₁), ..., (xₙ, yₙ)}
    - Variable extraction patterns with learned weights
    - Relationship inference patterns
    - Domain-specific templates
    """

    def __init__(self):
        """Initialize example store."""
        self.examples: List[Tuple[str, Dict[str, Any]]] = []  # (input, output) pairs
        self.patterns: List[Dict[str, Any]] = []  # Learned patterns
        self.lsh_index: Dict[str, List[int]] = defaultdict(list)  # LSH buckets
        self.domain_templates: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

    def add_example(
        self,
        input_text: str,
        output: Dict[str, Any]
    ) -> None:
        """
        Add input-output example pair.

        Args:
            input_text: Input natural language text
            output: Extracted structure (variables, edges, etc.)
        """
        self.examples.append((input_text, output))

        # Update LSH index (simplified hash-based indexing)
        hash_key = self._lsh_hash(input_text)
        self.lsh_index[hash_key].append(len(self.examples) - 1)

    def _lsh_hash(self, text: str, num_hashes: int = 5) -> str:
        """
        Compute LSH hash for approximate nearest neighbor search.

        Simplified implementation using multiple hash functions.

        Args:
            text: Text to hash
            num_hashes: Number of hash functions

        Returns:
            Hash key
        """
        # Simple character-based hash
        text_lower = text.lower()
        hashes = []
        for i in range(num_hashes):
            # Use different hash seeds
            hash_obj = hashlib.md5(f"{i}_{text_lower}".encode())
            hashes.append(hash_obj.hexdigest()[:8])
        return "_".join(hashes)

    def find_similar_examples(
        self,
        query: str,
        k: int = 5
    ) -> List[Tuple[str, Dict[str, Any], float]]:
        """
        Find k most similar examples using LSH.

        O(1) approximate nearest neighbor search.

        Args:
            query: Query text
            k: Number of examples to retrieve

        Returns:
            List of (input, output, similarity_score) tuples
        """
        query_hash = self._lsh_hash(query)

        # Get candidates from same LSH bucket
        candidate_indices = set()
        for hash_key in self.lsh_index:
            # Simple similarity: count matching hash components
            similarity = self._hash_similarity(query_hash, hash_key)
            if similarity > 0.3:  # Threshold
                candidate_indices.update(self.lsh_index[hash_key])

        # Compute similarity scores for candidates
        scored_examples = []
        for idx in candidate_indices:
            if idx < len(self.examples):
                input_text, output = self.examples[idx]
                similarity = self._text_similarity(query, input_text)
                scored_examples.append((input_text, output, similarity))

        # Sort by similarity and return top k
        scored_examples.sort(key=lambda x: x[2], reverse=True)
        return scored_examples[:k]

    def _hash_similarity(self, hash1: str, hash2: str) -> float:
        """Compute similarity between two hashes."""
        components1 = set(hash1.split("_"))
        components2 = set(hash2.split("_"))
        if not components1 or not components2:
            return 0.0
        intersection = len(components1 & components2)
        union = len(components1 | components2)
        return intersection / union if union > 0 else 0.0

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Compute Jaccard similarity between texts."""
        words1 = set(re.findall(r'\b\w+\b', text1.lower()))
        words2 = set(re.findall(r'\b\w+\b', text2.lower()))
        if not words1 or not words2:
            return 0.0
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        return intersection / union if union > 0 else 0.0


class PatternLearner:
    """
    Implements gradient-free meta-learning with MDL-based pattern generalization.

    Features:
    - Pattern extraction using Minimum Description Length (MDL)
    - Generalization via abstraction function
    - Dynamic rule updates with exponential moving average
    - Domain adaptation using domain embeddings
    """

    def __init__(self, example_store: ExampleStore):
        """
        Initialize pattern learner.

        Args:
            example_store: ExampleStore instance
        """
        self.example_store = example_store
        self.learned_patterns: List[Dict[str, Any]] = []
        self.pattern_weights: Dict[str, float] = {}

    def learn_from_examples(
        self,
        examples: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
        k: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Learn patterns from examples using MDL principle.

        Algorithm:
            function learn_from_examples(examples):
                patterns = []
                for (x, y) in examples:
                    pattern = extract_pattern(x, y)
                    patterns.append((pattern, compute_mdl(pattern)))
                return select_best_patterns(patterns, k)  // Top-k by MDL

        MDL: L(pattern) = L(data|pattern) + L(pattern)

        Args:
            examples: Optional list of examples (uses store if None)
            k: Number of best patterns to return

        Returns:
            List of learned patterns
        """
        if examples is None:
            examples = self.example_store.examples

        patterns_with_mdl = []

        for input_text, output in examples:
            pattern = self._extract_pattern(input_text, output)
            mdl_score = self._compute_mdl(pattern, input_text, output)
            patterns_with_mdl.append((pattern, mdl_score))

        # Select top-k by MDL (lower is better)
        patterns_with_mdl.sort(key=lambda x: x[1])
        best_patterns = [p for p, _ in patterns_with_mdl[:k]]

        self.learned_patterns = best_patterns

        # Initialize weights
        for i, pattern in enumerate(best_patterns):
            pattern_id = pattern.get('id', f"pattern_{i}")
            self.pattern_weights[pattern_id] = 1.0 / len(best_patterns)

        return best_patterns

    def _extract_pattern(
        self,
        input_text: str,
        output: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Extract pattern from input-output pair.

        Args:
            input_text: Input text
            output: Output structure

        Returns:
            Pattern dictionary
        """
        # Extract variable patterns
        variables = output.get('variables', [])
        edges = output.get('edges', [])

        # Create regex pattern from input
        # Simple: replace specific words with generic patterns
        pattern_text = input_text.lower()

        # Replace variable names with placeholders
        for var in variables:
            pattern_text = re.sub(r'\b' + re.escape(var.lower()) + r'\b', r'\\w+', pattern_text)

        pattern = {
            'id': f"pattern_{hash(input_text) % 10000}",
            'regex': pattern_text,
            'variables': variables,
            'edges': edges,
            'input_template': input_text,
            'output_template': output
        }

        return pattern

    def _compute_mdl(
        self,
        pattern: Dict[str, Any],
        input_text: str,
        output: Dict[str, Any]
    ) -> float:
        """
        Compute Minimum Description Length for pattern.

        MDL: L(pattern) = L(data|pattern) + L(pattern)

        Args:
            pattern: Pattern dictionary
            input_text: Input text
            output: Output structure

        Returns:
            MDL score (lower is better)
        """
        # L(pattern): Description length of pattern
        pattern_length = len(str(pattern))

        # L(data|pattern): Description length of data given pattern
        # Simplified: how well pattern matches data
        match_score = self._pattern_match_score(pattern, input_text, output)
        data_given_pattern = -math.log(match_score + 1e-10)  # Negative log likelihood

        mdl = pattern_length + data_given_pattern
        return mdl

    def _pattern_match_score(
        self,
        pattern: Dict[str, Any],
        input_text: str,
        output: Dict[str, Any]
    ) -> float:
        """Compute how well pattern matches input-output pair."""
        # Simple matching: check if pattern variables match output variables
        pattern_vars = set(pattern.get('variables', []))
        output_vars = set(output.get('variables', []))

        if not pattern_vars or not output_vars:
            return 0.0

        intersection = len(pattern_vars & output_vars)
        union = len(pattern_vars | output_vars)
        return intersection / union if union > 0 else 0.0

    def update_pattern_weights(
        self,
        pattern_id: str,
        success: bool,
        learning_rate: float = 0.1
    ) -> None:
        """
        Update pattern weights using exponential moving average.

        θ_t = (1-η)·θ_{t-1} + η·θ_new

        Args:
            pattern_id: Pattern ID
            success: Whether pattern was successful
            learning_rate: Learning rate η
        """
        if pattern_id not in self.pattern_weights:
            return

        # Update weight based on success
        new_weight = 1.0 if success else 0.0
        current_weight = self.pattern_weights[pattern_id]

        # Exponential moving average
        updated_weight = (1 - learning_rate) * current_weight + learning_rate * new_weight

        self.pattern_weights[pattern_id] = updated_weight


class AdaptiveExtractor:
    """
    Implements adaptive pattern matching with style adaptation and correction learning.

    Features:
    - Learned patterns with confidence-weighted matching
    - Style adaptation using n-gram language model
    - Correction learning from feedback
    - Bayesian updating of pattern confidence
    """

    def __init__(
        self,
        pattern_learner: PatternLearner,
        example_store: ExampleStore
    ):
        """
        Initialize adaptive extractor.

        Args:
            pattern_learner: PatternLearner instance
            example_store: ExampleStore instance
        """
        self.pattern_learner = pattern_learner
        self.example_store = example_store
        self.user_style_model: Dict[str, float] = defaultdict(float)  # n-gram model

    def adapt_extraction(
        self,
        input_text: str,
        learned_patterns: Optional[List[Dict[str, Any]]] = None
    ) -> Dict[str, Any]:
        """
        Adapt extraction using learned patterns.

        Algorithm:
            function adapt_extraction(input, learned_patterns):
                scores = [match_score(input, p) for p in learned_patterns]
                best_pattern = argmax(scores)
                return apply_pattern(best_pattern, input)

        Args:
            input_text: Input text to extract from
            learned_patterns: Optional list of patterns (uses learner's patterns if None)

        Returns:
            Extracted structure
        """
        if learned_patterns is None:
            learned_patterns = self.pattern_learner.learned_patterns

        if not learned_patterns:
            # No patterns learned yet, return empty structure
            return {'variables': [], 'edges': []}

        # Compute match scores
        scores = []
        for pattern in learned_patterns:
            score = self._match_score(input_text, pattern)
            pattern_id = pattern.get('id', '')
            weight = self.pattern_learner.pattern_weights.get(pattern_id, 0.5)
            # Weighted score
            weighted_score = score * weight
            scores.append((pattern, weighted_score))

        # Get best pattern
        if not scores:
            return {'variables': [], 'edges': []}

        best_pattern, best_score = max(scores, key=lambda x: x[1])

        # Apply pattern
        return self._apply_pattern(input_text, best_pattern)

    def _match_score(
        self,
        input_text: str,
        pattern: Dict[str, Any]
    ) -> float:
        """
        Compute match score between input and pattern.

        score = Σᵢ wᵢ·match(patternᵢ, input)

        Args:
            input_text: Input text
            pattern: Pattern dictionary

        Returns:
            Match score (0.0-1.0)
        """
        # Simple regex matching
        regex = pattern.get('regex', '')
        if not regex:
            return 0.0

        try:
            match = re.search(regex, input_text.lower())
            if match:
                return 1.0
            else:
                # Partial match: word overlap
                pattern_words = set(re.findall(r'\b\w+\b', regex))
                input_words = set(re.findall(r'\b\w+\b', input_text.lower()))
                if not pattern_words or not input_words:
                    return 0.0
                intersection = len(pattern_words & input_words)
                union = len(pattern_words | input_words)
                return intersection / union if union > 0 else 0.0
        except re.error:
            return 0.0

    def _apply_pattern(
        self,
        input_text: str,
        pattern: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Apply pattern to input text.

        Args:
            input_text: Input text
            pattern: Pattern dictionary

        Returns:
            Extracted structure
        """
        # Use pattern's output template as base
        output_template = pattern.get('output_template', {})

        # Extract variables from input (simplified)
        variables = []
        edges = []

        # Try to match pattern variables in input
        pattern_vars = pattern.get('variables', [])
        for var in pattern_vars:
            # Look for variable in input
            if var.lower() in input_text.lower():
                variables.append(var)

        # Use pattern edges if variables match
        pattern_edges = pattern.get('edges', [])
        for source, target in pattern_edges:
            if source in variables and target in variables:
                edges.append((source, target))

        return {
            'variables': variables,
            'edges': edges,
            'confidence': 0.8,  # Default confidence
            'pattern_id': pattern.get('id', '')
        }

    def learn_from_correction(
        self,
        input_text: str,
        correction: Dict[str, Any],
        pattern_id: Optional[str] = None
    ) -> None:
        """
        Learn from correction feedback.

        Implements online learning: update(pattern, correction)

        Args:
            input_text: Original input
            correction: Corrected output
            pattern_id: Optional pattern ID that was used
        """
        # Add correction as new example
        self.example_store.add_example(input_text, correction)

        # Update pattern weights if pattern_id provided
        if pattern_id:
            self.pattern_learner.update_pattern_weights(pattern_id, success=True)

        # Re-learn patterns (simplified: just add to examples)
        # In full implementation, would trigger re-learning

    def update_style_model(
        self,
        text: str,
        n: int = 2
    ) -> None:
        """
        Update user-specific language model using n-grams.

        P(user_word | context) using n-gram model

        Args:
            text: Text to learn from
            n: N-gram size
        """
        words = re.findall(r'\b\w+\b', text.lower())

        # Generate n-grams
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i+n])
            self.user_style_model[str(ngram)] += 1.0
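For orientation, a minimal usage sketch of the classes introduced in this new module (ExampleStore, PatternLearner, AdaptiveExtractor). The import path mirrors the package layout in the file list above; the example sentences and extracted structures are illustrative assumptions, not data shipped with the package.

# Minimal usage sketch for the 1.5.0 few-shot learner (illustrative inputs only).
from architecture.hybrid.few_shot_learner import (
    ExampleStore,
    PatternLearner,
    AdaptiveExtractor,
)

# Seed the episodic memory with (input text, extracted structure) pairs.
store = ExampleStore()
store.add_example(
    "smoking causes cancer",
    {"variables": ["smoking", "cancer"], "edges": [("smoking", "cancer")]},
)
store.add_example(
    "exercise reduces stress",
    {"variables": ["exercise", "stress"], "edges": [("exercise", "stress")]},
)

# Learn the top-k patterns by MDL score and initialize their weights.
learner = PatternLearner(store)
patterns = learner.learn_from_examples(k=5)

# Extract a structure for new text via confidence-weighted pattern matching.
extractor = AdaptiveExtractor(learner, store)
result = extractor.adapt_extraction("smoking causes heart disease")
print(result["variables"], result["edges"], result.get("pattern_id"))

# Feed back a corrected structure; it is stored as a new example and the
# pattern weight is updated with the exponential moving average rule.
extractor.learn_from_correction(
    "smoking causes heart disease",
    {"variables": ["smoking", "heart disease"],
     "edges": [("smoking", "heart disease")]},
    pattern_id=result.get("pattern_id"),
)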