crca 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. CRCA.py +172 -7
  2. MODEL_CARD.md +53 -0
  3. PKG-INFO +8 -2
  4. RELEASE_NOTES.md +17 -0
  5. STABILITY.md +19 -0
  6. architecture/hybrid/consistency_engine.py +362 -0
  7. architecture/hybrid/conversation_manager.py +421 -0
  8. architecture/hybrid/explanation_generator.py +452 -0
  9. architecture/hybrid/few_shot_learner.py +533 -0
  10. architecture/hybrid/graph_compressor.py +286 -0
  11. architecture/hybrid/hybrid_agent.py +4398 -0
  12. architecture/hybrid/language_compiler.py +623 -0
  13. architecture/hybrid/main,py +0 -0
  14. architecture/hybrid/reasoning_tracker.py +322 -0
  15. architecture/hybrid/self_verifier.py +524 -0
  16. architecture/hybrid/task_decomposer.py +567 -0
  17. architecture/hybrid/text_corrector.py +341 -0
  18. benchmark_results/crca_core_benchmarks.json +178 -0
  19. branches/crca_sd/crca_sd_realtime.py +6 -2
  20. branches/general_agent/__init__.py +102 -0
  21. branches/general_agent/general_agent.py +1400 -0
  22. branches/general_agent/personality.py +169 -0
  23. branches/general_agent/utils/__init__.py +19 -0
  24. branches/general_agent/utils/prompt_builder.py +170 -0
  25. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/METADATA +8 -2
  26. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/RECORD +303 -20
  27. crca_core/__init__.py +35 -0
  28. crca_core/benchmarks/__init__.py +14 -0
  29. crca_core/benchmarks/synthetic_scm.py +103 -0
  30. crca_core/core/__init__.py +23 -0
  31. crca_core/core/api.py +120 -0
  32. crca_core/core/estimate.py +208 -0
  33. crca_core/core/godclass.py +72 -0
  34. crca_core/core/intervention_design.py +174 -0
  35. crca_core/core/lifecycle.py +48 -0
  36. crca_core/discovery/__init__.py +9 -0
  37. crca_core/discovery/tabular.py +193 -0
  38. crca_core/identify/__init__.py +171 -0
  39. crca_core/identify/backdoor.py +39 -0
  40. crca_core/identify/frontdoor.py +48 -0
  41. crca_core/identify/graph.py +106 -0
  42. crca_core/identify/id_algorithm.py +43 -0
  43. crca_core/identify/iv.py +48 -0
  44. crca_core/models/__init__.py +67 -0
  45. crca_core/models/provenance.py +56 -0
  46. crca_core/models/refusal.py +39 -0
  47. crca_core/models/result.py +83 -0
  48. crca_core/models/spec.py +151 -0
  49. crca_core/models/validation.py +68 -0
  50. crca_core/scm/__init__.py +9 -0
  51. crca_core/scm/linear_gaussian.py +198 -0
  52. crca_core/timeseries/__init__.py +6 -0
  53. crca_core/timeseries/pcmci.py +181 -0
  54. crca_llm/__init__.py +12 -0
  55. crca_llm/client.py +85 -0
  56. crca_llm/coauthor.py +118 -0
  57. crca_llm/orchestrator.py +289 -0
  58. crca_llm/types.py +21 -0
  59. crca_reasoning/__init__.py +16 -0
  60. crca_reasoning/critique.py +54 -0
  61. crca_reasoning/godclass.py +206 -0
  62. crca_reasoning/memory.py +24 -0
  63. crca_reasoning/rationale.py +10 -0
  64. crca_reasoning/react_controller.py +81 -0
  65. crca_reasoning/tool_router.py +97 -0
  66. crca_reasoning/types.py +40 -0
  67. crca_sd/__init__.py +15 -0
  68. crca_sd/crca_sd_core.py +2 -0
  69. crca_sd/crca_sd_governance.py +2 -0
  70. crca_sd/crca_sd_mpc.py +2 -0
  71. crca_sd/crca_sd_realtime.py +2 -0
  72. crca_sd/crca_sd_tui.py +2 -0
  73. cuda-keyring_1.1-1_all.deb +0 -0
  74. cuda-keyring_1.1-1_all.deb.1 +0 -0
  75. docs/IMAGE_ANNOTATION_USAGE.md +539 -0
  76. docs/INSTALL_DEEPSPEED.md +125 -0
  77. docs/api/branches/crca-cg.md +19 -0
  78. docs/api/branches/crca-q.md +27 -0
  79. docs/api/branches/crca-sd.md +37 -0
  80. docs/api/branches/general-agent.md +24 -0
  81. docs/api/branches/overview.md +19 -0
  82. docs/api/crca/agent-methods.md +62 -0
  83. docs/api/crca/operations.md +79 -0
  84. docs/api/crca/overview.md +32 -0
  85. docs/api/image-annotation/engine.md +52 -0
  86. docs/api/image-annotation/overview.md +17 -0
  87. docs/api/schemas/annotation.md +34 -0
  88. docs/api/schemas/core-schemas.md +82 -0
  89. docs/api/schemas/overview.md +32 -0
  90. docs/api/schemas/policy.md +30 -0
  91. docs/api/utils/conversation.md +22 -0
  92. docs/api/utils/graph-reasoner.md +32 -0
  93. docs/api/utils/overview.md +21 -0
  94. docs/api/utils/router.md +19 -0
  95. docs/api/utils/utilities.md +97 -0
  96. docs/architecture/causal-graphs.md +41 -0
  97. docs/architecture/data-flow.md +29 -0
  98. docs/architecture/design-principles.md +33 -0
  99. docs/architecture/hybrid-agent/components.md +38 -0
  100. docs/architecture/hybrid-agent/consistency.md +26 -0
  101. docs/architecture/hybrid-agent/overview.md +44 -0
  102. docs/architecture/hybrid-agent/reasoning.md +22 -0
  103. docs/architecture/llm-integration.md +26 -0
  104. docs/architecture/modular-structure.md +37 -0
  105. docs/architecture/overview.md +69 -0
  106. docs/architecture/policy-engine-arch.md +29 -0
  107. docs/branches/crca-cg/corposwarm.md +39 -0
  108. docs/branches/crca-cg/esg-scoring.md +30 -0
  109. docs/branches/crca-cg/multi-agent.md +35 -0
  110. docs/branches/crca-cg/overview.md +40 -0
  111. docs/branches/crca-q/alternative-data.md +55 -0
  112. docs/branches/crca-q/architecture.md +71 -0
  113. docs/branches/crca-q/backtesting.md +45 -0
  114. docs/branches/crca-q/causal-engine.md +33 -0
  115. docs/branches/crca-q/execution.md +39 -0
  116. docs/branches/crca-q/market-data.md +60 -0
  117. docs/branches/crca-q/overview.md +58 -0
  118. docs/branches/crca-q/philosophy.md +60 -0
  119. docs/branches/crca-q/portfolio-optimization.md +66 -0
  120. docs/branches/crca-q/risk-management.md +102 -0
  121. docs/branches/crca-q/setup.md +65 -0
  122. docs/branches/crca-q/signal-generation.md +61 -0
  123. docs/branches/crca-q/signal-validation.md +43 -0
  124. docs/branches/crca-sd/core.md +84 -0
  125. docs/branches/crca-sd/governance.md +53 -0
  126. docs/branches/crca-sd/mpc-solver.md +65 -0
  127. docs/branches/crca-sd/overview.md +59 -0
  128. docs/branches/crca-sd/realtime.md +28 -0
  129. docs/branches/crca-sd/tui.md +20 -0
  130. docs/branches/general-agent/overview.md +37 -0
  131. docs/branches/general-agent/personality.md +36 -0
  132. docs/branches/general-agent/prompt-builder.md +30 -0
  133. docs/changelog/index.md +79 -0
  134. docs/contributing/code-style.md +69 -0
  135. docs/contributing/documentation.md +43 -0
  136. docs/contributing/overview.md +29 -0
  137. docs/contributing/testing.md +29 -0
  138. docs/core/crcagent/async-operations.md +65 -0
  139. docs/core/crcagent/automatic-extraction.md +107 -0
  140. docs/core/crcagent/batch-prediction.md +80 -0
  141. docs/core/crcagent/bayesian-inference.md +60 -0
  142. docs/core/crcagent/causal-graph.md +92 -0
  143. docs/core/crcagent/counterfactuals.md +96 -0
  144. docs/core/crcagent/deterministic-simulation.md +78 -0
  145. docs/core/crcagent/dual-mode-operation.md +82 -0
  146. docs/core/crcagent/initialization.md +88 -0
  147. docs/core/crcagent/optimization.md +65 -0
  148. docs/core/crcagent/overview.md +63 -0
  149. docs/core/crcagent/time-series.md +57 -0
  150. docs/core/schemas/annotation.md +30 -0
  151. docs/core/schemas/core-schemas.md +82 -0
  152. docs/core/schemas/overview.md +30 -0
  153. docs/core/schemas/policy.md +41 -0
  154. docs/core/templates/base-agent.md +31 -0
  155. docs/core/templates/feature-mixins.md +31 -0
  156. docs/core/templates/overview.md +29 -0
  157. docs/core/templates/templates-guide.md +75 -0
  158. docs/core/tools/mcp-client.md +34 -0
  159. docs/core/tools/overview.md +24 -0
  160. docs/core/utils/conversation.md +27 -0
  161. docs/core/utils/graph-reasoner.md +29 -0
  162. docs/core/utils/overview.md +27 -0
  163. docs/core/utils/router.md +27 -0
  164. docs/core/utils/utilities.md +97 -0
  165. docs/css/custom.css +84 -0
  166. docs/examples/basic-usage.md +57 -0
  167. docs/examples/general-agent/general-agent-examples.md +50 -0
  168. docs/examples/hybrid-agent/hybrid-agent-examples.md +56 -0
  169. docs/examples/image-annotation/image-annotation-examples.md +54 -0
  170. docs/examples/integration/integration-examples.md +58 -0
  171. docs/examples/overview.md +37 -0
  172. docs/examples/trading/trading-examples.md +46 -0
  173. docs/features/causal-reasoning/advanced-topics.md +101 -0
  174. docs/features/causal-reasoning/counterfactuals.md +43 -0
  175. docs/features/causal-reasoning/do-calculus.md +50 -0
  176. docs/features/causal-reasoning/overview.md +47 -0
  177. docs/features/causal-reasoning/structural-models.md +52 -0
  178. docs/features/hybrid-agent/advanced-components.md +55 -0
  179. docs/features/hybrid-agent/core-components.md +64 -0
  180. docs/features/hybrid-agent/overview.md +34 -0
  181. docs/features/image-annotation/engine.md +82 -0
  182. docs/features/image-annotation/features.md +113 -0
  183. docs/features/image-annotation/integration.md +75 -0
  184. docs/features/image-annotation/overview.md +53 -0
  185. docs/features/image-annotation/quickstart.md +73 -0
  186. docs/features/policy-engine/doctrine-ledger.md +105 -0
  187. docs/features/policy-engine/monitoring.md +44 -0
  188. docs/features/policy-engine/mpc-control.md +89 -0
  189. docs/features/policy-engine/overview.md +46 -0
  190. docs/getting-started/configuration.md +225 -0
  191. docs/getting-started/first-agent.md +164 -0
  192. docs/getting-started/installation.md +144 -0
  193. docs/getting-started/quickstart.md +137 -0
  194. docs/index.md +118 -0
  195. docs/js/mathjax.js +13 -0
  196. docs/lrm/discovery_proof_notes.md +25 -0
  197. docs/lrm/finetune_full.md +83 -0
  198. docs/lrm/math_appendix.md +120 -0
  199. docs/lrm/overview.md +32 -0
  200. docs/mkdocs.yml +238 -0
  201. docs/stylesheets/extra.css +21 -0
  202. docs_generated/crca_core/CounterfactualResult.md +12 -0
  203. docs_generated/crca_core/DiscoveryHypothesisResult.md +13 -0
  204. docs_generated/crca_core/DraftSpec.md +13 -0
  205. docs_generated/crca_core/EstimateResult.md +13 -0
  206. docs_generated/crca_core/IdentificationResult.md +17 -0
  207. docs_generated/crca_core/InterventionDesignResult.md +12 -0
  208. docs_generated/crca_core/LockedSpec.md +15 -0
  209. docs_generated/crca_core/RefusalResult.md +12 -0
  210. docs_generated/crca_core/ValidationReport.md +9 -0
  211. docs_generated/crca_core/index.md +13 -0
  212. examples/general_agent_example.py +277 -0
  213. examples/general_agent_quickstart.py +202 -0
  214. examples/general_agent_simple.py +92 -0
  215. examples/hybrid_agent_auto_extraction.py +84 -0
  216. examples/hybrid_agent_dictionary_demo.py +104 -0
  217. examples/hybrid_agent_enhanced.py +179 -0
  218. examples/hybrid_agent_general_knowledge.py +107 -0
  219. examples/image_annotation_quickstart.py +328 -0
  220. examples/test_hybrid_fixes.py +77 -0
  221. image_annotation/__init__.py +27 -0
  222. image_annotation/annotation_engine.py +2593 -0
  223. install_cuda_wsl2.sh +59 -0
  224. install_deepspeed.sh +56 -0
  225. install_deepspeed_simple.sh +87 -0
  226. mkdocs.yml +252 -0
  227. ollama/Modelfile +8 -0
  228. prompts/__init__.py +2 -1
  229. prompts/default_crca.py +9 -1
  230. prompts/general_agent.py +227 -0
  231. prompts/image_annotation.py +56 -0
  232. pyproject.toml +17 -2
  233. requirements-docs.txt +10 -0
  234. requirements.txt +21 -2
  235. schemas/__init__.py +26 -1
  236. schemas/annotation.py +222 -0
  237. schemas/conversation.py +193 -0
  238. schemas/hybrid.py +211 -0
  239. schemas/reasoning.py +276 -0
  240. schemas_export/crca_core/CounterfactualResult.schema.json +108 -0
  241. schemas_export/crca_core/DiscoveryHypothesisResult.schema.json +113 -0
  242. schemas_export/crca_core/DraftSpec.schema.json +635 -0
  243. schemas_export/crca_core/EstimateResult.schema.json +113 -0
  244. schemas_export/crca_core/IdentificationResult.schema.json +145 -0
  245. schemas_export/crca_core/InterventionDesignResult.schema.json +111 -0
  246. schemas_export/crca_core/LockedSpec.schema.json +646 -0
  247. schemas_export/crca_core/RefusalResult.schema.json +90 -0
  248. schemas_export/crca_core/ValidationReport.schema.json +62 -0
  249. scripts/build_lrm_dataset.py +80 -0
  250. scripts/export_crca_core_schemas.py +54 -0
  251. scripts/export_hf_lrm.py +37 -0
  252. scripts/export_ollama_gguf.py +45 -0
  253. scripts/generate_changelog.py +157 -0
  254. scripts/generate_crca_core_docs_from_schemas.py +86 -0
  255. scripts/run_crca_core_benchmarks.py +163 -0
  256. scripts/run_full_finetune.py +198 -0
  257. scripts/run_lrm_eval.py +31 -0
  258. templates/graph_management.py +29 -0
  259. tests/conftest.py +9 -0
  260. tests/test_core.py +2 -3
  261. tests/test_crca_core_discovery_tabular.py +15 -0
  262. tests/test_crca_core_estimate_dowhy.py +36 -0
  263. tests/test_crca_core_identify.py +18 -0
  264. tests/test_crca_core_intervention_design.py +36 -0
  265. tests/test_crca_core_linear_gaussian_scm.py +69 -0
  266. tests/test_crca_core_spec.py +25 -0
  267. tests/test_crca_core_timeseries_pcmci.py +15 -0
  268. tests/test_crca_llm_coauthor.py +12 -0
  269. tests/test_crca_llm_orchestrator.py +80 -0
  270. tests/test_hybrid_agent_llm_enhanced.py +556 -0
  271. tests/test_image_annotation_demo.py +376 -0
  272. tests/test_image_annotation_operational.py +408 -0
  273. tests/test_image_annotation_unit.py +551 -0
  274. tests/test_training_moe.py +13 -0
  275. training/__init__.py +42 -0
  276. training/datasets.py +140 -0
  277. training/deepspeed_zero2_0_5b.json +22 -0
  278. training/deepspeed_zero2_1_5b.json +22 -0
  279. training/deepspeed_zero3_0_5b.json +28 -0
  280. training/deepspeed_zero3_14b.json +28 -0
  281. training/deepspeed_zero3_h100_3gpu.json +20 -0
  282. training/deepspeed_zero3_offload.json +28 -0
  283. training/eval.py +92 -0
  284. training/finetune.py +516 -0
  285. training/public_datasets.py +89 -0
  286. training_data/react_train.jsonl +7473 -0
  287. utils/agent_discovery.py +311 -0
  288. utils/batch_processor.py +317 -0
  289. utils/conversation.py +78 -0
  290. utils/edit_distance.py +118 -0
  291. utils/formatter.py +33 -0
  292. utils/graph_reasoner.py +530 -0
  293. utils/rate_limiter.py +283 -0
  294. utils/router.py +2 -2
  295. utils/tool_discovery.py +307 -0
  296. webui/__init__.py +10 -0
  297. webui/app.py +229 -0
  298. webui/config.py +104 -0
  299. webui/static/css/style.css +332 -0
  300. webui/static/js/main.js +284 -0
  301. webui/templates/index.html +42 -0
  302. tests/test_crca_excel.py +0 -166
  303. tests/test_data_broker.py +0 -424
  304. tests/test_palantir.py +0 -349
  305. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/WHEEL +0 -0
  306. {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,4398 @@
+ """
+ Enhanced Symbolic-Statistical Hybrid AI - General Purpose LLM Replacement System.
+
+ A production-ready, stable, pure symbolic-statistical reasoning agent
+ that can replace LLMs entirely without actually using one.
+
+ Supports both:
+ - Causal Reasoning (CRCA): Causal analysis, counterfactuals, interventions
+ - General Knowledge: Facts, definitions, taxonomic relationships, spatial/temporal knowledge
+
+ Key Features:
+ - Graph-first reasoning: All answers come from graph state, never text parsing
+ - Enhanced NLU: Comprehensive pattern matching for causal AND general knowledge
+ - Enhanced NLG: Natural, conversational responses with pragmatic tone adjustment
+ - Non-destructive text correction: Handles spelling, abbreviations, grammar
+ - Language compilation: Three-layer system (lexical, grammatical, pragmatic)
+ - Multi-domain support: Causal, taxonomic, spatial, temporal, functional relationships
+ - Graph compression: Composite nodes, latent factors, abstraction
+ - Provenance tracking: Every edge tracks its source and confidence decay
+ - Robust error handling: Graceful degradation, validation, fallback responses
+
+ Relationship Types Supported:
+ - Causal: affects, causes, influences, depends on, leads to
+ - Taxonomic: is-a, type-of, belongs-to, classified-as
+ - Meronymic: part-of, consists-of, contains, has
+ - Spatial: located-in, found-in
+ - Temporal: before, after, precedes, follows
+ - Functional: used-for, functions-as
+ - Definitional: is, means, refers-to, defined-as
+ - Factual: was, became, changed-to
+
+ CRITICAL: Epistemic Validation (for causal tasks)
+ For causal reasoning tasks, this agent requires explicit causal structure, not intent statements. It will:
+ - REJECT action verbs (identify, analyze, examine) as causal variables
+ - REJECT epistemic terms (policy, task, goal, decision) as causal variables
+ - WARN when structure is inferred from syntax alone
+ - ERROR when the task is epistemically underspecified (no valid state variables)
+
+ For general knowledge tasks, the agent is more flexible and can extract:
+ - Facts and definitions
+ - Taxonomic relationships
+ - Spatial and temporal information
+ - Properties and attributes
+
+ This system is designed to be stable, reliable, and capable of replacing LLMs
+ for both causal reasoning AND general knowledge tasks while maintaining natural language interaction.
+ """
+
+ import re
+ import json
+ from typing import Dict, List, Optional, Tuple, Any, Set
+ import logging
+ from collections import defaultdict, deque
+
+ import numpy as np
+
+ # Optional dependencies
+ try:
+     import pandas as pd
+     PANDAS_AVAILABLE = True
+ except ImportError:
+     PANDAS_AVAILABLE = False
+
+ # Import CRCA templates
+ from templates.graph_management import GraphManager
+ from templates.statistical_methods import StatisticalMethods
+ from templates.prediction_framework import PredictionFramework, CounterfactualScenario
+
+ # Import new graph-first components
+ from utils.graph_reasoner import GraphFirstReasoner
+ from architecture.hybrid.graph_compressor import GraphCompressor
+ from architecture.hybrid.language_compiler import LexicalCompiler, GrammaticalCompiler, PragmaticCompiler
+ from architecture.hybrid.text_corrector import TextCorrector
+ from schemas.hybrid import EdgeProvenance, TemporalEdge, TemporalType
+
+ # Import new LLM-enhanced components
+ from architecture.hybrid.conversation_manager import ConversationHistory, ContextTracker
+ from architecture.hybrid.reasoning_tracker import ReasoningTracker
+ from architecture.hybrid.few_shot_learner import ExampleStore, PatternLearner, AdaptiveExtractor
+ from architecture.hybrid.task_decomposer import TaskAnalyzer, SubTaskExecutor, PlanGenerator
+ from architecture.hybrid.explanation_generator import ExplanationBuilder, TransparencyLayer
+ from architecture.hybrid.self_verifier import ConsistencyChecker, ErrorDetector, SelfCorrector
+ from architecture.hybrid.consistency_engine import ConsistencyEngine
+ from schemas.conversation import ConversationContext, MessageRole
+ from schemas.reasoning import ReasoningChain, StepType, InferenceRule, Evidence
+
+ logger = logging.getLogger(__name__)
+
+
+ class SymbolicReasoner:
+     """
+     Advanced symbolic reasoning engine for natural language understanding and causal extraction.
+
+     Uses sophisticated pattern matching, semantic analysis, and context-aware parsing
+     to extract causal variables and relationships from natural language tasks.
+
+     Enhanced Features:
+     - Action Verb Understanding: Extracts state variables from action verbs (e.g., "identify X" -> extracts "X")
+     - Epistemic Term Understanding: Extracts state variables from epistemic terms (e.g., "policy of X" -> extracts "X")
+     - Vague Language Handling: Understands vague language patterns like "what affects X", "factors influencing Y"
+     - Semantic Role Analysis: Understands that action verbs and epistemic terms are signals, not variables themselves
+     - Context-Aware Extraction: Uses context to infer relationships even from vague descriptions
+     """
+
+     def __init__(
+         self,
+         graph_manager: GraphManager,
+         lexical_compiler: Optional[Any] = None,
+         adaptive_extractor: Optional[Any] = None
+     ):
+         """
+         Initialize symbolic reasoner.
+
+         Args:
+             graph_manager: GraphManager instance for graph operations
+             lexical_compiler: Optional LexicalCompiler instance for dictionary validation
+             adaptive_extractor: Optional AdaptiveExtractor for few-shot learning
+         """
+         self.graph_manager = graph_manager
+         self.lexical_compiler = lexical_compiler
+         self.adaptive_extractor = adaptive_extractor
+
+         # Comprehensive pattern definitions for extracting causal relationships
+         # Updated patterns to handle numerical values, conditionals, and state descriptions
+         self.patterns = [
+             # Direct causal verbs (with optional numerical values)
+             (r'(\w+(?:\s+\w+)?)\s+(?:depends?\s+on|depends?\s+upon)\s+(\w+(?:\s+\w+)?)', 'depends_on', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+causes?\s+(\w+(?:\s+\w+)?)', 'causes', 0.95),
+             (r'(\w+(?:\s+\w+)?)\s+affects?\s+(\w+(?:\s+\w+)?)', 'affects', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+influences?\s+(\w+(?:\s+\w+)?)', 'influences', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+leads?\s+to\s+(\w+(?:\s+\w+)?)', 'leads_to', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+results?\s+in\s+(\w+(?:\s+\w+)?)', 'results_in', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+impacts?\s+(\w+(?:\s+\w+)?)', 'impacts', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+drives?\s+(\w+(?:\s+\w+)?)', 'drives', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+determines?\s+(\w+(?:\s+\w+)?)', 'determines', 0.95),
+             (r'(\w+(?:\s+\w+)?)\s+controls?\s+(\w+(?:\s+\w+)?)', 'controls', 0.9),
+
+             # Passive voice patterns
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:affected|influenced|determined|controlled|driven)\s+by\s+(\w+(?:\s+\w+)?)', 'affected_by', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+caused\s+by\s+(\w+(?:\s+\w+)?)', 'caused_by', 0.95),
+             (r'(\w+(?:\s+\w+)?)\s+results?\s+from\s+(\w+(?:\s+\w+)?)', 'results_from', 0.9),
+
+             # State description patterns (X is Y, X = Y, X: Y)
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:\d+[.,]?\d*|[\d%]+|[a-z]+)', 'state_description', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s*[=:]\s*(?:\d+[.,]?\d*|[\d%]+)', 'state_equals', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s+of\s+(\d+[.,]?\d*%?|\w+)', 'state_of', 0.6),
+
+             # Conditional patterns (enhanced)
+             (r'if\s+(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+(?:\d+[.,]?\d*|[\d%]+|\w+)\s*,?\s*(?:then\s+)?(?:what\s+)?(?:is|are|will|would)\s+(\w+(?:\s+\w+)?)', 'conditional_question', 0.85),
+             (r'if\s+(\w+(?:\s+\w+)?)\s+then\s+(\w+(?:\s+\w+)?)', 'conditional', 0.85),
+             (r'when\s+(\w+(?:\s+\w+)?)\s+,\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+when\s+(\w+(?:\s+\w+)?)', 'temporal_reverse', 0.8),
+
+             # Question patterns (what is X, what will X be, etc.)
+             (r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)', 'question_target', 0.8),
+             (r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)\s+(?:of|in|for|after|in\s+\d+\s+days)', 'question_target_time', 0.85),
+
+             # Arrow notation ("->" and "-->")
+             (r'(\w+(?:\s+\w+)?)\s*(?:-{1,2}>)\s*(\w+(?:\s+\w+)?)', 'arrow', 0.95),
+             (r'(\w+(?:\s+\w+)?)\s*=>\s*(\w+(?:\s+\w+)?)', 'arrow', 0.95),
+
+             # Comparative patterns
+             (r'(\w+(?:\s+\w+)?)\s+increases?\s+(\w+(?:\s+\w+)?)', 'increases', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+decreases?\s+(\w+(?:\s+\w+)?)', 'decreases', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+raises?\s+(\w+(?:\s+\w+)?)', 'increases', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+lowers?\s+(\w+(?:\s+\w+)?)', 'decreases', 0.85),
+
+             # Correlation patterns (weaker causality)
+             (r'(\w+(?:\s+\w+)?)\s+is\s+correlated\s+with\s+(\w+(?:\s+\w+)?)', 'correlated', 0.6),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+related\s+to\s+(\w+(?:\s+\w+)?)', 'related', 0.5),
+
+             # Implicit relationships (X and Y, X with Y)
+             (r'(\w+(?:\s+\w+)?)\s+and\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|determine|control)', 'implicit_and', 0.6),
+
+             # NEW: Vague language patterns - "what affects X", "factors influencing X"
+             (r'what\s+(?:affects|influences|causes|impacts|changes)\s+(\w+(?:\s+\w+)?)', 'vague_causal', 0.6),
+             (r'factors?\s+(?:affecting|influencing|causing|impacting)\s+(\w+(?:\s+\w+)?)', 'vague_causal', 0.6),
+             (r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause|impact)', 'vague_causal', 0.6),
+             # NEW: Relationship patterns - "relationship between X and Y"
+             (r'relationship\s+(?:between|among)\s+(\w+(?:\s+\w+)?)\s+(?:and|&)\s+(\w+(?:\s+\w+)?)', 'relationship', 0.7),
+             (r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+relate\s+to\s+(\w+(?:\s+\w+)?)', 'relationship', 0.7),
+             (r'effect\s+of\s+(\w+(?:\s+\w+)?)\s+on\s+(\w+(?:\s+\w+)?)', 'causal', 0.8),
+
+             # Enhanced patterns for better coverage
+             # Temporal patterns
+             (r'(\w+(?:\s+\w+)?)\s+before\s+(\w+(?:\s+\w+)?)', 'before', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+after\s+(\w+(?:\s+\w+)?)', 'after', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+leads?\s+to\s+(\w+(?:\s+\w+)?)\s+in\s+(\d+)\s+(?:days?|hours?|weeks?|months?)', 'delayed', 0.9),
+
+             # Comparative and quantitative
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:higher|greater|larger|bigger)\s+than\s+(\w+(?:\s+\w+)?)', 'greater_than', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:lower|smaller|less)\s+than\s+(\w+(?:\s+\w+)?)', 'less_than', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s+varies?\s+with\s+(\w+(?:\s+\w+)?)', 'varies_with', 0.75),
+
+             # Question patterns (enhanced)
+             (r'what\s+(?:happens?|occurs?|results?)\s+(?:if|when)\s+(\w+(?:\s+\w+)?)', 'what_if', 0.9),
+             (r'how\s+(?:does|do|will|would)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|impact)\s+(\w+(?:\s+\w+)?)', 'how_affects', 0.9),
+             (r'why\s+(?:does|do|is|are)\s+(\w+(?:\s+\w+)?)', 'why_question', 0.8),
+
+             # Multi-variable patterns
+             (r'(\w+(?:\s+\w+)?)\s+(?:together\s+with|along\s+with|combined\s+with)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause)', 'combined_effect', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+(?:and|or)\s+(\w+(?:\s+\w+)?)\s+(?:both|together)\s+(?:affect|influence|determine)', 'joint_effect', 0.75),
+
+             # ====================================================================
+             # GENERAL KNOWLEDGE PATTERNS (Non-Causal Relationships)
+             # ====================================================================
+
+             # Taxonomic/Classification patterns (is-a, type-of)
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+type\s+of\s+(\w+(?:\s+\w+)?)', 'is_a', 0.95),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+kind\s+of\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+belongs?\s+to\s+(\w+(?:\s+\w+)?)', 'belongs_to', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+classified\s+as\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+
+             # Property/Has patterns
+             (r'(\w+(?:\s+\w+)?)\s+has\s+(\w+(?:\s+\w+)?)', 'has_property', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+has\s+(?:a|an)\s+(\w+(?:\s+\w+)?)', 'has_property', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+possesses?\s+(\w+(?:\s+\w+)?)', 'has_property', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+contains?\s+(\w+(?:\s+\w+)?)', 'contains', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+includes?\s+(\w+(?:\s+\w+)?)', 'includes', 0.8),
+
+             # Part-Whole patterns (meronymy)
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+part\s+of\s+(\w+(?:\s+\w+)?)', 'part_of', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+part\s+of\s+(\w+(?:\s+\w+)?)', 'part_of', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+belongs?\s+to\s+(\w+(?:\s+\w+)?)', 'part_of', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+consists?\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+composed\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+made\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+made\s+up\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.85),
+
+             # Location/Spatial patterns
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+located\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+found\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+resides?\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+
+             # Definition patterns (X is Y, X means Y, X refers to Y)
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:defined\s+as|means?|refers?\s+to)\s+(\w+(?:\s+\w+)?)', 'defined_as', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(\w+(?:\s+\w+)?)', 'is', 0.7),  # General "is" (weaker)
+             (r'(\w+(?:\s+\w+)?)\s+means?\s+(\w+(?:\s+\w+)?)', 'means', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+refers?\s+to\s+(\w+(?:\s+\w+)?)', 'refers_to', 0.85),
+
+             # Similarity/Equivalence patterns
+             (r'(\w+(?:\s+\w+)?)\s+is\s+(?:similar\s+to|like|equivalent\s+to)\s+(\w+(?:\s+\w+)?)', 'similar_to', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+the\s+same\s+as\s+(\w+(?:\s+\w+)?)', 'equivalent_to', 0.9),
+             (r'(\w+(?:\s+\w+)?)\s+equals?\s+(\w+(?:\s+\w+)?)', 'equivalent_to', 0.85),
+
+             # Temporal patterns (general knowledge)
+             (r'(\w+(?:\s+\w+)?)\s+occurs?\s+(?:before|after|during)\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+happens?\s+(?:before|after|during)\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+precedes?\s+(\w+(?:\s+\w+)?)', 'precedes', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+follows?\s+(\w+(?:\s+\w+)?)', 'follows', 0.85),
+
+             # Purpose/Function patterns
+             (r'(\w+(?:\s+\w+)?)\s+is\s+used\s+(?:for|to)\s+(\w+(?:\s+\w+)?)', 'used_for', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+serves?\s+to\s+(\w+(?:\s+\w+)?)', 'used_for', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+functions?\s+as\s+(\w+(?:\s+\w+)?)', 'functions_as', 0.85),
+             (r'(\w+(?:\s+\w+)?)\s+is\s+for\s+(\w+(?:\s+\w+)?)', 'used_for', 0.8),
+
+             # General knowledge question patterns (the optional article keeps its
+             # trailing space inside the group so "what is X" with no article also matches)
+             (r'what\s+is\s+(?:(?:a|an|the)\s+)?(\w+(?:\s+\w+)?)', 'what_is', 0.9),
+             (r'what\s+are\s+(?:(?:a|an|the)\s+)?(\w+(?:\s+\w+)?)', 'what_is', 0.9),
+             (r'who\s+is\s+(?:(?:a|an|the)\s+)?(\w+(?:\s+\w+)?)', 'who_is', 0.9),
+             (r'where\s+is\s+(?:(?:a|an|the)\s+)?(\w+(?:\s+\w+)?)', 'where_is', 0.9),
+             (r'when\s+(?:is|was|does|did)\s+(?:(?:a|an|the)\s+)?(\w+(?:\s+\w+)?)', 'when_is', 0.85),
+             (r'how\s+(?:does|do|is|are)\s+(\w+(?:\s+\w+)?)\s+work', 'how_works', 0.85),
+             (r'what\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+mean', 'what_means', 0.9),
+
+             # Factual statement patterns
+             (r'(\w+(?:\s+\w+)?)\s+was\s+(\w+(?:\s+\w+)?)', 'factual', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s+were\s+(\w+(?:\s+\w+)?)', 'factual', 0.7),
+             (r'(\w+(?:\s+\w+)?)\s+became\s+(\w+(?:\s+\w+)?)', 'became', 0.8),
+             (r'(\w+(?:\s+\w+)?)\s+changed\s+to\s+(\w+(?:\s+\w+)?)', 'changed_to', 0.8),
+         ]
+
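+         # Worked example (illustrative, not executed here): applying the
+         # 'causes' pattern above to "smoking causes cancer" yields the
+         # groups ('smoking', 'cancer'), which _extract_with_context records as
+         # {'source': 'smoking', 'target': 'cancer', 'type': 'causes',
+         #  'category': 'causal', 'confidence': 0.95}.
+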
+         # Extended keywords for identifying variables (domain-agnostic)
+         self.variable_keywords = [
+             # General terms
+             'variable', 'factor', 'metric', 'indicator', 'measure', 'parameter',
+             'dimension', 'attribute', 'feature', 'component', 'element',
+             'concept', 'entity', 'object', 'item', 'thing', 'subject', 'topic',
+
+             # Business/Economics
+             'price', 'demand', 'supply', 'sales', 'revenue', 'cost', 'profit',
+             'margin', 'growth', 'market', 'customer', 'product', 'service',
+
+             # Quality/Performance
+             'satisfaction', 'quality', 'performance', 'efficiency', 'effectiveness',
+             'productivity', 'output', 'throughput', 'latency', 'speed',
+
+             # Social/Psychological
+             'happiness', 'wellbeing', 'stress', 'motivation', 'engagement',
+             'retention', 'turnover', 'loyalty', 'trust',
+
+             # General Knowledge entities
+             'person', 'place', 'location', 'country', 'city', 'organization',
+             'company', 'institution', 'event', 'date', 'time', 'period',
+             'category', 'type', 'class', 'group', 'species', 'genre',
+
+             # Technical
+             'temperature', 'pressure', 'voltage', 'current', 'frequency',
+             'bandwidth', 'capacity', 'utilization', 'availability',
+         ]
+
+         # Stop words to filter out
+         self.stop_words = {
+             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+             'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+             'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+             'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this',
+             'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'where',
+             'when', 'why', 'how', 'all', 'each', 'every', 'some', 'any', 'no',
+             'not', 'only', 'just', 'also', 'too', 'very', 'more', 'most', 'less',
+             'least', 'many', 'much', 'few', 'little', 'other', 'another', 'same',
+             'different', 'such', 'own', 'so', 'than', 'then', 'there', 'here',
+         }
+
+         # Causal verb synonyms for better matching
+         self.causal_verbs = {
+             'cause', 'causes', 'caused', 'affect', 'affects', 'affected',
+             'influence', 'influences', 'influenced', 'impact', 'impacts', 'impacted',
+             'determine', 'determines', 'determined', 'control', 'controls', 'controlled',
+             'drive', 'drives', 'driven', 'lead', 'leads', 'led', 'result', 'results',
+             'increase', 'increases', 'increased', 'decrease', 'decreases', 'decreased',
+             'raise', 'raises', 'raised', 'lower', 'lowers', 'lowered',
+         }
+
+         # Negation words
+         self.negation_words = {'not', 'no', 'never', 'none', 'nothing', 'nobody', 'nowhere', 'neither', 'nor'}
+
+         # Quantifier words
+         self.quantifier_words = {'all', 'some', 'many', 'most', 'few', 'several', 'each', 'every', 'any'}
+
+         # Action verbs that should NEVER be treated as causal variables
+         # These are epistemic/intentional actions, not state variables
+         self.action_verbs = {
+             'identify', 'analyze', 'examine', 'study', 'investigate', 'explore',
+             'determine', 'find', 'discover', 'detect', 'recognize', 'understand',
+             'explain', 'describe', 'define', 'specify', 'clarify', 'elucidate',
+             'predict', 'forecast', 'estimate', 'calculate', 'compute', 'measure',
+             'evaluate', 'assess', 'judge', 'compare', 'contrast', 'differentiate',
+             'recommend', 'suggest', 'propose', 'advise', 'counsel', 'guide',
+             'implement', 'execute', 'perform', 'conduct', 'carry out',  # multi-word verb; the regex interpolation below handles the space
+             'create', 'generate', 'produce', 'make', 'build', 'construct',
+             'modify', 'change', 'alter', 'adjust', 'update', 'revise',
+             'remove', 'delete', 'eliminate', 'exclude', 'omit', 'skip',
+             'add', 'include', 'insert', 'append', 'attach', 'incorporate',
+             'process', 'handle', 'manage', 'control', 'operate', 'run',
+             'check', 'verify', 'validate', 'confirm', 'test', 'trial',
+             'show', 'display', 'present', 'demonstrate', 'illustrate', 'reveal',
+             'report', 'document', 'record', 'log', 'track', 'monitor',
+             'request', 'ask', 'query', 'question', 'inquire', 'interrogate',
+             'provide', 'supply', 'deliver', 'offer', 'give', 'send',
+             'receive', 'obtain', 'acquire', 'get', 'fetch', 'retrieve',
+             'use', 'utilize', 'employ', 'apply', 'leverage', 'exploit',
+             'consider', 'think', 'contemplate', 'reflect', 'ponder', 'muse',
+             'decide', 'choose', 'select', 'pick', 'opt', 'prefer',
+             'plan', 'design', 'scheme', 'devise', 'formulate', 'develop',
+             'solve', 'resolve', 'fix', 'repair', 'correct', 'rectify',
+             'learn', 'teach', 'train', 'educate', 'instruct', 'coach',
+             'help', 'assist', 'aid', 'support', 'facilitate', 'enable'
+         }
+
+         # Epistemic/intentional terms that indicate tasks, not causal variables
+         self.epistemic_terms = {
+             'task', 'goal', 'objective', 'aim', 'purpose', 'intent', 'intention',
+             'requirement', 'specification', 'criteria', 'standard', 'benchmark',
+             'policy', 'strategy', 'approach', 'method', 'technique', 'procedure',
+             'process', 'workflow', 'pipeline', 'system', 'framework', 'model',
+             'analysis', 'study', 'research', 'investigation', 'examination',
+             'result', 'outcome', 'consequence', 'effect', 'impact', 'influence',
+             'finding', 'discovery', 'insight', 'observation', 'conclusion',
+             'recommendation', 'suggestion', 'advice', 'guidance', 'direction',
+             'decision', 'choice', 'selection', 'option', 'alternative',
+             'problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'barrier',
+             'solution', 'answer', 'resolution', 'fix', 'remedy', 'cure',
+             'question', 'query', 'inquiry', 'request', 'demand', 'need'
+         }
+
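+     # Filtering contract (illustrative): given "analyze market trends",
+     # "analyze" is in self.action_verbs, so it can never become a graph node;
+     # only its object "market trends" survives extraction. Likewise "policy"
+     # in "past policy" is in self.epistemic_terms and is rejected as a variable.
+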
+     def _normalize_variable_name(self, var: str) -> str:
+         """
+         Normalize variable name by cleaning and standardizing.
+
+         Args:
+             var: Raw variable name
+
+         Returns:
+             Normalized variable name
+         """
+         if not var:
+             return ''
+
+         # Remove extra whitespace
+         var = ' '.join(var.split())
+
+         # Remove common articles and prepositions at start
+         words = var.split()
+         while words and words[0].lower() in {'the', 'a', 'an', 'of', 'for', 'in', 'on', 'at', 'to', 'from'}:
+             words = words[1:]
+
+         var = ' '.join(words)
+
+         # Convert to lowercase for consistency
+         return var.lower().strip()
+
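+     # Example (illustrative): _normalize_variable_name("  The Market Demand ")
+     # collapses the whitespace, strips the leading article, and returns
+     # "market demand".
+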
+     def _extract_noun_phrases(self, text: str) -> List[str]:
+         """
+         Extract noun phrases from text using pattern matching.
+
+         Args:
+             text: Input text
+
+         Returns:
+             List of noun phrases
+         """
+         noun_phrases = []
+
+         # Heuristic pattern: runs of lowercase words, optionally ending in a
+         # nominal suffix (a rough approximation of "adjective* noun+")
+         pattern = r'\b(?:[a-z]+(?:\s+[a-z]+)*\s+)?(?:[a-z]+(?:ing|ed|tion|sion|ment|ness|ity|ance|ence)?)\b'
+         matches = re.finditer(pattern, text.lower())
+
+         for match in matches:
+             phrase = match.group(0).strip()
+             # Filter out stop words and very short phrases
+             words = phrase.split()
+             if len(words) >= 1 and not all(w in self.stop_words for w in words):
+                 # Remove stop words from beginning/end
+                 while words and words[0] in self.stop_words:
+                     words = words[1:]
+                 while words and words[-1] in self.stop_words:
+                     words = words[:-1]
+                 if words:
+                     noun_phrases.append(' '.join(words))
+
+         return list(set(noun_phrases))
+
+     def _detect_negation(self, text: str, start_pos: int, end_pos: int) -> bool:
+         """
+         Detect if a phrase is negated.
+
+         Args:
+             text: Full text
+             start_pos: Start position of phrase
+             end_pos: End position of phrase
+
+         Returns:
+             True if negated
+         """
+         # Check the 20 characters before the phrase for a whole-word negation
+         # (word-boundary match, so e.g. "no" does not fire inside "know")
+         before = text[max(0, start_pos - 20):start_pos].lower()
+         for neg_word in self.negation_words:
+             if re.search(rf'\b{neg_word}\b', before):
+                 return True
+         return False
+
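+     # Example (illustrative): for "it is not true that smoking causes cancer",
+     # the 'causes' match starts at "smoking"; the 20-character window before it
+     # contains the whole word "not", so _detect_negation returns True and the
+     # relationship's confidence is scaled by 0.3 in _extract_with_context.
+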
+     def _extract_state_variables_from_action_verbs(self, text: str) -> Set[str]:
+         """
+         Extract state variables from action verbs by finding what they refer to.
+
+         Example: "identify past policy" -> extract "policy" (but filter if epistemic)
+         Example: "analyze price trends" -> extract "price", "trends"
+         Example: "determine demand level" -> extract "demand", "level"
+
+         Args:
+             text: Input text
+
+         Returns:
+             Set of extracted state variable names
+         """
+         extracted_vars = set()
+         text_lower = text.lower()
+
+         # Pattern: action_verb + (optional adverb) + noun_phrase
+         # Match: "identify X", "analyze the X", "determine X", etc.
+         for action_verb in self.action_verbs:
+             # Pattern 1: "action_verb [the/a/an] noun_phrase"
+             pattern1 = rf'\b{action_verb}\s+(?:the|a|an)?\s*(\w+(?:\s+\w+)?)'
+             matches = re.finditer(pattern1, text_lower, re.IGNORECASE)
+             for match in matches:
+                 noun_phrase = match.group(1).strip()
+                 # Clean and validate
+                 cleaned = self._normalize_variable_name(noun_phrase)
+                 if cleaned and not self._is_action_verb(cleaned):
+                     # Skip epistemic terms here: finding what an epistemic term
+                     # refers to (e.g., "past policy" -> whatever the policy
+                     # governs) is harder, so we leave it to the other methods
+                     if self._is_epistemic_term(cleaned):
+                         continue
+                     extracted_vars.add(cleaned)
+
+             # Pattern 2: "action_verb [what/which/how] noun_phrase"
+             pattern2 = rf'\b{action_verb}\s+(?:what|which|how)\s+(\w+(?:\s+\w+)?)'
+             matches = re.finditer(pattern2, text_lower, re.IGNORECASE)
+             for match in matches:
+                 noun_phrase = match.group(1).strip()
+                 cleaned = self._normalize_variable_name(noun_phrase)
+                 if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                     extracted_vars.add(cleaned)
+
+         return extracted_vars
+
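+     # Example (illustrative): "analyze price trends" matches Pattern 1 above
+     # with action_verb="analyze"; the captured object "price trends" passes
+     # the action-verb and epistemic-term filters and is returned as a state
+     # variable.
+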
+     def _extract_state_variables_from_epistemic_terms(self, text: str) -> Set[str]:
+         """
+         Extract state variables from epistemic terms by finding what they refer to.
+
+         Example: "past policy" -> if we can find what policy refers to, extract that
+         Example: "task goal" -> extract the underlying state variable the goal refers to
+         Example: "policy decision" -> extract what the decision affects
+
+         Args:
+             text: Input text
+
+         Returns:
+             Set of extracted state variable names
+         """
+         extracted_vars = set()
+         text_lower = text.lower()
+
+         # Pattern: epistemic_term + "of" + noun_phrase
+         # E.g., "policy of X", "goal of Y"
+         for epistemic_term in self.epistemic_terms:
+             pattern1 = rf'\b{epistemic_term}\s+of\s+(\w+(?:\s+\w+)?)'
+             matches = re.finditer(pattern1, text_lower, re.IGNORECASE)
+             for match in matches:
+                 noun_phrase = match.group(1).strip()
+                 cleaned = self._normalize_variable_name(noun_phrase)
+                 if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                     extracted_vars.add(cleaned)
+
+         # Pattern: adjective + epistemic_term -> extract what it modifies
+         # E.g., "past policy" -> look for what policy affects
+         # This is harder - we'll use context clues
+         epistemic_patterns = [
+             r'past\s+(\w+)',      # "past X" -> X might be a state variable if not epistemic
+             r'(\w+)\s+policy',    # "X policy" -> X might be what policy affects
+             r'(\w+)\s+decision',  # "X decision" -> X might be what decision affects
+         ]
+
+         for pattern in epistemic_patterns:
+             matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+             for match in matches:
+                 noun_phrase = match.group(1).strip()
+                 cleaned = self._normalize_variable_name(noun_phrase)
+                 # Only add if it's not an action verb or epistemic term itself
+                 if (cleaned and
+                         not self._is_action_verb(cleaned) and
+                         not self._is_epistemic_term(cleaned) and
+                         cleaned not in self.stop_words):
+                     extracted_vars.add(cleaned)
+
+         return extracted_vars
+
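+     # Example (illustrative): "the policy of interest rates" matches
+     # "policy of X" (pattern1), extracting "interest rates"; "monetary policy"
+     # matches the r'(\w+)\s+policy' context pattern, extracting "monetary".
+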
+     def _extract_variables_from_vague_language(self, text: str) -> Set[str]:
+         """
+         Extract state variables from vague language using semantic understanding.
+
+         Handles patterns like:
+         - "what affects X" -> extract X and what affects it
+         - "how does X relate to Y" -> extract X, Y
+         - "the relationship between X and Y" -> extract X, Y
+         - "factors influencing X" -> extract X and factors
+
+         Args:
+             text: Input text
+
+         Returns:
+             Set of extracted state variable names
+         """
+         extracted_vars = set()
+         text_lower = text.lower()
+
+         # Pattern: "what affects/influences/causes X"
+         affect_patterns = [
+             r'what\s+(?:affects|influences|causes|impacts|changes)\s+(\w+(?:\s+\w+)?)',
+             r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause|impact)',
+             r'factors?\s+(?:affecting|influencing|causing|impacting)\s+(\w+(?:\s+\w+)?)',
+         ]
+
+         for pattern in affect_patterns:
+             matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+             for match in matches:
+                 noun_phrase = match.group(1).strip()
+                 cleaned = self._normalize_variable_name(noun_phrase)
+                 if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                     extracted_vars.add(cleaned)
+
+         # Pattern: "relationship between X and Y"
+         relationship_pattern = r'relationship\s+(?:between|among)\s+(\w+(?:\s+\w+)?)\s+(?:and|&)\s+(\w+(?:\s+\w+)?)'
+         matches = re.finditer(relationship_pattern, text_lower, re.IGNORECASE)
+         for match in matches:
+             var1 = self._normalize_variable_name(match.group(1).strip())
+             var2 = self._normalize_variable_name(match.group(2).strip())
+             if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                 extracted_vars.add(var1)
+             if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                 extracted_vars.add(var2)
+
+         # Pattern: "how does X relate to Y"
+         relate_pattern = r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+relate\s+to\s+(\w+(?:\s+\w+)?)'
+         matches = re.finditer(relate_pattern, text_lower, re.IGNORECASE)
+         for match in matches:
+             var1 = self._normalize_variable_name(match.group(1).strip())
+             var2 = self._normalize_variable_name(match.group(2).strip())
+             if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                 extracted_vars.add(var1)
+             if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                 extracted_vars.add(var2)
+
+         # Pattern: "the effect of X on Y"
+         effect_pattern = r'effect\s+of\s+(\w+(?:\s+\w+)?)\s+on\s+(\w+(?:\s+\w+)?)'
+         matches = re.finditer(effect_pattern, text_lower, re.IGNORECASE)
+         for match in matches:
+             var1 = self._normalize_variable_name(match.group(1).strip())
+             var2 = self._normalize_variable_name(match.group(2).strip())
+             if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                 extracted_vars.add(var1)
+             if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                 extracted_vars.add(var2)
+
+         return extracted_vars
+
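+     # Example (illustrative): "what is the effect of rainfall on yield"
+     # matches effect_pattern, yielding {"rainfall", "yield"}; both survive
+     # the action-verb and epistemic-term filters ("effect" itself is an
+     # epistemic term and is never captured as a variable).
+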
+     def _extract_with_context(self, text: str) -> List[Dict[str, Any]]:
+         """
+         Extract variables and relationships with context awareness.
+         Enhanced to handle numerical values, conditionals, questions, action verbs, and epistemic terms.
+
+         Args:
+             text: Input text
+
+         Returns:
+             List of extracted relationships with context
+         """
+         relationships = []
+         text_lower = text.lower()
+
+         # Extract using all patterns
+         for pattern, rel_type, confidence in self.patterns:
+             matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+             for match in matches:
+                 # Handle patterns with 1 or 2 groups
+                 if match.lastindex >= 2:
+                     source_raw = match.group(1).strip()
+                     target_raw = match.group(2).strip()
+                 elif match.lastindex == 1:
+                     # Single group patterns (like question_target)
+                     source_raw = match.group(1).strip()
+                     target_raw = None
+                 else:
+                     continue
+
+                 # Normalize variable names (remove numerical values and percentages)
+                 source = self._normalize_variable_name(source_raw)
+                 if target_raw:
+                     target = self._normalize_variable_name(target_raw)
+                 else:
+                     target = None
+
+                 # Skip if too short or stop words
+                 if not source or len(source.split()) == 0:
+                     continue
+                 if source in self.stop_words:
+                     continue
+
+                 # For single-group patterns (questions), extract target from context
+                 if not target and rel_type in ['question_target', 'question_target_time', 'state_description', 'state_equals']:
+                     # Try to find what the question is about
+                     # Look for "what is X" -> X is the target variable
+                     if 'what' in text_lower or 'which' in text_lower:
+                         # Extract all variables mentioned before the question
+                         # This is a heuristic - the question target is usually mentioned earlier
+                         pass  # Will be handled by standalone variable extraction
+
+                 # For state descriptions, infer relationships
+                 if rel_type in ['state_description', 'state_equals', 'state_of'] and target:
+                     # State descriptions like "X is Y" don't create causal edges directly
+                     # But we can infer that variables mentioned together might be related
+                     continue
+
+                 # Skip if target is invalid
+                 if target and (len(target.split()) == 0 or target in self.stop_words):
+                     continue
+
+                 # Check for negation
+                 start_pos = match.start()
+                 end_pos = match.end()
+                 is_negated = self._detect_negation(text, start_pos, end_pos)
+
+                 # Adjust confidence for negation; use a per-match copy so one
+                 # negated match does not also discount later matches of the same pattern
+                 match_confidence = confidence
+                 if is_negated:
+                     match_confidence *= 0.3  # Much lower confidence for negated relationships
+
+                 # Only add if we have both source and target (or it's a question pattern)
+                 if target or rel_type in ['question_target', 'question_target_time', 'what_is', 'who_is', 'where_is', 'when_is', 'how_works', 'what_means']:
+                     # Determine relationship category
+                     relationship_category = 'causal'  # default
+                     if rel_type in ['is_a', 'belongs_to', 'is', 'defined_as', 'means', 'refers_to', 'equivalent_to', 'similar_to']:
+                         relationship_category = 'taxonomic'
+                     elif rel_type in ['has_property', 'contains', 'includes', 'part_of', 'consists_of']:
+                         relationship_category = 'meronymic'
+                     elif rel_type in ['located_in', 'found_in']:
+                         relationship_category = 'spatial'
+                     elif rel_type in ['used_for', 'functions_as']:
+                         relationship_category = 'functional'
+                     elif rel_type in ['temporal', 'precedes', 'follows', 'before', 'after', 'delayed']:
+                         relationship_category = 'temporal'
+                     elif rel_type in ['what_is', 'who_is', 'where_is', 'when_is', 'how_works', 'what_means']:
+                         relationship_category = 'definitional'
+                     elif rel_type in ['factual', 'became', 'changed_to']:
+                         relationship_category = 'factual'
+                     elif rel_type in ['causes', 'affects', 'influences', 'depends_on', 'leads_to', 'results_in', 'impacts', 'drives', 'determines', 'controls', 'caused_by', 'affected_by', 'results_from', 'increases', 'decreases']:
+                         relationship_category = 'causal'
+
+                     relationships.append({
+                         'source': source,
+                         'target': target or source,  # For questions, use source as both
+                         'type': rel_type,
+                         'category': relationship_category,
+                         'confidence': match_confidence,
+                         'negated': is_negated,
+                         'raw_source': source_raw,
+                         'raw_target': target_raw or source_raw
+                     })
+
+         # Post-process: For conditional questions, infer relationships between mentioned variables
+         if 'if' in text_lower and 'what' in text_lower:
+             # Extract all variables mentioned
+             all_vars = self._extract_standalone_variables(text)
+             var_list = sorted(list(all_vars))
+
+             # If we have multiple variables, infer they might be related
+             if len(var_list) >= 2:
+                 # Common pattern: "If X is Y, what is Z?" -> X might affect Z
+                 for i in range(len(var_list) - 1):
+                     relationships.append({
+                         'source': var_list[i],
+                         'target': var_list[-1],  # Last variable is usually the question target
+                         'type': 'inferred_from_question',
+                         'category': 'causal',  # inferred links are treated as weak causal edges
+                         'confidence': 0.5,
+                         'negated': False,
+                         'raw_source': var_list[i],
+                         'raw_target': var_list[-1]
+                     })
+
+         # NEW: Extract state variables from action verbs and epistemic terms
+         # This helps handle vague language like "identify past policy" or "analyze the system"
+         action_verb_vars = self._extract_state_variables_from_action_verbs(text)
+         epistemic_vars = self._extract_state_variables_from_epistemic_terms(text)
+         vague_language_vars = self._extract_variables_from_vague_language(text)
+
+         # Add relationships for extracted variables (if we can infer them)
+         # For action verbs: if we have "identify X" and "determine Y", infer X might affect Y
+         all_extracted = action_verb_vars | epistemic_vars | vague_language_vars
+         if len(all_extracted) >= 2:
+             # Create inferred relationships between extracted variables
+             extracted_list = sorted(list(all_extracted))
+             for i in range(len(extracted_list) - 1):
+                 # Only add if not already in relationships
+                 already_exists = any(
+                     r['source'] == extracted_list[i] and r['target'] == extracted_list[i + 1]
+                     for r in relationships
+                 )
+                 if not already_exists:
+                     relationships.append({
+                         'source': extracted_list[i],
+                         'target': extracted_list[i + 1],
+                         'type': 'inferred_from_action_verb',
+                         'category': 'causal',  # inferred links are treated as weak causal edges
+                         'confidence': 0.4,  # Lower confidence for inferred relationships
+                         'negated': False,
+                         'raw_source': extracted_list[i],
+                         'raw_target': extracted_list[i + 1]
+                     })
+
+         return relationships
+
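+     # Example (illustrative): for "advertising increases sales",
+     # _extract_with_context produces a relationship entry of the form
+     # {'source': 'advertising', 'target': 'sales', 'type': 'increases',
+     #  'category': 'causal', 'confidence': 0.9, 'negated': False,
+     #  'raw_source': 'advertising', 'raw_target': 'sales'}.
+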
+     def _extract_variables_with_values(self, text: str) -> Dict[str, Any]:
+         """
+         Extract variables that have numerical values attached.
+
+         Args:
+             text: Input text
+
+         Returns:
+             Dictionary mapping variables to their values
+         """
+         variables_with_values = {}
+         text_lower = text.lower()
+
+         # Pattern: "variable is value" or "variable = value" or "variable: value"
+         patterns = [
+             r'(\w+(?:\s+\w+)?)\s+is\s+(\d+[.,]?\d*%?|\d+[.,]?\d*\s*[a-z]+)',
+             r'(\w+(?:\s+\w+)?)\s*[=:]\s*(\d+[.,]?\d*%?|\d+[.,]?\d*\s*[a-z]+)',
+             r'(\w+(?:\s+\w+)?)\s+of\s+(\d+[.,]?\d*%?)',
+         ]
+
+         for pattern in patterns:
+             matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+             for match in matches:
+                 var = self._normalize_variable_name(match.group(1))
+                 value = match.group(2).strip()
+                 if var and var not in self.stop_words:
+                     variables_with_values[var] = value
+
+         return variables_with_values
+
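+     # Example (illustrative): "inflation is 3.2% & unemployment is 5%" yields
+     # {'inflation': '3.2%', 'unemployment': '5%'} via the first pattern above.
+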
+    def _extract_standalone_variables(self, text: str) -> Set[str]:
+        """
+        Extract standalone variables using multiple strategies.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Set of variable names
+        """
+        variables = set()
+        text_lower = text.lower()
+
+        # Strategy 1: Extract variables with values (new)
+        variables_with_values = self._extract_variables_with_values(text)
+        variables.update(variables_with_values.keys())
+
+        # Strategy 2: Keyword-based extraction
+        words = re.findall(r'\b\w+\b', text_lower)
+        for word in words:
+            if word in self.stop_words:
+                continue
+            # Check if word contains or matches keywords
+            for keyword in self.variable_keywords:
+                if keyword in word or word in keyword:
+                    variables.add(word)
+
+        # Strategy 3: Noun phrase extraction (enhanced to handle "X is Y" patterns)
+        # Extract noun phrases before "is", "=", ":" followed by numbers
+        state_patterns = [
+            r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+(?:\d|%)',
+            r'(\w+(?:\s+\w+)?)\s*[=:]\s*(?:\d|%)',
+        ]
+        for pattern in state_patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                var = self._normalize_variable_name(match.group(1))
+                if var and var not in self.stop_words:
+                    variables.add(var)
+
+        # Strategy 4: Standard noun phrase extraction
+        noun_phrases = self._extract_noun_phrases(text_lower)
+        for phrase in noun_phrases:
+            # Filter out phrases that are just stop words
+            words = phrase.split()
+            if words and not all(w in self.stop_words for w in words):
+                variables.add(phrase)
+
+        # Strategy 5: Capitalized words (proper nouns or emphasized terms)
+        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+        for word in capitalized:
+            normalized = self._normalize_variable_name(word)
+            if normalized and normalized not in self.stop_words:
+                variables.add(normalized)
+
+        # Strategy 6: Quoted phrases
+        quoted = re.findall(r'"([^"]+)"|\'([^\']+)\'', text)
+        for match in quoted:
+            phrase = (match[0] or match[1]).strip().lower()
+            if phrase and phrase not in self.stop_words:
+                variables.add(phrase)
+
+        # Strategy 7: Terms after "of", "for", "in" (common variable indicators)
+        of_pattern = r'\b(?:of|for|in|about|regarding)\s+(\w+(?:\s+\w+)?)'
+        of_matches = re.finditer(of_pattern, text_lower)
+        for match in of_matches:
+            var = self._normalize_variable_name(match.group(1))
+            if var and var not in self.stop_words:
+                variables.add(var)
+
+        # Strategy 8: Extract from questions (what is X, what will X be)
+        question_patterns = [
+            r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)',
+            r'(?:what|which)\s+is\s+(?:the\s+)?(\w+(?:\s+\w+)?)\s+(?:of|in|for)',
+        ]
+        for pattern in question_patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                var = self._normalize_variable_name(match.group(1))
+                if var and var not in self.stop_words:
+                    variables.add(var)
+
+        # Strategy 9: Extract variables mentioned with "&" or "and" (common in state descriptions)
+        and_pattern = r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+[\d%]+\s*(?:&|and)\s+(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)'
+        and_matches = re.finditer(and_pattern, text_lower, re.IGNORECASE)
+        for match in and_matches:
+            var1 = self._normalize_variable_name(match.group(1))
+            var2 = self._normalize_variable_name(match.group(2))
+            if var1 and var1 not in self.stop_words:
+                variables.add(var1)
+            if var2 and var2 not in self.stop_words:
+                variables.add(var2)
+
+        # Strategy 10: Extract from action verbs (e.g., "identify X" -> extract X).
+        # This helps handle vague language by finding what action verbs refer to.
+        action_verb_vars = self._extract_state_variables_from_action_verbs(text)
+        variables.update(action_verb_vars)
+
+        # Strategy 11: Extract from epistemic terms (e.g., "policy of X" -> extract X)
+        epistemic_vars = self._extract_state_variables_from_epistemic_terms(text)
+        variables.update(epistemic_vars)
+
+        # Strategy 12: Extract from vague language patterns
+        vague_vars = self._extract_variables_from_vague_language(text)
+        variables.update(vague_vars)
+
+        return variables
+
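Strategies 5 and 6 are plain regex passes and can be exercised standalone; the input below is invented:

```python
import re

text = 'Demand for "renewable energy" rose after Paris announced subsidies.'

# Strategy 5: capitalized words (proper nouns or emphasized terms)
print(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text))
# ['Demand', 'Paris']

# Strategy 6: quoted phrases
print(re.findall(r'"([^"]+)"|\'([^\']+)\'', text))
# [('renewable energy', '')]
```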
+    def _resolve_references(self, text: str, variables: Set[str]) -> Set[str]:
+        """
+        Resolve pronouns and demonstrative references.
+
+        Args:
+            text: Input text
+            variables: Existing variables
+
+        Returns:
+            Updated set of variables with resolved references
+        """
+        # Simple pronoun resolution: catch "it", "this", "that" referring to variables.
+        # This is a simplified version - full resolution would require more context.
+        resolved = variables.copy()
+
+        # Look for patterns like "this X", "that X", "these X", "those X"
+        reference_pattern = r'\b(this|that|these|those)\s+(\w+(?:\s+\w+)?)'
+        matches = re.finditer(reference_pattern, text.lower())
+        for match in matches:
+            var = self._normalize_variable_name(match.group(2))
+            if var and var not in self.stop_words:
+                resolved.add(var)
+
+        return resolved
+
+    def _merge_similar_variables(self, variables: Set[str]) -> Set[str]:
+        """
+        Merge similar variable names (plurals, variations).
+
+        Args:
+            variables: Set of variable names
+
+        Returns:
+            Merged set of variables
+        """
+        merged = set()
+        variable_list = list(variables)
+
+        for var in variable_list:
+            # Check if a similar variable already exists
+            is_duplicate = False
+            for existing in merged:
+                # Check for plural/singular
+                if var == existing or var == existing + 's' or var + 's' == existing:
+                    is_duplicate = True
+                    break
+                # Check for common variations
+                if var.replace('_', ' ') == existing.replace('_', ' '):
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate:
+                merged.add(var)
+
+        return merged
+
+    def _filter_valid_variables(self, variables: Set[str]) -> Set[str]:
+        """
+        Filter variables to keep only valid ones for causal analysis.
+
+        Args:
+            variables: Set of variable names
+
+        Returns:
+            Set of valid variable names
+        """
+        valid = set()
+
+        for var in variables:
+            # Use _clean_variable to validate
+            cleaned = self._clean_variable(var)
+            if cleaned:
+                # Additional checks
+                words = cleaned.split()
+
+                # Filter out single words that aren't keywords
+                if len(words) == 1:
+                    if cleaned not in self.variable_keywords:
+                        # Check if it's a meaningful single word
+                        if cleaned.lower() in self.stop_words:
+                            continue
+                        # Very short single words are likely invalid
+                        if len(cleaned) < 4:
+                            continue
+
+                # Filter out variables that are clearly value descriptors
+                value_descriptors = ['buy', 'sell', 'percent', 'percentage']
+                if any(desc in cleaned.lower() for desc in value_descriptors):
+                    continue
+
+                # Filter out variables that start with "if"
+                if cleaned.lower().startswith('if '):
+                    continue
+
+                valid.add(cleaned)
+
+        return valid
+
+    def _is_action_verb(self, var: str) -> bool:
+        """
+        Check if a variable is actually an action verb (epistemic/intentional action).
+
+        Action verbs like "identify" or "analyze" should NOT be treated as causal variables.
+
+        Uses both the local action_verbs list and dictionary part-of-speech checking.
+
+        Args:
+            var: Variable name to check
+
+        Returns:
+            True if it's an action verb
+        """
+        var_lower = var.lower()
+        words = var_lower.split()
+
+        # Check if any word is an action verb (local list)
+        for word in words:
+            if word in self.action_verbs:
+                return True
+            # Check inflected forms ("identifies", "identified", "identifying").
+            # Note: str.rstrip removes a trailing *character set*, not a literal
+            # suffix, so word.rstrip('ed') would mangle words like "seed";
+            # strip the suffixes explicitly instead.
+            for suffix in ('ing', 'ed', 'es', 's'):
+                if word.endswith(suffix) and word[:-len(suffix)] in self.action_verbs:
+                    return True
+
+        # Use dictionary to check part of speech (more accurate)
+        if self.lexical_compiler and self.lexical_compiler.enable_dictionary:
+            # For single-word variables, check if it's a verb
+            if len(words) == 1:
+                if self.lexical_compiler.is_action_verb(words[0]):
+                    return True
+            # For multi-word, check each word
+            else:
+                for word in words:
+                    if self.lexical_compiler.is_action_verb(word):
+                        return True
+
+        return False
+
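The suffix loop replaces the previous `rstrip` chain, which removed a trailing character set rather than a literal suffix:

```python
# str.rstrip strips characters, not suffixes:
print("seed".rstrip('ed'))      # 's'  (every trailing 'e'/'d' removed)
print("running".rstrip('ing'))  # 'ru' (every trailing 'i'/'n'/'g' removed)

# Suffix stripping, as used above:
word = "identifying"
for suffix in ('ing', 'ed', 'es', 's'):
    if word.endswith(suffix):
        print(word[:-len(suffix)])  # 'identify'
        break
```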
+    def _is_epistemic_term(self, var: str) -> bool:
+        """
+        Check if a variable is an epistemic/intentional term (task, policy, etc.).
+
+        These are not causal state variables - they're about knowledge/intentions.
+
+        Args:
+            var: Variable name to check
+
+        Returns:
+            True if it's an epistemic term
+        """
+        var_lower = var.lower()
+        words = var_lower.split()
+
+        # Check if any word is an epistemic term
+        for word in words:
+            if word in self.epistemic_terms:
+                return True
+
+        # Check for common epistemic patterns
+        epistemic_patterns = [
+            r'past\s+\w+',      # "past policy"
+            r'\w+\s+policy',    # "X policy"
+            r'\w+\s+task',      # "X task"
+            r'\w+\s+goal',      # "X goal"
+            r'\w+\s+decision',  # "X decision"
+        ]
+
+        for pattern in epistemic_patterns:
+            if re.search(pattern, var_lower):
+                return True
+
+        return False
+
+    def validate_causal_relationship(
+        self,
+        source: str,
+        target: str,
+        graph: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
+        """
+        Validate a causal relationship using do-calculus and d-separation.
+
+        A simplified approximation of formal causal validation criteria:
+        - Correlation vs. causation: P(Y | X) ≠ P(Y | do(X))
+        - D-separation: X ⊥ Y | Z if d-separated in graph
+        - Temporal ordering: if X causes Y, then time(X) < time(Y)
+        - Confounder detection: backdoor criterion
+
+        In practice this checks edge existence and scans for common causes.
+
+        Args:
+            source: Source variable
+            target: Target variable
+            graph: Graph state
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        nodes = graph.get('nodes', [])
+        edges = graph.get('edges', [])
+
+        # Check if variables exist
+        if source not in nodes or target not in nodes:
+            return False, f"Variables {source} or {target} not in graph"
+
+        # Check for direct edge (simplified causal validation)
+        if (source, target) in edges:
+            # Valid causal edge
+            return True, None
+
+        # Check for confounders using the backdoor criterion (simplified):
+        # look for common causes.
+        common_causes = []
+        for node in nodes:
+            if node != source and node != target:
+                # Check if node is a parent of both source and target
+                has_edge_to_source = (node, source) in edges
+                has_edge_to_target = (node, target) in edges
+                if has_edge_to_source and has_edge_to_target:
+                    common_causes.append(node)
+
+        if common_causes:
+            return False, f"Potential confounder detected: {common_causes[0]}"
+
+        return True, None
+
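A toy illustration of the simplified backdoor check, using the graph format assumed above (node list plus `(source, target)` edge tuples):

```python
graph = {
    'nodes': ['weather', 'ice cream sales', 'drownings'],
    'edges': [('weather', 'ice cream sales'), ('weather', 'drownings')],
}

source, target = 'ice cream sales', 'drownings'
# A node pointing at both source and target is a potential common cause.
common_causes = [
    n for n in graph['nodes']
    if n not in (source, target)
    and (n, source) in graph['edges']
    and (n, target) in graph['edges']
]
print(common_causes)  # ['weather'] -> the claimed edge would be rejected
```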
+    def _clean_variable(self, var: str) -> Optional[str]:
+        """
+        Clean and validate a variable name.
+
+        Args:
+            var: Raw variable name
+
+        Returns:
+            Cleaned variable name or None if invalid
+        """
+        if not var:
+            return None
+
+        # Normalize
+        var = self._normalize_variable_name(var)
+        var_lower = var.lower()
+
+        # CRITICAL: Filter out action verbs (epistemic/intentional actions).
+        # These are NOT causal state variables - they're tasks, not observables.
+        if self._is_action_verb(var):
+            return None
+
+        # CRITICAL: Filter out epistemic/intentional terms.
+        # These are about knowledge/intentions, not causal state variables.
+        if self._is_epistemic_term(var):
+            return None
+
+        # Filter out relationship phrases (contain causal verbs)
+        if any(verb in var_lower for verb in self.causal_verbs):
+            return None
+
+        # Filter out if it contains relationship indicators
+        relationship_indicators = ['depends', 'causes', 'affects', 'influences', 'leads', 'results', 'impacts']
+        if any(indicator in var_lower for indicator in relationship_indicators):
+            return None
+
+        # Filter out value descriptions (buy, sell, etc. when they're part of percentages)
+        value_descriptors = ['buy', 'sell', 'percent', 'percentage', '%']
+        if var in value_descriptors:
+            return None
+
+        # Filter out time units that are standalone (but keep "7 days" as a variable)
+        time_units = ['day', 'days', 'hour', 'hours', 'minute', 'minutes', 'second', 'seconds']
+        if var in time_units and len(var.split()) == 1:
+            return None
+
+        # Remove common conjunctions at start/end
+        words = var.split()
+        if words:
+            # Remove "and", "or", "the", "a", "an", "if" from start
+            while words and words[0].lower() in {'and', 'or', 'the', 'a', 'an', 'if'}:
+                words = words[1:]
+            # Remove "and", "or" from end
+            while words and words[-1].lower() in {'and', 'or'}:
+                words = words[:-1]
+
+            if not words:
+                return None
+
+            var = ' '.join(words)
+
+        # Filter out if it's just stop words
+        if var in self.stop_words:
+            return None
+
+        # Filter out if all words are stop words
+        if all(w in self.stop_words for w in var.split()):
+            return None
+
+        # Stricter filtering for single-word variables
+        if len(words) == 1:
+            # Filter out single-word variables that are likely invalid
+            invalid_single_words = {
+                'if', 'and', 'or', 'but', 'the', 'a', 'an', 'buy', 'sell',
+                'days', 'day', 'hours', 'hour', 'minutes', 'minute',
+                'seconds', 'second', 'of', 'in', 'on', 'at', 'to', 'for',
+                'from', 'with', 'by', 'as', 'is', 'was', 'are', 'were',
+                'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
+                'did', 'will', 'would', 'could', 'should', 'may', 'might',
+                'must', 'can', 'this', 'that', 'these', 'those', 'what',
+                'which', 'who', 'whom', 'where', 'when', 'why', 'how'
+            }
+            if var.lower() in invalid_single_words:
+                return None
+
+            # Filter out very short single-word variables (unless it's a known keyword)
+            if len(var) < 3 and var not in self.variable_keywords:
+                return None
+
+            # For single words outside the keyword list, reject anything that is a stop word
+            if var not in self.variable_keywords:
+                if var.lower() in self.stop_words:
+                    return None
+
+        # Filter out variables that are just conjunctions
+        if var.lower() in {'and', 'or', 'but', 'the', 'a', 'an', 'if'}:
+            return None
+
+        # Filter out very long phrases (likely not a single variable)
+        if len(words) > 4:
+            return None
+
+        # Final check: reject if it's an action verb or epistemic term
+        if self._is_action_verb(var) or self._is_epistemic_term(var):
+            return None
+
+        # Optional: use the dictionary to validate the word (if lexical compiler available).
+        # This helps flag made-up words or typos that passed other filters.
+        # Note: this is a soft check - we don't require dictionary validation for all words,
+        # as domain-specific terms may not be in standard dictionaries.
+        if self.lexical_compiler and self.lexical_compiler.enable_dictionary:
+            # For single-word variables, check if it's a valid word.
+            # Multi-word phrases are more likely to be domain-specific.
+            if len(words) == 1:
+                # Check if word exists in dictionary
+                if not self.lexical_compiler.is_valid_word(words[0]):
+                    # Word not found - could be a typo or domain-specific term.
+                    # We still allow it, but with lower confidence.
+                    logger.debug(f"Word '{words[0]}' not found in dictionary - may be domain-specific or typo")
+
+        return var
+
+    def _extract_clean_variables_from_relationships(self, relationships: List[Dict[str, Any]]) -> Set[str]:
+        """
+        Extract clean variables from relationships.
+
+        Args:
+            relationships: List of relationship dictionaries
+
+        Returns:
+            Set of clean variable names
+        """
+        variables = set()
+        for rel in relationships:
+            source = self._clean_variable(rel.get('source', ''))
+            target = self._clean_variable(rel.get('target', ''))
+            if source:
+                variables.add(source)
+            if target:
+                variables.add(target)
+        return variables
+
+    def _infer_relationships_from_context(self, variables: Set[str], text: str) -> List[Tuple[str, str]]:
+        """
+        Infer relationships from context when explicit patterns aren't found.
+        Enhanced to handle conditional questions and state descriptions.
+
+        Args:
+            variables: Set of extracted variables
+            text: Original text
+
+        Returns:
+            List of inferred (source, target) tuples
+        """
+        inferred = []
+        var_list = sorted(list(variables))
+        text_lower = text.lower()
+
+        # Clean variable list - remove value descriptors
+        cleaned_vars = [v for v in var_list if self._clean_variable(v)]
+
+        # If we have a conditional question pattern: "If X is Y, what is Z?"
+        if 'if' in text_lower and ('what' in text_lower or 'which' in text_lower or 'expected' in text_lower):
+            # Find variables mentioned before "what" or "expected"
+            question_markers = ['what', 'which', 'expected']
+            question_pos = -1
+            for marker in question_markers:
+                pos = text_lower.find(marker)
+                if pos > 0:
+                    question_pos = pos
+                    break
+
+            if question_pos > 0:
+                before_question = text_lower[:question_pos]
+                after_question = text_lower[question_pos:]
+
+                # Variables before the question are likely causes (state variables)
+                before_vars = [v for v in cleaned_vars if v.lower() in before_question and 'expected' not in v.lower()]
+                # Variables after the question are likely effects (question target)
+                after_vars = [v for v in cleaned_vars if v.lower() in after_question or 'expected' in v.lower()]
+
+                # Also look for the "expected X" pattern
+                expected_pattern = r'expected\s+(\w+(?:\s+\w+)?)'
+                expected_match = re.search(expected_pattern, text_lower, re.IGNORECASE)
+                if expected_match:
+                    expected_var = self._clean_variable(expected_match.group(1))
+                    if expected_var and expected_var not in after_vars:
+                        after_vars.append(expected_var)
+
+                # Create relationships: state variables -> question target
+                if before_vars and after_vars:
+                    for before_var in before_vars:
+                        for after_var in after_vars:
+                            if before_var != after_var:
+                                inferred.append((before_var, after_var))
+                elif before_vars and not after_vars:
+                    # If no explicit target, use the most likely target (e.g., "expected price").
+                    # Look for variables with "expected" or mentioned in the question.
+                    question_var_pattern = r'(?:what|which|expected)\s+(?:is|are|will|would|the\s+)?(\w+(?:\s+\w+)?)'
+                    q_match = re.search(question_var_pattern, text_lower, re.IGNORECASE)
+                    if q_match:
+                        q_var = self._clean_variable(q_match.group(1))
+                        if q_var and q_var in cleaned_vars:
+                            for before_var in before_vars:
+                                if before_var != q_var:
+                                    inferred.append((before_var, q_var))
+
+        # If we have state descriptions with multiple variables.
+        # Pattern: "X is Y & Z is W" -> X and Z might affect the question target.
+        if '&' in text or (' and ' in text_lower and 'is' in text_lower):
+            # Find variables mentioned with "is" followed by values
+            state_pattern = r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+[\d%]+'
+            state_matches = list(re.finditer(state_pattern, text_lower, re.IGNORECASE))
+
+            if len(state_matches) >= 1:
+                # Variables mentioned in state descriptions
+                state_vars = []
+                for m in state_matches:
+                    var = self._clean_variable(m.group(1))
+                    if var and var in cleaned_vars:
+                        state_vars.append(var)
+
+                # Find the question target
+                question_vars = []
+                # Look for "expected X" or "what is X"
+                expected_pattern = r'expected\s+(\w+(?:\s+\w+)?)'
+                what_pattern = r'what\s+(?:is|are|will|would)\s+(?:the\s+)?(\w+(?:\s+\w+)?)'
+
+                for pattern in [expected_pattern, what_pattern]:
+                    match = re.search(pattern, text_lower, re.IGNORECASE)
+                    if match:
+                        q_var = self._clean_variable(match.group(1))
+                        if q_var and q_var in cleaned_vars:
+                            question_vars.append(q_var)
+
+                # If no explicit question var, look for variables with "price" or similar
+                if not question_vars:
+                    price_vars = [v for v in cleaned_vars if 'price' in v.lower() and 'expected' not in v.lower()]
+                    if price_vars:
+                        question_vars = price_vars[:1]  # Take the first one
+
+                # Create relationships from state variables to the question target
+                for state_var in state_vars:
+                    for q_var in question_vars:
+                        if state_var != q_var:
+                            inferred.append((state_var, q_var))
+
+        # Remove duplicates
+        inferred = list(set(inferred))
+
+        return inferred
+
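A sketch of the conditional-question heuristic on a made-up input; the regexes are the ones above, and `_clean_variable` is approximated by dropping leading conjunctions:

```python
import re

text = "If demand is 80% and supply is 40%, what is the expected price?"
text_lower = text.lower()

state_pattern = r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+[\d%]+'
state_vars = []
for m in re.finditer(state_pattern, text_lower):
    # crude stand-in for _clean_variable's conjunction trimming
    words = [w for w in m.group(1).split() if w not in {'if', 'and', 'or', 'the'}]
    state_vars.append(' '.join(words))

expected = re.search(r'expected\s+(\w+(?:\s+\w+)?)', text_lower)
target = expected.group(1) if expected else None

print([(s, target) for s in state_vars if s != target])
# [('demand', 'price'), ('supply', 'price')]
```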
+    def extract_variables_from_task(self, task: str) -> Dict[str, Any]:
+        """
+        Advanced extraction of variables and relationships from natural language.
+
+        Automatically extracts variables and relationships (causal, knowledge, etc.) from natural language text.
+        Enhanced to handle:
+        - Causal relationships (depends on, affects, causes)
+        - General knowledge relationships (is-a, has, part-of, located-in)
+        - Numerical values, conditionals, questions, and state descriptions
+        - Definitions, facts, and taxonomic relationships
+
+        Args:
+            task: Natural language task description
+
+        Returns:
+            Dictionary with 'variables', 'edges', 'relationships', and metadata
+
+        Example:
+            >>> agent = HybridAgent()
+            >>> result = agent.extract_variables_from_task("price depends on demand and supply")
+            >>> print(result['variables'])  # ['price', 'demand', 'supply']
+            >>> print(result['edges'])      # [('price', 'demand'), ('price', 'supply')]
+
+            >>> result = agent.extract_variables_from_task("A dog is a mammal")
+            >>> print(result['variables'])  # ['dog', 'mammal']
+            >>> print(result['edges'])      # [('dog', 'mammal')] with type='is_a'
+
+            >>> result = agent.extract_variables_from_task("Paris is in France")
+            >>> print(result['variables'])  # ['Paris', 'France']
+            >>> print(result['edges'])      # [('Paris', 'France')] with type='located_in'
+        """
+        # Extract relationships with context
+        relationships = self._extract_with_context(task)
+
+        # Extract clean variables from relationships first (most reliable)
+        variables = self._extract_clean_variables_from_relationships(relationships)
+
+        # Extract standalone variables (supplementary) - this also handles state descriptions
+        standalone_vars = self._extract_standalone_variables(task)
+
+        # Clean standalone variables
+        for var in standalone_vars:
+            cleaned = self._clean_variable(var)
+            if cleaned:
+                variables.add(cleaned)
+
+        # NEW: Extract state variables from action verbs and epistemic terms.
+        # This helps handle vague language by finding what action verbs/epistemic terms refer to.
+        action_verb_vars = self._extract_state_variables_from_action_verbs(task)
+        epistemic_vars = self._extract_state_variables_from_epistemic_terms(task)
+        vague_language_vars = self._extract_variables_from_vague_language(task)
+
+        # Add extracted variables (re-cleaned for consistency)
+        for var in action_verb_vars:
+            cleaned = self._clean_variable(var)
+            if cleaned:
+                variables.add(cleaned)
+
+        for var in epistemic_vars:
+            cleaned = self._clean_variable(var)
+            if cleaned:
+                variables.add(cleaned)
+
+        for var in vague_language_vars:
+            cleaned = self._clean_variable(var)
+            if cleaned:
+                variables.add(cleaned)
+
+        # Resolve references
+        variables = self._resolve_references(task, variables)
+
+        # Merge similar variables
+        variables = self._merge_similar_variables(variables)
+
+        # Final filtering: remove invalid variables
+        variables = self._filter_valid_variables(variables)
+
+        # Build clean edges from relationships
+        edges = []
+        for rel in relationships:
+            if not rel.get('negated', False):  # Only add non-negated relationships
+                source = self._clean_variable(rel['source'])
+                target = self._clean_variable(rel['target'])
+                if source and target and source != target:
+                    # Filter out edges with "of" at the end (e.g., "price of")
+                    if not target.endswith(' of') and not source.endswith(' of'):
+                        edges.append((source, target))
+
+        # If no explicit edges found, try to infer from context
+        if not edges and variables:
+            inferred_edges = self._infer_relationships_from_context(variables, task)
+            edges.extend(inferred_edges)
+
+        # Clean up edges: remove edges to/from invalid variables.
+        # Only include edges between valid variables.
+        valid_vars_set = variables
+        cleaned_edges = []
+        for source, target in edges:
+            source_clean = self._clean_variable(source)
+            target_clean = self._clean_variable(target)
+
+            # Both must be valid and in the valid variables set
+            if (source_clean and target_clean and
+                    source_clean != target_clean and
+                    source_clean in valid_vars_set and
+                    target_clean in valid_vars_set):
+                # Don't create edges to time units unless they're part of a compound variable
+                if target_clean in ['days', 'day', 'hours', 'hour'] and len(target_clean.split()) == 1:
+                    continue
+                # Don't create edges between a single word and a compound phrase containing it
+                if source_clean in target_clean.split() or target_clean in source_clean.split():
+                    # Only skip if one is clearly a subset of the other
+                    if len(source_clean.split()) != len(target_clean.split()):
+                        continue
+                cleaned_edges.append((source_clean, target_clean))
+
+        # Remove duplicate edges
+        edges = list(set(cleaned_edges))
+
+        # Prioritize edges: prefer edges to "expected X" or question targets
+        question_targets = [v for v in variables if 'expected' in v.lower()]
+        if question_targets:
+            # Keep edges that go to question targets
+            prioritized_edges = [e for e in edges if e[1] in question_targets]
+            # Add other edges that don't conflict
+            for e in edges:
+                if e not in prioritized_edges:
+                    # Only add if the source doesn't already have an edge to a question target
+                    if not any(e[0] == p[0] for p in prioritized_edges):
+                        prioritized_edges.append(e)
+            edges = prioritized_edges if prioritized_edges else edges
+
+        # Extract metadata
+        metadata = {
+            'total_relationships': len(relationships),
+            'negated_relationships': sum(1 for r in relationships if r.get('negated', False)),
+            'average_confidence': sum(r['confidence'] for r in relationships) / len(relationships) if relationships else 0.0,
+            'variables_extracted': len(variables),
+            'edges_extracted': len(edges),
+            'variables_with_values': self._extract_variables_with_values(task)
+        }
+
+        return {
+            'variables': sorted(list(variables)),  # Sorted for consistency
+            'edges': edges,
+            'relationships': relationships,
+            'metadata': metadata
+        }
+
+    def infer_causal_structure(self, variables: List[str], context: Optional[str] = None) -> List[Tuple[str, str]]:
+        """
+        Infer causal structure from variables using advanced logical inference.
+
+        Args:
+            variables: List of variable names
+            context: Optional context text for better inference
+
+        Returns:
+            List of (source, target) tuples representing causal edges
+        """
+        edges = []
+
+        if not variables:
+            return edges
+
+        # Strategy 1: Sequential inference (variables mentioned in order).
+        # Only if we have 2-4 variables (more would create too many edges).
+        if 2 <= len(variables) <= 4:
+            for i in range(len(variables) - 1):
+                source = variables[i]
+                target = variables[i + 1]
+                # Only add if not creating cycles
+                if not self.graph_manager.has_path(target, source):
+                    edges.append((source, target))
+
+        # Strategy 2: Domain-specific heuristics.
+        # Common patterns: input -> process -> output, cause -> effect.
+        variable_lower = [v.lower() for v in variables]
+
+        # Look for common causal patterns
+        input_keywords = ['input', 'source', 'cause', 'factor', 'driver', 'trigger']
+        output_keywords = ['output', 'result', 'effect', 'outcome', 'consequence', 'impact']
+        process_keywords = ['process', 'mechanism', 'method', 'approach', 'system']
+
+        inputs = [v for v, v_lower in zip(variables, variable_lower)
+                  if any(kw in v_lower for kw in input_keywords)]
+        outputs = [v for v, v_lower in zip(variables, variable_lower)
+                   if any(kw in v_lower for kw in output_keywords)]
+        processes = [v for v, v_lower in zip(variables, variable_lower)
+                     if any(kw in v_lower for kw in process_keywords)]
+
+        # Input -> Process -> Output pattern
+        if inputs and processes:
+            for inp in inputs:
+                for proc in processes:
+                    if not self.graph_manager.has_path(proc, inp):
+                        edges.append((inp, proc))
+
+        if processes and outputs:
+            for proc in processes:
+                for out in outputs:
+                    if not self.graph_manager.has_path(out, proc):
+                        edges.append((proc, out))
+
+        # Direct input -> output (if no process)
+        if inputs and outputs and not processes:
+            for inp in inputs:
+                for out in outputs:
+                    if not self.graph_manager.has_path(out, inp):
+                        edges.append((inp, out))
+
+        # Strategy 3: Context-based inference (if context provided)
+        if context:
+            context_lower = context.lower()
+            # Look for mentions of variables in context
+            for i, var1 in enumerate(variables):
+                for var2 in variables[i+1:]:
+                    # Check if var1 appears before var2 in context
+                    pos1 = context_lower.find(var1.lower())
+                    pos2 = context_lower.find(var2.lower())
+                    if pos1 != -1 and pos2 != -1 and pos1 < pos2:
+                        # Check if there's a causal word between them
+                        between = context_lower[pos1:pos2]
+                        if any(verb in between for verb in self.causal_verbs):
+                            if not self.graph_manager.has_path(var2, var1):
+                                edges.append((var1, var2))
+
+        # Remove duplicates
+        edges = list(set(edges))
+
+        return edges
+
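Strategy 2 in isolation, on invented variable names (the cycle check against the live graph is omitted):

```python
variables = ["cost driver", "pricing process", "revenue outcome"]
input_kws = ['input', 'source', 'cause', 'factor', 'driver', 'trigger']
process_kws = ['process', 'mechanism', 'method', 'approach', 'system']
output_kws = ['output', 'result', 'effect', 'outcome', 'consequence', 'impact']

inputs = [v for v in variables if any(k in v for k in input_kws)]
procs = [v for v in variables if any(k in v for k in process_kws)]
outs = [v for v in variables if any(k in v for k in output_kws)]

# input -> process -> output
edges = [(i, p) for i in inputs for p in procs] + [(p, o) for p in procs for o in outs]
print(edges)
# [('cost driver', 'pricing process'), ('pricing process', 'revenue outcome')]
```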
+    def validate_causal_graph(self) -> Tuple[bool, Optional[str]]:
+        """
+        Validate that the causal graph is a valid DAG.
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        if not self.graph_manager.is_dag():
+            return False, "Graph contains cycles"
+        return True, None
+
+    def apply_causal_rules(self, state: Dict[str, float]) -> Dict[str, float]:
+        """
+        Apply rule-based causal reasoning to a state.
+
+        Args:
+            state: Dictionary mapping variables to values
+
+        Returns:
+            Updated state after applying causal rules
+        """
+        result = state.copy()
+
+        # Get topological order
+        try:
+            order = self.graph_manager.topological_sort()
+        except Exception:
+            order = list(state.keys())
+
+        # Apply causal propagation
+        for node in order:
+            if node not in result:
+                continue
+
+            parents = self.graph_manager.get_parents(node)
+            if not parents:
+                continue
+
+            # Simple linear combination rule
+            value = result.get(node, 0.0)
+            for parent in parents:
+                if parent in result:
+                    strength = self.graph_manager.edge_strength(parent, node)
+                    value += result[parent] * strength * 0.1  # Dampening factor
+
+            result[node] = value
+
+        return result
+
+
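The propagation rule is `value += parent_value * strength * 0.1` per parent, applied in topological order. A worked two-node example with a hypothetical edge strength:

```python
state = {'demand': 2.0, 'price': 1.0}
strength = 0.8   # assumed strength for the edge demand -> price
damping = 0.1

# price is visited after its parent in topological order:
state['price'] += state['demand'] * strength * damping
print(state['price'])  # 1.16
```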
+class StatisticalEngine:
+    """
+    Statistical inference engine wrapping StatisticalMethods.
+
+    Provides Bayesian inference, regression-based edge estimation,
+    and uncertainty quantification.
+    """
+
+    def __init__(
+        self,
+        graph_manager: GraphManager,
+        prediction_framework: PredictionFramework,
+        seed: int = 42
+    ):
+        """
+        Initialize the statistical engine.
+
+        Args:
+            graph_manager: GraphManager instance
+            prediction_framework: PredictionFramework instance
+            seed: Random seed
+        """
+        self.graph_manager = graph_manager
+        self.prediction_framework = prediction_framework
+        self.statistical_methods = StatisticalMethods(
+            graph_manager=graph_manager,
+            prediction_framework=prediction_framework,
+            seed=seed
+        )
+
+    def fit_from_dataframe(
+        self,
+        df: Any,
+        variables: List[str],
+        window: int = 30,
+        decay_alpha: float = 0.9,
+        ridge_lambda: float = 0.0,
+        enforce_signs: bool = True
+    ) -> None:
+        """
+        Fit edge strengths from data using Bayesian regression.
+
+        Args:
+            df: pandas DataFrame with historical data
+            variables: List of variable names to fit
+            window: Rolling window size
+            decay_alpha: Decay factor for recency weighting
+            ridge_lambda: Ridge regularization parameter
+            enforce_signs: Whether to enforce edge sign constraints
+        """
+        if not PANDAS_AVAILABLE:
+            raise ImportError("pandas is required for statistical fitting")
+
+        self.statistical_methods.fit_from_dataframe(
+            df=df,
+            variables=variables,
+            window=window,
+            decay_alpha=decay_alpha,
+            ridge_lambda=ridge_lambda,
+            enforce_signs=enforce_signs
+        )
+
+        # Update the prediction framework's standardization stats
+        self.prediction_framework.standardization_stats = (
+            self.statistical_methods.standardization_stats.copy()
+        )
+
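A usage sketch, assuming `graph_manager` and `prediction_framework` were constructed elsewhere and the graph already links these variables:

```python
import pandas as pd

df = pd.DataFrame({
    'demand': [1.0, 1.2, 1.1, 1.4, 1.3],
    'price':  [2.0, 2.3, 2.2, 2.6, 2.5],
})

engine = StatisticalEngine(graph_manager, prediction_framework, seed=42)
engine.fit_from_dataframe(df, variables=['demand', 'price'], window=3)
print(engine.assess_causal_strength('demand', 'price'))
```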
+    def quantify_uncertainty(
+        self,
+        df: Any,
+        variables: List[str],
+        windows: int = 200,
+        alpha: float = 0.95
+    ) -> Dict[str, Any]:
+        """
+        Quantify uncertainty using bootstrap resampling.
+
+        Args:
+            df: pandas DataFrame
+            variables: List of variable names
+            windows: Number of bootstrap samples
+            alpha: Confidence level
+
+        Returns:
+            Dictionary with edge confidence intervals
+        """
+        if not PANDAS_AVAILABLE:
+            return {}
+
+        return self.statistical_methods.quantify_uncertainty(
+            df=df,
+            variables=variables,
+            windows=windows,
+            alpha=alpha
+        )
+
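For intuition, a minimal percentile-bootstrap confidence interval over stand-in edge-strength estimates (the wrapped method resamples windows of the DataFrame rather than drawing synthetic values):

```python
import numpy as np

rng = np.random.default_rng(42)
estimates = rng.normal(loc=0.8, scale=0.1, size=200)  # stand-in strengths
lower, upper = np.percentile(estimates, [2.5, 97.5])
print(f"95% CI: [{lower:.3f}, {upper:.3f}]")
```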
+    def assess_causal_strength(self, source: str, target: str) -> float:
+        """
+        Assess causal strength between two variables.
+
+        Args:
+            source: Source variable
+            target: Target variable
+
+        Returns:
+            Causal strength (0.0 if no edge exists)
+        """
+        return self.graph_manager.edge_strength(source, target)
+
+    def generate_probabilistic_counterfactuals(
+        self,
+        factual_state: Dict[str, float],
+        target_variables: List[str],
+        n_scenarios: int = 5
+    ) -> List[Dict[str, Any]]:
+        """
+        Generate probabilistic counterfactual scenarios.
+
+        Args:
+            factual_state: Current factual state
+            target_variables: Variables to intervene on
+            n_scenarios: Number of scenarios to generate
+
+        Returns:
+            List of counterfactual scenario dictionaries
+        """
+        scenarios = []
+
+        for i in range(n_scenarios):
+            # Generate intervention values (deterministic sampling using a seeded RNG)
+            interventions = {}
+            # Use this engine's RNG if available, otherwise the wrapped StatisticalMethods'
+            rng = getattr(self, '_rng', None)
+            if rng is None and hasattr(self.statistical_methods, '_rng'):
+                rng = self.statistical_methods._rng
+            elif rng is None:
+                # Fallback: create a deterministic RNG with a fixed seed
+                if NUMPY_AVAILABLE:
+                    rng = np.random.default_rng(42)
+                else:
+                    import random
+                    random.seed(42)
+                    rng = random
+
+            for var in target_variables:
+                if var in factual_state:
+                    base_value = factual_state[var]
+                    # Sample around the base value (deterministic)
+                    if NUMPY_AVAILABLE and hasattr(rng, 'random'):
+                        random_val = float(rng.random())
+                    else:
+                        random_val = rng.random() if hasattr(rng, 'random') else 0.5
+                    intervention_value = base_value * (0.5 + random_val)
+                    interventions[var] = intervention_value
+
+            # Predict outcomes
+            try:
+                predicted = self.prediction_framework.predict_outcomes(
+                    factual_state=factual_state,
+                    interventions=interventions
+                )
+
+                scenarios.append({
+                    'name': f'Scenario {i+1}',
+                    'interventions': interventions,
+                    'expected_outcomes': predicted,
+                    'probability': 1.0 / n_scenarios
+                })
+            except Exception as e:
+                logger.warning(f"Failed to generate scenario {i+1}: {e}")
+                continue
+
+        return scenarios
+
+
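The sampler scales each intervened variable into [0.5x, 1.5x): `intervention = base * (0.5 + U)` with U ~ Uniform[0, 1), so results repeat for a fixed seed:

```python
import numpy as np

rng = np.random.default_rng(42)
base = 2.0
print([round(base * (0.5 + rng.random()), 3) for _ in range(3)])
# e.g. [2.548, 1.878, 2.717], reproducible for this seed
```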
+class RuleBasedNLG:
+    """
+    Enhanced rule-based natural language generation for LLM replacement.
+
+    Generates natural, conversational responses from graph state and reasoning results.
+    Uses the pragmatic layer for tone adjustment and confidence-based language.
+    """
+
+    def __init__(self):
+        """Initialize enhanced rule-based NLG."""
+        self.templates = {
+            'causal_analysis': """## Causal Analysis
+
+**Variables Identified:** {variables}
+
+**Causal Relationships:**
+{relationships}
+
+**Graph Structure:**
+{graph_structure}
+
+**Key Insights:**
+{insights}
+""",
+            'knowledge_analysis': """## Knowledge Analysis
+
+**Entities Identified:** {variables}
+
+**Relationships:**
+{relationships}
+
+**Graph Structure:**
+{graph_structure}
+
+**Key Insights:**
+{insights}
+""",
+            'general_analysis': """## Analysis
+
+**Entities/Variables Identified:** {variables}
+
+**Relationships:**
+{relationships}
+
+**Graph Structure:**
+{graph_structure}
+
+**Key Insights:**
+{insights}
+""",
+            'counterfactual': """## Counterfactual Scenario: {name}
+
+**Interventions:**
+{interventions}
+
+**Expected Outcomes:**
+{outcomes}
+
+**Probability:** {probability:.2%}
+""",
+            'statistical_summary': """## Statistical Summary
+
+**Edge Strengths:**
+{edge_strengths}
+
+**Uncertainty:**
+{uncertainty}
+
+**Confidence Intervals:**
+{confidence_intervals}
+""",
+            'recommendation': """## Recommendations
+
+Based on the causal analysis:
+
+{recommendations}
+""",
+            'conversational_intro': """I've analyzed your question about {topic}. Here's what I found:
+
+""",
+            'conversational_summary': """
+Based on the causal relationships I've identified, {summary}
+
+""",
+            'question_answer': """To answer your question: {question}
+
+{answer}
+
+This conclusion is derived from the causal graph structure, which shows {explanation}.
+
+""",
+            'explanation': """Let me explain how I reached this conclusion:
+
+{explanation}
+
+The causal relationships in the graph indicate that {insight}.
+
+""",
+        }
+
+        # Conversational connectors
+        self.connectors = {
+            'high_confidence': ['Based on', 'According to', 'The evidence shows', 'Analysis indicates'],
+            'medium_confidence': ['It appears that', 'The data suggests', 'This likely means', 'It seems'],
+            'low_confidence': ['It may be that', 'Possibly', 'This could indicate', 'There might be'],
+            'transition': ['Furthermore', 'Additionally', 'Moreover', 'In addition', 'Also'],
+            'conclusion': ['Therefore', 'Thus', 'As a result', 'Consequently', 'Hence']
+        }
+
+        # Natural language patterns for different intents
+        self.intent_responses = {
+            'question': "Let me answer your question based on the relationships I've identified.",
+            'analysis': "I've performed an analysis of the relationships you described.",
+            'prediction': "Based on the structure, here's what I predict:",
+            'counterfactual': "Let me explore what would happen if we changed certain variables:",
+            'recommendation': "Based on the analysis, here are my recommendations:",
+            'extraction': "I've extracted the following structure from your description:",
+            'definition': "Here's what I know about that:",
+            'person_query': "Here's information about that person:",
+            'location_query': "Here's the location information:",
+            'temporal_query': "Here's the temporal information:",
+            'explanation': "Let me explain:",
+            'comparison': "Comparing the entities, I found:"
+        }
+
+    def format_causal_analysis(self, analysis: Dict[str, Any]) -> str:
+        """
+        Format causal analysis results into natural language.
+
+        Args:
+            analysis: Dictionary with analysis results
+
+        Returns:
+            Formatted natural language text
+        """
+        variables = analysis.get('variables', [])
+        relationships = analysis.get('relationships', [])
+        graph_structure = analysis.get('graph_structure', '')
+        insights = analysis.get('insights', [])
+
+        # Format relationships
+        rel_text = []
+        for rel in relationships:
+            source = rel.get('source', '')
+            target = rel.get('target', '')
+            rel_type = rel.get('type', '')
+            strength = rel.get('strength', 0.0)
+            rel_text.append(f"- {source} -> {target} (type: {rel_type}, strength: {strength:.3f})")
+
+        # Format insights
+        insights_text = []
+        if isinstance(insights, list):
+            for insight in insights:
+                insights_text.append(f"- {insight}")
+        else:
+            insights_text.append(f"- {insights}")
+
+        return self.templates['causal_analysis'].format(
+            variables=', '.join(variables) if variables else 'None identified',
+            relationships='\n'.join(rel_text) if rel_text else 'No relationships found',
+            graph_structure=graph_structure or 'No graph structure available',
+            insights='\n'.join(insights_text) if insights_text else 'No insights generated'
+        )
+
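A hypothetical rendering call; the input dict is invented to match the keys read above:

```python
nlg = RuleBasedNLG()
print(nlg.format_causal_analysis({
    'variables': ['demand', 'price'],
    'relationships': [{'source': 'demand', 'target': 'price',
                       'type': 'causal', 'strength': 0.8}],
    'graph_structure': 'demand -> price',
    'insights': ['demand is the main driver of price'],
}))
```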
+    def format_counterfactuals(self, scenarios: List[Dict[str, Any]]) -> str:
+        """
+        Format counterfactual scenarios into natural language.
+
+        Args:
+            scenarios: List of counterfactual scenario dictionaries
+
+        Returns:
+            Formatted natural language text
+        """
+        if not scenarios:
+            return "No counterfactual scenarios generated."
+
+        formatted = []
+        for scenario in scenarios:
+            name = scenario.get('name', 'Unknown')
+            interventions = scenario.get('interventions', {})
+            outcomes = scenario.get('expected_outcomes', {})
+            probability = scenario.get('probability', 0.0)
+
+            # Format interventions
+            inter_text = []
+            for var, val in interventions.items():
+                inter_text.append(f"- {var}: {val:.3f}")
+
+            # Format outcomes
+            out_text = []
+            for var, val in outcomes.items():
+                out_text.append(f"- {var}: {val:.3f}")
+
+            formatted.append(self.templates['counterfactual'].format(
+                name=name,
+                interventions='\n'.join(inter_text) if inter_text else 'None',
+                outcomes='\n'.join(out_text) if out_text else 'None',
+                probability=probability
+            ))
+
+        return '\n\n'.join(formatted)
+
+    def format_statistical_results(self, results: Dict[str, Any]) -> str:
+        """
+        Format statistical results into natural language.
+
+        Args:
+            results: Dictionary with statistical results
+
+        Returns:
+            Formatted natural language text
+        """
+        edge_strengths = results.get('edge_strengths', {})
+        uncertainty = results.get('uncertainty', {})
+        confidence_intervals = results.get('confidence_intervals', {})
+
+        # Format edge strengths
+        strength_text = []
+        for (source, target), strength in edge_strengths.items():
+            strength_text.append(f"- {source} -> {target}: {strength:.3f}")
+
+        # Format uncertainty
+        uncertainty_text = []
+        for key, val in uncertainty.items():
+            uncertainty_text.append(f"- {key}: {val:.3f}")
+
+        # Format confidence intervals
+        ci_text = []
+        for key, ci in confidence_intervals.items():
+            if isinstance(ci, dict):
+                lower = ci.get('lower', 0.0)
+                upper = ci.get('upper', 0.0)
+                ci_text.append(f"- {key}: [{lower:.3f}, {upper:.3f}]")
+            else:
+                ci_text.append(f"- {key}: {ci}")
+
+        return self.templates['statistical_summary'].format(
+            edge_strengths='\n'.join(strength_text) if strength_text else 'None',
+            uncertainty='\n'.join(uncertainty_text) if uncertainty_text else 'None',
+            confidence_intervals='\n'.join(ci_text) if ci_text else 'None'
+        )
+
+    def generate_response(
+        self,
+        reasoning_result: Dict[str, Any],
+        response_type: str = 'full',
+        pragmatic_info: Optional[Dict[str, Any]] = None,
+        show_reasoning: bool = False,
+        reasoning_chain: Optional[ReasoningChain] = None
+    ) -> str:
+        """
+        Generate an enhanced natural language response with a conversational tone.
+
+        Args:
+            reasoning_result: Dictionary with reasoning results
+            response_type: Type of response ('full', 'analysis', 'counterfactuals', 'statistical', 'conversational')
+            pragmatic_info: Optional pragmatic information (register, hedging, explicitness)
+            show_reasoning: Whether to prepend the chain-of-thought reasoning
+            reasoning_chain: Optional reasoning chain rendered when show_reasoning is True
+
+        Returns:
+            Natural language response
+        """
+        intent = reasoning_result.get('intent', {})
+        intent_type = intent.get('type', 'analysis')
+
+        # Get pragmatic information
+        if pragmatic_info is None:
+            pragmatic_info = reasoning_result.get('pragmatic', {})
+
+        register = pragmatic_info.get('register', 'neutral')
+        hedging = pragmatic_info.get('hedging', 'likely')
+
+        if response_type == 'conversational':
+            # Enhanced conversational format with chain-of-thought
+            parts = []
+
+            # Show chain-of-thought reasoning if requested
+            if show_reasoning and reasoning_chain:
+                reasoning_text = self._format_reasoning_chain(reasoning_chain)
+                parts.append(reasoning_text)
+
+            # Conversational introduction
+            task = reasoning_result.get('task', '')
+            if task:
+                # Extract topic from task
+                topic = self._extract_topic(task)
+                intro = self.intent_responses.get(intent_type, "I've analyzed your request.")
+                parts.append(intro + "\n\n")
+
+            # Graph-first answer if available (most authoritative)
+            graph_answer = reasoning_result.get('graph_first_answer', {})
+            if graph_answer and graph_answer.get('answer'):
+                # Determine graph type from relationships or result
+                analysis = reasoning_result.get('analysis', {})
+                analysis_relationships = analysis.get('relationships', [])
+                has_general = any(
+                    isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
+                    for rel in analysis_relationships
+                )
+                graph_type_str = 'knowledge' if has_general else 'causal'
+                parts.append(self._format_graph_answer(graph_answer, hedging, graph_type_str))
+
+            # Analysis with natural language (causal or general knowledge)
+            if 'analysis' in reasoning_result:
+                analysis = reasoning_result['analysis']
+                # Determine whether this is causal or general knowledge
+                analysis_relationships = analysis.get('relationships', [])
+                has_causal = any(
+                    (isinstance(rel, dict) and (rel.get('category') == 'causal' or rel.get('type', '').startswith('causal'))) or
+                    (isinstance(rel, str) and 'causal' in rel.lower())
+                    for rel in analysis_relationships
+                )
+                has_general = any(
+                    isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
+                    for rel in analysis_relationships
+                )
+
+                if has_causal and not has_general:
+                    analysis_text = self._format_analysis_conversational(
+                        analysis,
+                        register,
+                        hedging
+                    )
+                elif has_general:
+                    analysis_text = self._format_knowledge_analysis_conversational(
+                        analysis,
+                        register,
+                        hedging
+                    )
+                else:
+                    analysis_text = self._format_analysis_conversational(
+                        analysis,
+                        register,
+                        hedging
+                    )
+                parts.append(analysis_text)
+
+            # Answer specific questions
+            if intent_type in ['question', 'prediction'] and 'analysis' in reasoning_result:
+                answer = self._generate_question_answer(reasoning_result, hedging)
+                if answer:
+                    parts.append(answer)
+
+            # Counterfactuals with explanation
+            if 'counterfactuals' in reasoning_result and reasoning_result['counterfactuals']:
+                cf_text = self._format_counterfactuals_conversational(
+                    reasoning_result['counterfactuals'],
+                    hedging
+                )
+                parts.append(cf_text)
+
+            # Recommendations
+            recommendations = reasoning_result.get('recommendations', [])
+            if recommendations:
+                rec_text = self._format_recommendations_conversational(recommendations, hedging)
+                parts.append(rec_text)
+
+            # Statistical results (if available and relevant)
+            if 'statistical' in reasoning_result and reasoning_result['statistical']:
+                if intent_type in ['analysis', 'prediction', 'comparison']:
+                    stat_text = self._format_statistical_conversational(
+                        reasoning_result['statistical'],
+                        hedging
+                    )
+                    parts.append(stat_text)
+
+            # Show transparency information if available
+            if 'transparency' in reasoning_result and response_type != 'brief':
+                transparency_text = self._format_transparency(reasoning_result['transparency'])
+                parts.append(transparency_text)
+
+            return '\n\n'.join(parts)
+
+        elif response_type == 'full':
+            # Original full format for backwards compatibility
+            parts = []
+
+            task = reasoning_result.get('task', '')
+            if task:
+                parts.append(f"## Task Analysis\n\nAnalyzing: *{task}*\n")
+
+            if intent_type == 'extraction':
+                parts.append("## Extracted Causal Structure\n")
+            elif intent_type == 'counterfactual':
+                parts.append("## Counterfactual Analysis\n")
+            elif intent_type == 'recommendation':
+                parts.append("## Causal Analysis & Recommendations\n")
+            else:
+                parts.append("## Causal Analysis\n")
+
+            if 'analysis' in reasoning_result:
+                parts.append(self.format_causal_analysis(reasoning_result['analysis']))
+
+            recommendations = reasoning_result.get('recommendations', [])
+            if recommendations:
+                parts.append("\n## Recommendations\n")
+                for i, rec in enumerate(recommendations, 1):
+                    parts.append(f"{i}. {rec}")
+
+            if 'counterfactuals' in reasoning_result and reasoning_result['counterfactuals']:
+                parts.append("\n" + self.format_counterfactuals(reasoning_result['counterfactuals']))
+
+            if 'statistical' in reasoning_result and reasoning_result['statistical']:
+                if intent_type in ['analysis', 'prediction', 'comparison']:
+                    parts.append("\n" + self.format_statistical_results(reasoning_result['statistical']))
+
+            return '\n\n'.join(parts)
+
+        elif response_type == 'analysis':
+            return self.format_causal_analysis(reasoning_result.get('analysis', {}))
+
+        elif response_type == 'counterfactuals':
+            return self.format_counterfactuals(reasoning_result.get('counterfactuals', []))
+
+        elif response_type == 'statistical':
+            return self.format_statistical_results(reasoning_result.get('statistical', {}))
+
+        else:
+            return str(reasoning_result)
+
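A hypothetical end-to-end call through the conversational path; the `reasoning_result` shape is inferred from the lookups above:

```python
nlg = RuleBasedNLG()
result = {
    'task': 'How does demand affect price?',
    'intent': {'type': 'analysis'},
    'pragmatic': {'register': 'neutral', 'hedging': 'likely'},
    'analysis': {
        'variables': ['demand', 'price'],
        'relationships': [{'source': 'demand', 'target': 'price',
                           'category': 'causal', 'type': 'causal', 'strength': 0.8}],
    },
}
print(nlg.generate_response(result, response_type='conversational'))
```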
+    def _extract_topic(self, task: str) -> str:
+        """Extract topic from task for conversational intro."""
+        # Simple extraction - take the first few words
+        words = task.split()[:5]
+        return ' '.join(words)
+
+    def _format_reasoning_chain(self, reasoning_chain: ReasoningChain) -> str:
+        """
+        Format reasoning chain for display.
+
+        Args:
+            reasoning_chain: Reasoning chain to format
+
+        Returns:
+            Formatted reasoning text
+        """
+        parts = ["## Chain-of-Thought Reasoning\n"]
+
+        for i, step in enumerate(reasoning_chain.steps, 1):
+            step_text = f"**Step {i}: {step.operation}**\n"
+            if step.input_state:
+                step_text += f"  Input: {str(step.input_state)[:100]}...\n"
+            if step.output_state:
+                step_text += f"  Output: {str(step.output_state)[:100]}...\n"
+            if step.conclusion:
+                step_text += f"  Conclusion: {step.conclusion}\n"
+            if step.confidence < 1.0:
+                step_text += f"  Confidence: {step.confidence:.2f}\n"
+
+            parts.append(step_text)
+
+        return "\n".join(parts)
+
+    def _format_transparency(self, transparency: Dict[str, Any]) -> str:
+        """
+        Format transparency information.
+
+        Args:
+            transparency: Transparency dictionary
+
+        Returns:
+            Formatted transparency text
+        """
+        parts = ["## Transparency\n"]
+
+        confidence_viz = transparency.get('confidence', {})
+        if confidence_viz:
+            mean_conf = confidence_viz.get('mean_confidence', 0.0)
+            std_conf = confidence_viz.get('std_confidence', 0.0)
+            parts.append(f"**Confidence:** {mean_conf:.2f} ± {std_conf:.2f}")
+
+        graph_structure = transparency.get('graph_structure', {})
+        if graph_structure:
+            parts.append(f"**Graph Structure:** {graph_structure.get('structure_type', 'unknown')}")
+            parts.append(f"  - Nodes: {graph_structure.get('num_nodes', 0)}")
+            parts.append(f"  - Edges: {graph_structure.get('num_edges', 0)}")
+
+        return "\n".join(parts)
+
+    def _format_graph_answer(self, graph_answer: Dict[str, Any], hedging: str, graph_type: str = 'causal') -> str:
+        """Format graph-first answer conversationally."""
+        answer = graph_answer.get('answer', '')
+        evidence = graph_answer.get('supporting_evidence', [])
+
+        if answer:
+            if graph_type in ['knowledge', 'mixed']:
+                result = f"Based on the knowledge graph, {hedging} {answer.lower()}\n\n"
+            else:
+                result = f"Based on the causal graph structure, {hedging} {answer.lower()}\n\n"
+            if evidence:
+                result += "This conclusion is supported by:\n"
+                for ev in evidence[:3]:  # Limit to 3 pieces of evidence
+                    ev_type = ev.get('type', 'evidence')
+                    result += f"- {ev_type}: {str(ev)[:100]}\n"
+            return result
+        return ""
+
2400
+ def _format_knowledge_analysis_conversational(
2401
+ self,
2402
+ analysis: Dict[str, Any],
2403
+ register: str,
2404
+ hedging: str
2405
+ ) -> str:
2406
+ """Format general knowledge analysis in conversational style."""
2407
+ parts = []
2408
+ variables = analysis.get('variables', [])
2409
+ relationships = analysis.get('relationships', [])
2410
+
2411
+ if variables:
2412
+ parts.append(f"I've identified {len(variables)} entities: {', '.join(variables[:5])}")
2413
+ if len(variables) > 5:
2414
+ parts.append(f" and {len(variables) - 5} more")
2415
+ parts.append(".\n\n")
2416
+
2417
+ if relationships:
2418
+ parts.append("Here are the relationships I found:\n\n")
2419
+ for rel in relationships[:5]:
2420
+ if isinstance(rel, dict):
2421
+ source = rel.get('source', '')
2422
+ target = rel.get('target', '')
2423
+ rel_type = rel.get('type', 'related')
2424
+ category = rel.get('category', 'general')
2425
+
2426
+ # Format based on relationship type
2427
+ if category == 'taxonomic':
2428
+ parts.append(f"- {source} is a type of {target}\n")
2429
+ elif category == 'meronymic':
2430
+ if rel_type == 'part_of':
2431
+ parts.append(f"- {source} is part of {target}\n")
2432
+ elif rel_type == 'has_property':
2433
+ parts.append(f"- {source} has {target}\n")
2434
+ elif rel_type == 'contains':
2435
+ parts.append(f"- {source} contains {target}\n")
2436
+ elif category == 'spatial':
2437
+ parts.append(f"- {source} is located in {target}\n")
2438
+ elif category == 'functional':
2439
+ parts.append(f"- {source} is used for {target}\n")
2440
+ elif category == 'definitional':
2441
+ parts.append(f"- {source} is {target}\n")
2442
+ else:
2443
+ parts.append(f"- {source} is related to {target}\n")
2444
+
2445
+ if len(relationships) > 5:
2446
+ parts.append(f"\n... and {len(relationships) - 5} more relationships.\n")
2447
+
2448
+ insights = analysis.get('insights', [])
2449
+ if insights:
2450
+ parts.append("\n**Key Insights:**\n")
2451
+ for insight in insights[:3]:
2452
+ parts.append(f"- {insight}\n")
2453
+
2454
+ return ''.join(parts)
2455
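+
+ # Editor's sketch: a taxonomic relationship renders as an "is a type of" line.
+ #   analysis = {'variables': ['sparrow', 'bird'],
+ #               'relationships': [{'source': 'sparrow', 'target': 'bird',
+ #                                  'type': 'is_a', 'category': 'taxonomic'}]}
+ #   self._format_knowledge_analysis_conversational(analysis, 'neutral', 'likely')
+ #   # -> "I've identified 2 entities: sparrow, bird." then "- sparrow is a type of bird"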
+
2456
+ def _format_analysis_conversational(
2457
+ self,
2458
+ analysis: Dict[str, Any],
2459
+ register: str,
2460
+ hedging: str
2461
+ ) -> str:
2462
+ """Format analysis (causal or general knowledge) in conversational style."""
2463
+ variables = analysis.get('variables', [])
2464
+ relationships = analysis.get('relationships', [])
2465
+ insights = analysis.get('insights', [])
2466
+
2467
+ parts = []
2468
+
2469
+ # Determine if this is causal or general knowledge
2470
+ has_causal = any(
2471
+ isinstance(rel, dict) and rel.get('category') == 'causal'
2472
+ for rel in relationships
2473
+ )
2474
+ has_general = any(
2475
+ isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
2476
+ for rel in relationships
2477
+ )
2478
+
2479
+ if variables:
2480
+ if has_general and not has_causal:
2481
+ var_text = ', '.join(variables[:5])
2482
+ if len(variables) > 5:
2483
+ var_text += f", and {len(variables) - 5} more"
2484
+ parts.append(f"I've identified {len(variables)} entities: {var_text}.\n")
2485
+ else:
2486
+ var_text = ', '.join(variables[:5])
2487
+ if len(variables) > 5:
2488
+ var_text += f", and {len(variables) - 5} more"
2489
+ parts.append(f"I've identified {len(variables)} key variables: {var_text}.\n")
2490
+
2491
+ if relationships:
2492
+ if has_general and not has_causal:
2493
+ parts.append(f"Here are the relationships I found:\n\n")
2494
+ for rel in relationships[:5]:
2495
+ if isinstance(rel, dict):
2496
+ source = rel.get('source', '')
2497
+ target = rel.get('target', '')
2498
+ category = rel.get('category', 'general')
2499
+ rel_type = rel.get('type', 'related')
2500
+
2501
+ if category == 'taxonomic':
2502
+ parts.append(f"- {source} is a type of {target}\n")
2503
+ elif category == 'meronymic':
2504
+ if rel_type == 'part_of':
2505
+ parts.append(f"- {source} is part of {target}\n")
2506
+ elif rel_type == 'has_property':
2507
+ parts.append(f"- {source} has {target}\n")
2508
+ else:
2509
+ parts.append(f"- {source} -> {target}\n")
2510
+ elif category == 'spatial':
2511
+ parts.append(f"- {source} is located in {target}\n")
2512
+ elif category == 'functional':
2513
+ parts.append(f"- {source} is used for {target}\n")
2514
+ else:
2515
+ parts.append(f"- {source} is related to {target}\n")
2516
+
2517
+ if len(relationships) > 5:
2518
+ parts.append(f"\n... and {len(relationships) - 5} more relationships.\n")
2519
+ else:
2520
+ parts.append(f"These variables are connected through {len(relationships)} causal relationships.\n")
2521
+
2522
+ # Highlight strongest relationship
2523
+ if relationships:
2524
+ strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)) if isinstance(x, dict) else 0)
2525
+ if isinstance(strongest, dict):
2526
+ source = strongest.get('source', '')
2527
+ target = strongest.get('target', '')
2528
+ strength = strongest.get('strength', 0)
2529
+ confidence = strongest.get('confidence', 0.8)
2530
+
2531
+ connector = self.connectors.get('high_confidence' if confidence > 0.7 else 'medium_confidence', ['It appears'])[0]
2532
+ parts.append(
2533
+ f"{connector}, the strongest relationship is between '{source}' and '{target}' "
2534
+ f"(strength: {strength:.2f}, confidence: {confidence:.1%}).\n"
2535
+ )
2536
+
2537
+ if insights:
2538
+ parts.append("\nKey insights:\n")
2539
+ for insight in insights[:3]: # Limit to 3 insights
2540
+ parts.append(f"- {insight}\n")
2541
+
2542
+ return ''.join(parts)
2543
+
2544
+ def _generate_question_answer(
2545
+ self,
2546
+ reasoning_result: Dict[str, Any],
2547
+ hedging: str
2548
+ ) -> str:
2549
+ """Generate direct answer to a question."""
2550
+ task = reasoning_result.get('task', '')
2551
+ analysis = reasoning_result.get('analysis', {})
2552
+ graph_answer = reasoning_result.get('graph_first_answer', {})
2553
+
2554
+ # Try to extract answer from graph-first reasoning first
2555
+ if graph_answer and graph_answer.get('answer'):
2556
+ return f"**Answer:** {graph_answer['answer']}\n"
2557
+
2558
+ # Fallback to analysis-based answer
2559
+ variables = analysis.get('variables', [])
2560
+ relationships = analysis.get('relationships', [])
2561
+
2562
+ if 'what' in task.lower() or 'which' in task.lower():
2563
+ if variables:
2564
+ return f"**Answer:** The key variables involved are: {', '.join(variables[:3])}.\n"
2565
+
2566
+ if 'how' in task.lower() or 'why' in task.lower():
2567
+ if relationships:
2568
+ strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)) if isinstance(x, dict) else 0)  # guard non-dict entries, as in _format_analysis_conversational
2569
+ source = strongest.get('source', '')
2570
+ target = strongest.get('target', '')
2571
+ return f"**Answer:** {hedging.capitalize()}, '{source}' affects '{target}' through a causal relationship.\n"
2572
+
2573
+ return ""
2574
+
2575
+ def _format_counterfactuals_conversational(
2576
+ self,
2577
+ counterfactuals: List[Dict[str, Any]],
2578
+ hedging: str
2579
+ ) -> str:
2580
+ """Format counterfactuals conversationally."""
2581
+ if not counterfactuals:
2582
+ return ""
2583
+
2584
+ parts = ["## Exploring Alternative Scenarios\n\n"]
2585
+ parts.append(f"Let me explore {len(counterfactuals)} alternative scenarios:\n\n")
2586
+
2587
+ for i, scenario in enumerate(counterfactuals[:3], 1): # Limit to 3 scenarios
2588
+ name = scenario.get('name', f'Scenario {i}')
2589
+ interventions = scenario.get('interventions', {})
2590
+ outcomes = scenario.get('expected_outcomes', {})
2591
+ probability = scenario.get('probability', 0.0)
2592
+
2593
+ parts.append(f"**{name}** ({probability:.1%} probability):\n")
2594
+
2595
+ if interventions:
2596
+ parts.append("If we change:\n")
2597
+ for var, val in list(interventions.items())[:3]:
2598
+ parts.append(f"- {var} to {val:.2f}\n")
2599
+
2600
+ if outcomes:
2601
+ parts.append("Then we would expect:\n")
2602
+ for var, val in list(outcomes.items())[:3]:
2603
+ parts.append(f"- {var}: {val:.2f}\n")
2604
+
2605
+ parts.append("\n")
2606
+
2607
+ return ''.join(parts)
2608
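+
+ # Editor's sketch of a single scenario and its rendering:
+ #   cf = [{'name': 'Higher price', 'probability': 0.4,
+ #          'interventions': {'price': 12.0}, 'expected_outcomes': {'demand': 80.0}}]
+ #   self._format_counterfactuals_conversational(cf, hedging='likely')
+ #   # -> "**Higher price** (40.0% probability):" with "- price to 12.00" under
+ #   #    "If we change:" and "- demand: 80.00" under "Then we would expect:"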
+
2609
+ def _format_recommendations_conversational(
2610
+ self,
2611
+ recommendations: List[str],
2612
+ hedging: str
2613
+ ) -> str:
2614
+ """Format recommendations conversationally."""
2615
+ if not recommendations:
2616
+ return ""
2617
+
2618
+ parts = ["## Recommendations\n\n"]
2619
+ parts.append("Based on my analysis, here's what I recommend:\n\n")
2620
+
2621
+ for i, rec in enumerate(recommendations[:5], 1): # Limit to 5 recommendations
2622
+ parts.append(f"{i}. {rec}\n")
2623
+
2624
+ return ''.join(parts)
2625
+
2626
+ def _format_statistical_conversational(
2627
+ self,
2628
+ statistical: Dict[str, Any],
2629
+ hedging: str
2630
+ ) -> str:
2631
+ """Format statistical results conversationally."""
2632
+ parts = ["## Statistical Analysis\n\n"]
2633
+
2634
+ edge_strengths = statistical.get('edge_strengths', {})
2635
+ if edge_strengths:
2636
+ parts.append(f"The statistical analysis reveals {len(edge_strengths)} causal relationships with quantified strengths.\n")
2637
+
2638
+ # Highlight strongest edges
2639
+ sorted_edges = sorted(edge_strengths.items(), key=lambda x: abs(x[1]), reverse=True)
2640
+ if sorted_edges:
2641
+ parts.append("The strongest relationships are:\n")
2642
+ for (source, target), strength in sorted_edges[:3]:
2643
+ parts.append(f"- {source} -> {target}: {strength:.3f}\n")
2644
+
2645
+ uncertainty = statistical.get('uncertainty', {})
2646
+ if uncertainty:
2647
+ parts.append(f"\nUncertainty analysis indicates {hedging} confidence in these relationships.\n")
2648
+
2649
+ return ''.join(parts)
2650
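+
+ # Editor's sketch: edge_strengths is keyed by (source, target) tuples and
+ # sorted by absolute strength before display.
+ #   stats = {'edge_strengths': {('price', 'demand'): -0.62, ('ads', 'demand'): 0.31}}
+ #   self._format_statistical_conversational(stats, hedging='moderate')
+ #   # -> lists "price -> demand: -0.620" before "ads -> demand: 0.310"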
+
2651
+
2652
+ class HybridOrchestrator:
2653
+ """
2654
+ Orchestrates hybrid reasoning with all LLM-enhanced components.
2655
+
2656
+ Integrates:
2657
+ - Reasoning tracking for chain-of-thought
2658
+ - Explanation generation
2659
+ - Self-verification
2660
+ - Consistency guarantees
2661
+ """
2662
+
2663
+ def __init__(
2664
+ self,
2665
+ symbolic_reasoner: SymbolicReasoner,
2666
+ statistical_engine: StatisticalEngine,
2667
+ nlg: RuleBasedNLG,
2668
+ graph_first_reasoner: Optional[GraphFirstReasoner] = None,
2669
+ text_corrector: Optional[TextCorrector] = None,
2670
+ lexical_compiler: Optional[LexicalCompiler] = None,
2671
+ grammatical_compiler: Optional[GrammaticalCompiler] = None,
2672
+ pragmatic_compiler: Optional[PragmaticCompiler] = None,
2673
+ reasoning_tracker: Optional[ReasoningTracker] = None,
2674
+ explanation_builder: Optional[ExplanationBuilder] = None,
2675
+ transparency_layer: Optional[TransparencyLayer] = None,
2676
+ consistency_checker: Optional[ConsistencyChecker] = None,
2677
+ error_detector: Optional[ErrorDetector] = None,
2678
+ self_corrector: Optional[SelfCorrector] = None,
2679
+ consistency_engine: Optional[ConsistencyEngine] = None
2680
+ ):
2681
+ """
2682
+ Initialize hybrid orchestrator.
2683
+
2684
+ Args:
2685
+ symbolic_reasoner: Symbolic reasoner instance
2686
+ statistical_engine: Statistical engine instance
2687
+ nlg: Natural language generator
2688
+ graph_first_reasoner: Optional graph-first reasoner
2689
+ text_corrector: Optional text corrector
2690
+ lexical_compiler: Optional lexical compiler
2691
+ grammatical_compiler: Optional grammatical compiler
2692
+ pragmatic_compiler: Optional pragmatic compiler
2693
+ reasoning_tracker: Optional reasoning tracker for chain-of-thought
2694
+ explanation_builder: Optional explanation builder
2695
+ transparency_layer: Optional transparency layer
2696
+ consistency_checker: Optional consistency checker
2697
+ error_detector: Optional error detector
2698
+ self_corrector: Optional self corrector
2699
+ consistency_engine: Optional consistency engine
2700
+ """
2701
+ self.symbolic_reasoner = symbolic_reasoner
2702
+ self.statistical_engine = statistical_engine
2703
+ self.nlg = nlg
2704
+ self.graph_first_reasoner = graph_first_reasoner
2705
+ self.text_corrector = text_corrector
2706
+ self.lexical_compiler = lexical_compiler
2707
+ self.grammatical_compiler = grammatical_compiler
2708
+ self.pragmatic_compiler = pragmatic_compiler
2709
+ self.reasoning_tracker = reasoning_tracker
2710
+ self.explanation_builder = explanation_builder
2711
+ self.transparency_layer = transparency_layer
2712
+ self.consistency_checker = consistency_checker
2713
+ self.error_detector = error_detector
2714
+ self.self_corrector = self_corrector
2715
+ self.consistency_engine = consistency_engine
2716
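+
+ # Minimal wiring sketch (editor's illustration; assumes the three required
+ # collaborators can be constructed with no arguments, which may not hold):
+ #   orchestrator = HybridOrchestrator(
+ #       symbolic_reasoner=SymbolicReasoner(),
+ #       statistical_engine=StatisticalEngine(),
+ #       nlg=RuleBasedNLG(),
+ #   )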
+
2717
+ def _parse_task_intent(self, task: str) -> Dict[str, Any]:
2718
+ """
2719
+ Parse task to understand user intent and extract query type.
2720
+
2721
+ Args:
2722
+ task: Natural language task
2723
+
2724
+ Returns:
2725
+ Dictionary with intent information
2726
+ """
2727
+ task_lower = task.lower()
2728
+ intent = {
2729
+ 'type': 'analysis', # default
2730
+ 'question_type': None,
2731
+ 'target_variables': [],
2732
+ 'intervention_variables': [],
2733
+ 'comparison_requested': False,
2734
+ }
2735
+
2736
+ # Question type detection
2737
+ if any(word in task_lower for word in ['what', 'which', 'who']):
2738
+ intent['question_type'] = 'what'
2739
+ elif any(word in task_lower for word in ['how', 'why']):
2740
+ intent['question_type'] = 'how'
2741
+ elif any(word in task_lower for word in ['when', 'where']):
2742
+ intent['question_type'] = 'when_where'
2743
+ elif '?' in task:
2744
+ intent['question_type'] = 'general_question'
2745
+
2746
+ # Intent type detection
2747
+ if any(word in task_lower for word in ['extract', 'identify', 'find', 'list']):
2748
+ intent['type'] = 'extraction'
2749
+ elif any(word in task_lower for word in ['analyze', 'analyse', 'examine', 'study']):
2750
+ intent['type'] = 'analysis'
2751
+ elif any(word in task_lower for word in ['predict', 'forecast', 'estimate']):
2752
+ intent['type'] = 'prediction'
2753
+ elif any(word in task_lower for word in ['compare', 'versus', 'vs', 'difference']):
2754
+ intent['type'] = 'comparison'
2755
+ intent['comparison_requested'] = True
2756
+ elif 'what if' in task_lower or re.search(r'\bif\b|\bsuppose|\bassume', task_lower):  # word-bounded 'if' so e.g. 'classification' no longer triggers counterfactuals
2757
+ intent['type'] = 'counterfactual'
2758
+ elif any(word in task_lower for word in ['recommend', 'suggest', 'should', 'best']):
2759
+ intent['type'] = 'recommendation'
2760
+ elif any(word in task_lower for word in ['what is', 'what are', 'define', 'definition', 'meaning']):
2761
+ intent['type'] = 'definition'
2762
+ elif any(word in task_lower for word in ['who is', 'who are']):
2763
+ intent['type'] = 'person_query'
2764
+ elif any(word in task_lower for word in ['where is', 'where are', 'location']):
2765
+ intent['type'] = 'location_query'
2766
+ elif any(word in task_lower for word in ['when is', 'when was', 'when did', 'date', 'time']):
2767
+ intent['type'] = 'temporal_query'
2768
+ elif any(word in task_lower for word in ['explain', 'describe', 'tell me about']):
2769
+ intent['type'] = 'explanation'
2770
+
2771
+ # Extract target variables (what user wants to know about)
2772
+ target_patterns = [
2773
+ r'(?:about|regarding|for|of)\s+(\w+(?:\s+\w+)?)',
2774
+ r'(?:affecting|impacting|influencing)\s+(\w+(?:\s+\w+)?)',
2775
+ r'(?:on|in)\s+(\w+(?:\s+\w+)?)',
2776
+ ]
2777
+ for pattern in target_patterns:
2778
+ matches = re.finditer(pattern, task_lower)
2779
+ for match in matches:
2780
+ var = self.symbolic_reasoner._normalize_variable_name(match.group(1))
2781
+ if var and var not in self.symbolic_reasoner.stop_words:
2782
+ intent['target_variables'].append(var)
2783
+
2784
+ # Extract intervention variables (what user wants to change)
2785
+ intervention_patterns = [
2786
+ r'(?:if|when|suppose)\s+(\w+(?:\s+\w+)?)\s+(?:changes?|increases?|decreases?)',
2787
+ r'(?:change|modify|adjust)\s+(\w+(?:\s+\w+)?)',
2788
+ ]
2789
+ for pattern in intervention_patterns:
2790
+ matches = re.finditer(pattern, task_lower)
2791
+ for match in matches:
2792
+ var = self.symbolic_reasoner._normalize_variable_name(match.group(1))
2793
+ if var and var not in self.symbolic_reasoner.stop_words:
2794
+ intent['intervention_variables'].append(var)
2795
+
2796
+ return intent
2797
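+
+ # Editor's sketch of the returned intent dict:
+ #   intent = self._parse_task_intent("What if marketing spend increases?")
+ #   # intent['question_type'] == 'what', intent['type'] == 'counterfactual',
+ #   # and 'marketing spend' lands in intent['intervention_variables']
+ #   # (subject to _normalize_variable_name and the stop-word filter)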
+
2798
+ def _parse_extracted_values(self, variables_with_values: Dict[str, str]) -> Dict[str, float]:
2799
+ """
2800
+ Parse extracted string values into float values.
2801
+
2802
+ Handles:
2803
+ - "20000" -> 20000.0
2804
+ - "61%" -> 0.61
2805
+ - "61% buy, 39% sell" -> extracts main percentage (61% -> 0.61)
2806
+
2807
+ Args:
2808
+ variables_with_values: Dictionary mapping variable names to string values
2809
+
2810
+ Returns:
2811
+ Dictionary mapping variable names to float values
2812
+ """
2813
+ parsed = {}
2814
+
2815
+ for var, value_str in variables_with_values.items():
2816
+ if not value_str:
2817
+ continue
2818
+
2819
+ try:
2820
+ # Remove whitespace
2821
+ value_str = value_str.strip()
2822
+
2823
+ # Handle percentages
2824
+ if '%' in value_str:
2825
+ # Extract first percentage if multiple (e.g., "61% buy, 39% sell" -> 61%)
2826
+ percent_match = re.search(r'(\d+[.,]?\d*)\s*%', value_str)
2827
+ if percent_match:
2828
+ percent_value = float(percent_match.group(1).replace(',', '.'))
2829
+ # Convert percentage to decimal (61% -> 0.61)
2830
+ parsed[var] = percent_value / 100.0
2831
+ else:
2832
+ # Try to extract any number before %
2833
+ num_match = re.search(r'(\d+[.,]?\d*)', value_str)
2834
+ if num_match:
2835
+ parsed[var] = float(num_match.group(1).replace(',', '.')) / 100.0
2836
+ else:
2837
+ # Regular number
2838
+ # Remove any non-numeric characters except decimal point and comma
2839
+ clean_value = re.sub(r'[^\d.,-]', '', value_str)
2840
+ if clean_value:
2841
+ # Handle comma as decimal separator (European format)
2842
+ if ',' in clean_value and '.' not in clean_value:
2843
+ clean_value = clean_value.replace(',', '.')
2844
+ # Handle comma as thousands separator
2845
+ elif ',' in clean_value and '.' in clean_value:
2846
+ # Whichever separator appears last is taken as the decimal point;
2847
+ # the other is a thousands separator ("1,234.56" and "1.234,56" both parse)
2848
+ if clean_value.rfind(',') > clean_value.rfind('.'):
2849
+ clean_value = clean_value.replace('.', '').replace(',', '.')
2850
+ else:
2851
+ clean_value = clean_value.replace(',', '')
2852
+
2853
+ parsed[var] = float(clean_value)
2854
+ except (ValueError, AttributeError) as e:
2855
+ logger.debug(f"Failed to parse value '{value_str}' for variable '{var}': {e}")
2856
+ continue
2857
+
2858
+ return parsed
2859
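+
+ # Editor's sketch covering the documented cases plus mixed separators:
+ #   self._parse_extracted_values({'volume': '20000',
+ #                                 'buy_ratio': '61% buy, 39% sell',
+ #                                 'price': '1.234,56'})
+ #   # -> {'volume': 20000.0, 'buy_ratio': 0.61, 'price': 1234.56}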
+
2860
+ def _detect_and_parse_json_scm(self, task: str) -> Optional[Dict[str, Any]]:
2861
+ """
2862
+ Detect and parse JSON SCM (Structural Causal Model) from task.
2863
+
2864
+ Args:
2865
+ task: Task string that may contain JSON SCM
2866
+
2867
+ Returns:
2868
+ Parsed SCM dictionary or None if not detected
2869
+ """
2870
+ # Try to find JSON in the task. The greedy pattern spans from the first
2871
+ # '{' to the last '}', so nested objects are captured whole; non-greedy
2872
+ # matches are kept as fallback candidates for inputs with several objects
2873
+ json_matches = list(re.finditer(r'\{[\s\S]*\}', task)) + list(re.finditer(r'\{[\s\S]*?\}', task))
2874
+ if not json_matches:
2875
+ return None
2876
+
2877
+ # Try the largest match first (most likely to be complete JSON)
2878
+ json_matches_sorted = sorted(json_matches, key=lambda m: len(m.group(0)), reverse=True)
2879
+
2880
+ for json_match in json_matches_sorted:
2881
+ json_str = json_match.group(0)
2882
+
2883
+ try:
2884
+ scm_data = json.loads(json_str)
2885
+
2886
+ # Validate it's an SCM structure
2887
+ if not isinstance(scm_data, dict):
2888
+ continue
2889
+
2890
+ # Check for SCM indicators
2891
+ has_variables = 'variables' in scm_data
2892
+ has_equations = 'equations' in scm_data
2893
+ has_roles = any(
2894
+ isinstance(v, dict) and 'role' in v
2895
+ for v in scm_data.get('variables', [])
2896
+ )
2897
+
2898
+ if has_variables and (has_equations or has_roles):
2899
+ logger.info(f"Detected SCM structure with {len(scm_data.get('variables', []))} variables")
2900
+ return scm_data
2901
+
2902
+ except json.JSONDecodeError:
2903
+ # Try to extract and fix JSON
2904
+ try:
2905
+ # Remove comments and fix common issues
2906
+ json_str_clean = re.sub(r'//.*?$', '', json_str, flags=re.MULTILINE)
2907
+ json_str_clean = re.sub(r'/\*.*?\*/', '', json_str_clean, flags=re.DOTALL)
2908
+ scm_data = json.loads(json_str_clean)
2909
+
2910
+ if isinstance(scm_data, dict) and 'variables' in scm_data:
2911
+ logger.info(f"Detected SCM structure (after cleaning) with {len(scm_data.get('variables', []))} variables")
2912
+ return scm_data
2913
+ except json.JSONDecodeError:
2914
+ continue
2915
+
2916
+ return None
2917
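+
+ # Editor's sketch of a task string that passes SCM detection:
+ #   task = 'Simulate this SCM: {"variables": [{"id": "S", "role": "state"}],'
+ #          ' "equations": [{"defines": "S[t+1]", "parents": ["S[t]"]}]}'
+ #   self._detect_and_parse_json_scm(task)  # -> the parsed dict
+ #   # (has_variables and has_equations both hold, so detection succeeds)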
+
2918
+ def _parse_scm_to_graph(self, scm_data: Dict[str, Any]) -> Dict[str, Any]:
2919
+ """
2920
+ Parse structured SCM JSON into causal graph.
2921
+
2922
+ Args:
2923
+ scm_data: Parsed SCM dictionary
2924
+
2925
+ Returns:
2926
+ Dictionary with extracted variables, edges, and metadata
2927
+ """
2928
+ variables = []
2929
+ edges = []
2930
+ relationships = []
2931
+ variables_with_values = {}
2932
+
2933
+ # Extract variables
2934
+ var_list = scm_data.get('variables', [])
2935
+ for var_def in var_list:
2936
+ if isinstance(var_def, dict):
2937
+ var_id = var_def.get('id', '')
2938
+ var_role = var_def.get('role', '')
2939
+ if var_id:
2940
+ variables.append(var_id)
2941
+ # Store role information
2942
+ if var_role:
2943
+ variables_with_values[f"{var_id}_role"] = var_role
2944
+
2945
+ # Extract relationships from equations
2946
+ equations = scm_data.get('equations', [])
2947
+ for eq in equations:
2948
+ if isinstance(eq, dict):
2949
+ defines = eq.get('defines', '')
2950
+ parents = eq.get('parents', [])
2951
+
2952
+ # Extract variable name from defines (e.g., "S[t+1]" -> "S")
2953
+ defines_var = re.sub(r'\[.*?\]', '', defines).strip()
2954
+
2955
+ # Create edges from parents to defined variable
2956
+ for parent in parents:
2957
+ # Extract variable name from parent (e.g., "S[t]" -> "S", "alpha" -> "alpha")
2958
+ parent_var = re.sub(r'\[.*?\]', '', str(parent)).strip()
2959
+
2960
+ if defines_var and parent_var and defines_var != parent_var:
2961
+ # Only create edges between state variables (not parameters/constants)
2962
+ parent_role = None
2963
+ defines_role = None
2964
+
2965
+ # Find roles
2966
+ for var_def in var_list:
2967
+ if isinstance(var_def, dict):
2968
+ if var_def.get('id') == parent_var:
2969
+ parent_role = var_def.get('role', '')
2970
+ if var_def.get('id') == defines_var:
2971
+ defines_role = var_def.get('role', '')
2972
+
2973
+ # Only create causal edges (not parameter/constant relationships)
2974
+ # State variables can cause other state variables
2975
+ # Interventions can affect state variables
2976
+ # Exogenous can affect state variables
2977
+ if (defines_role in ['state', 'derived'] and
2978
+ parent_role in ['state', 'intervention', 'exogenous']):
2979
+ edges.append((parent_var, defines_var))
2980
+ relationships.append({
2981
+ 'source': parent_var,
2982
+ 'target': defines_var,
2983
+ 'type': 'causal',
2984
+ 'confidence': 1.0, # High confidence for explicit SCM
2985
+ 'negated': False,
2986
+ 'raw_source': parent_var,
2987
+ 'raw_target': defines_var,
2988
+ 'from_equation': eq.get('id', ''),
2989
+ 'equation': eq.get('expr', '')
2990
+ })
2991
+
2992
+ # Extract initial state values
2993
+ given = scm_data.get('given', {})
2994
+ initial_state = given.get('initial_state', {})
2995
+ for var, value in initial_state.items():
2996
+ var_name = re.sub(r'\[.*?\]', '', var).strip()
2997
+ if var_name in variables:
2998
+ variables_with_values[var_name] = str(value)
2999
+
3000
+ # Extract parameter values
3001
+ parameters = given.get('parameters', {})
3002
+ for param, value in parameters.items():
3003
+ if param in variables:
3004
+ variables_with_values[param] = str(value)
3005
+
3006
+ return {
3007
+ 'variables': variables,
3008
+ 'edges': edges,
3009
+ 'relationships': relationships,
3010
+ 'metadata': {
3011
+ 'variables_with_values': variables_with_values,
3012
+ 'scm_structure': True,
3013
+ 'task_id': scm_data.get('task_id', ''),
3014
+ 'equations_count': len(equations),
3015
+ 'variables_count': len(variables)
3016
+ }
3017
+ }
3018
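+
+ # Editor's sketch: only cross-variable parent links become edges; a variable's
+ # own lagged value (S[t] -> S[t+1]) collapses to 'S' -> 'S' and is dropped.
+ #   scm = {'variables': [{'id': 'S', 'role': 'state'}, {'id': 'u', 'role': 'intervention'}],
+ #          'equations': [{'id': 'eq1', 'defines': 'S[t+1]',
+ #                         'parents': ['S[t]', 'u[t]'], 'expr': 'S[t] + u[t]'}]}
+ #   self._parse_scm_to_graph(scm)['edges']  # -> [('u', 'S')]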
+
3019
+ def reason_hybrid(
3020
+ self,
3021
+ task: str,
3022
+ data: Optional[Any] = None,
3023
+ context: Optional[ConversationContext] = None
3024
+ ) -> Dict[str, Any]:
3025
+ """
3026
+ Execute graph-first hybrid reasoning pipeline with reasoning tracking.
3027
+
3028
+ Pipeline:
3029
+ 1. JSON/SCM detection and parsing (if structured input)
3030
+ 2. Text correction (non-destructive)
3031
+ 3. Language compilation (lexical -> grammatical -> pragmatic)
3032
+ 4. Symbolic extraction (to graph with provenance)
3033
+ 5. Graph-first reasoning (answer from graph state ONLY)
3034
+ 6. Natural language generation (from graph state)
3035
+ 7. Self-verification and error correction
3036
+ 8. Explanation generation
3037
+
3038
+ Args:
3039
+ task: Natural language task description or JSON SCM
3040
+ data: Optional pandas DataFrame for statistical inference
3041
+ context: Optional conversation context
3042
+
3043
+ Returns:
3044
+ Dictionary with reasoning results derived from graph state
3045
+ """
3046
+ # Create reasoning chain if tracking enabled
3047
+ if self.reasoning_tracker:
3048
+ self.reasoning_tracker.create_chain()
3049
+
3050
+ result = {
3051
+ 'task': task,
3052
+ 'intent': {},
3053
+ 'analysis': {},
3054
+ 'counterfactuals': [],
3055
+ 'statistical': {},
3056
+ 'graph_structure': '',
3057
+ 'recommendations': [],
3058
+ 'graph_first_answer': None,
3059
+ 'scm_parsed': False,
3060
+ 'reasoning_chain': None,
3061
+ 'explanation': None
3062
+ }
3063
+
3064
+ # Track reasoning step: SCM detection
3065
+ if self.reasoning_tracker:
3066
+ self.reasoning_tracker.add_step(
3067
+ step_type=StepType.EXTRACTION,
3068
+ operation="detect_json_scm",
3069
+ input_state={'task': task},
3070
+ output_state={},
3071
+ conclusion="SCM detection"
3072
+ )
3073
+
3074
+ # Step 0: Detect and parse JSON SCM if present
3075
+ scm_data = self._detect_and_parse_json_scm(task)
3076
+ if scm_data:
3077
+ logger.info("Detected structured JSON SCM - parsing directly")
3078
+ result['scm_parsed'] = True
3079
+ result['scm_data'] = scm_data
3080
+
3081
+ # Track reasoning step: SCM parsing
3082
+ if self.reasoning_tracker:
3083
+ self.reasoning_tracker.add_step(
3084
+ step_type=StepType.EXTRACTION,
3085
+ operation="parse_scm",
3086
+ input_state={'scm_data': scm_data},
3087
+ output_state={},
3088
+ conclusion="SCM parsed"
3089
+ )
3090
+
3091
+ # Parse SCM to graph structure
3092
+ scm_extraction = self._parse_scm_to_graph(scm_data)
3093
+ result['scm_extraction'] = scm_extraction
3094
+
3095
+ # Use SCM extraction instead of natural language extraction
3096
+ variables = scm_extraction.get('variables', [])
3097
+ relationships_extracted = scm_extraction.get('relationships', [])
3098
+ edges = scm_extraction.get('edges', [])
3099
+
3100
+ # All SCM variables are valid (they're explicitly defined)
3101
+ valid_variables = set(variables)
3102
+
3103
+ # Add edges directly from SCM (high confidence - explicit structure)
3104
+ edges_added = 0
3105
+ for rel in relationships_extracted:
3106
+ source = rel['source']
3107
+ target = rel['target']
3108
+
3109
+ self.symbolic_reasoner.graph_manager.add_relationship(
3110
+ source=source,
3111
+ target=target,
3112
+ strength=1.0,
3113
+ confidence=rel.get('confidence', 1.0),
3114
+ from_scm=True,
3115
+ equation_id=rel.get('from_equation', ''),
3116
+ equation_expr=rel.get('equation', '')
3117
+ )
3118
+ edges_added += 1
3119
+
3120
+ result['edges_added'] = edges_added
3121
+ result['scm_parsing_success'] = True
3122
+
3123
+ # Track reasoning step: Graph construction from SCM
3124
+ if self.reasoning_tracker:
3125
+ self.reasoning_tracker.add_step(
3126
+ step_type=StepType.TRANSFORMATION,
3127
+ operation="build_graph_from_scm",
3128
+ input_state={'variables': variables, 'relationships': relationships_extracted},
3129
+ output_state={'edges_added': edges_added},
3130
+ conclusion=f"Graph built with {edges_added} edges"
3131
+ )
3132
+
3133
+ # Skip natural language extraction for SCM
3134
+ # Continue to graph-first reasoning and analysis
3135
+ corrected_task = task # Keep original for display
3136
+ else:
3137
+ # Step 0: Text correction (non-destructive) - only for natural language
3138
+ corrected_task = task
3139
+ corrected_tokens = None
3140
+ if self.text_corrector:
3141
+ correction_result = self.text_corrector.correct_text(task)
3142
+ corrected_task = correction_result['corrected_text']
3143
+ corrected_tokens = correction_result['corrected_tokens']
3144
+ result['correction'] = {
3145
+ 'original': task,
3146
+ 'corrected': corrected_task,
3147
+ 'confidence': correction_result['confidence']
3148
+ }
3149
+
3150
+ # Step 0.5: Language compilation (lexical -> grammatical -> pragmatic)
3151
+ # Skip for SCM (already parsed)
3152
+ if not result.get('scm_parsed'):
3153
+ if self.lexical_compiler and self.grammatical_compiler:
3154
+ # Normalize terms using lexical compiler
3155
+ if corrected_tokens:
3156
+ for token in corrected_tokens:
3157
+ normalized = self.lexical_compiler.normalize_term(token.normalized_form)
3158
+ if normalized != token.normalized_form:
3159
+ token.normalized_form = normalized
3160
+
3161
+ # Optional: Validate word using dictionary (helps filter invalid terms)
3162
+ # This is a soft check - we don't reject words that aren't in dictionary
3163
+ # as domain-specific terms may not be in standard dictionaries
3164
+ if self.lexical_compiler.enable_dictionary:
3165
+ word_info = self.lexical_compiler.get_word_info(token.normalized_form)
3166
+ if word_info:
3167
+ # Store word info for later use (part of speech, synonyms, etc.)
3168
+ token.metadata = word_info
3169
+
3170
+ # Parse grammatical structure
3171
+ if self.grammatical_compiler:
3172
+ parse_tree = self.grammatical_compiler.parse_dependencies(corrected_task)
3173
+ causal_structure = self.grammatical_compiler.extract_causal_structure(parse_tree)
3174
+ if causal_structure:
3175
+ result['causal_structure'] = {
3176
+ 'cause': causal_structure.cause,
3177
+ 'effect': causal_structure.effect,
3178
+ 'relation_type': causal_structure.relation_type,
3179
+ 'confidence': causal_structure.confidence
3180
+ }
3181
+
3182
+ # Step 0.6: Parse task intent
3183
+ intent = self._parse_task_intent(corrected_task)
3184
+ result['intent'] = intent
3185
+
3186
+ # Step 1: Advanced symbolic extraction (use corrected task) OR use SCM extraction
3187
+ if result.get('scm_parsed'):
3188
+ # Use SCM extraction (already done above and stored in result)
3189
+ extraction = result.get('scm_extraction', {})
3190
+ variables = extraction.get('variables', [])
3191
+ relationships_extracted = extraction.get('relationships', [])
3192
+ edges = extraction.get('edges', [])
3193
+ valid_variables = set(variables) # All SCM variables are valid
3194
+ logger.info(f"Using SCM extraction: {len(variables)} variables, {len(relationships_extracted)} relationships")
3195
+ else:
3196
+ # Natural language extraction with few-shot learning (if enabled)
3197
+ # Check if we have learned patterns to use
3198
+ if hasattr(self.symbolic_reasoner, 'adaptive_extractor') and self.symbolic_reasoner.adaptive_extractor:
3199
+ # Try adaptive extraction first
3200
+ try:
3201
+ adaptive_result = self.symbolic_reasoner.adaptive_extractor.adapt_extraction(corrected_task)
3202
+ if adaptive_result.get('variables'):
3203
+ # Use adaptive extraction result
3204
+ extraction = {
3205
+ 'variables': adaptive_result.get('variables', []),
3206
+ 'edges': adaptive_result.get('edges', []),
3207
+ 'relationships': []
3208
+ }
3209
+ # Convert edges to relationships format
3210
+ for source, target in adaptive_result.get('edges', []):
3211
+ extraction['relationships'].append({
3212
+ 'source': source,
3213
+ 'target': target,
3214
+ 'type': 'causal',
3215
+ 'confidence': adaptive_result.get('confidence', 0.8)
3216
+ })
3217
+ else:
3218
+ # Fall back to standard extraction
3219
+ extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
3220
+ except Exception as e:
3221
+ logger.warning(f"Adaptive extraction failed: {e}, falling back to standard extraction")
3222
+ extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
3223
+ else:
3224
+ # Standard extraction
3225
+ extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
3226
+
3227
+ variables = extraction.get('variables', [])
3228
+ relationships_extracted = extraction.get('relationships', [])
3229
+ edges = extraction.get('edges', [])
3230
+
3231
+ # Track reasoning step: Variable extraction
3232
+ if self.reasoning_tracker:
3233
+ self.reasoning_tracker.add_step(
3234
+ step_type=StepType.EXTRACTION,
3235
+ operation="extract_variables",
3236
+ input_state={'task': corrected_task},
3237
+ output_state={'variables': variables, 'relationships': len(relationships_extracted)},
3238
+ conclusion=f"Extracted {len(variables)} variables"
3239
+ )
3240
+
3241
+ # valid_variables will be set below after filtering
3242
+
3243
+ # CRITICAL: Filter variables before graph construction - only use valid variables
3244
+ # This prevents treating action verbs and epistemic terms as causal variables
3245
+ # BUT: Skip filtering for SCM (all SCM variables are explicitly defined and valid)
3246
+ if not result.get('scm_parsed'):
3247
+ valid_variables = {v for v in variables if self.symbolic_reasoner._clean_variable(v)}
3248
+ # For general knowledge tasks, be more permissive with filtering
3249
+ current_graph_type = self.symbolic_reasoner.graph_manager.graph_type
3250
+ if current_graph_type in ['knowledge', 'mixed']:
3251
+ # For general knowledge: Only filter action verbs/epistemic terms, keep entities
3252
+ filtered = set()
3253
+ for var in valid_variables:
3254
+ if not self.symbolic_reasoner._is_action_verb(var) and not self.symbolic_reasoner._is_epistemic_term(var):
3255
+ cleaned = self.symbolic_reasoner._clean_variable(var)
3256
+ if cleaned and cleaned not in self.symbolic_reasoner.stop_words:
3257
+ filtered.add(cleaned)
3258
+ valid_variables = filtered
3259
+ else:
3260
+ # For causal tasks: Strict filtering
3261
+ valid_variables = self.symbolic_reasoner._filter_valid_variables(valid_variables)
3262
+
3263
+ # Enhanced epistemic validation with GroundingValidator
3264
+ # Check epistemic grounding: ∀v ∈ V, ∃ path from observables O to v
3265
+ observable_variables = valid_variables.copy() # For now, all valid variables are considered observable
3266
+ if self.consistency_checker:
3267
+ graph_state_temp = {
3268
+ 'nodes': list(valid_variables),
3269
+ 'edges': edges
3270
+ }
3271
+ all_grounded, ungrounded = self.consistency_checker.verify_epistemic_grounding(
3272
+ graph_state_temp,
3273
+ observable_variables
3274
+ )
3275
+ if not all_grounded and ungrounded:
3276
+ logger.warning(f"Ungrounded variables detected: {ungrounded}")
3277
+
3278
+ # Epistemic validation: Check if we have sufficient grounding
3279
+ # If task is too vague (no explicit causal relationships, only action verbs/epistemic terms),
3280
+ # we should warn or reject
3281
+ has_explicit_causal_structure = len(relationships_extracted) > 0 or len(edges) > 0
3282
+ has_valid_state_variables = len(valid_variables) > 0
3283
+
3284
+ # Check if variables are mostly action verbs/epistemic terms (bad sign)
3285
+ action_verb_count = sum(1 for v in variables if self.symbolic_reasoner._is_action_verb(v))
3286
+ epistemic_term_count = sum(1 for v in variables if self.symbolic_reasoner._is_epistemic_term(v))
3287
+ total_vars = len(variables)
3288
+
3289
+ if total_vars > 0:
3290
+ action_epistemic_ratio = (action_verb_count + epistemic_term_count) / total_vars
3291
+ if action_epistemic_ratio > 0.5 and not has_explicit_causal_structure:
3292
+ result['epistemic_warning'] = (
3293
+ f"Warning: Task appears to contain mostly action verbs or epistemic terms "
3294
+ f"({action_verb_count + epistemic_term_count}/{total_vars} variables), "
3295
+ f"not causal state variables. Causal relationships cannot be inferred from "
3296
+ f"intent statements alone. Please provide explicit state variables and "
3297
+ f"causal relationships, or an existing SCM with logged policy decisions."
3298
+ )
3299
+ logger.warning(result['epistemic_warning'])
3300
+
3301
+ # Determine task type (causal vs general knowledge)
3302
+ task_intent = self._parse_task_intent(task)
3303
+
3304
+ # Check relationships_extracted for categories (if available)
3305
+ relationships_categories = [rel.get('category', 'causal') for rel in relationships_extracted if isinstance(rel, dict)]
3306
+ has_causal_rels = any(cat == 'causal' for cat in relationships_categories)
3307
+ has_general_rels = any(cat in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual'] for cat in relationships_categories)
3308
+
3309
+ graph_type = self.symbolic_reasoner.graph_manager.graph_type
3310
+ is_causal_task = (
3311
+ task_intent.get('type') in ['analysis', 'prediction', 'counterfactual', 'comparison'] or
3312
+ has_causal_rels or
3313
+ graph_type == 'causal'
3314
+ )
3315
+ is_general_knowledge_task = (
3316
+ task_intent.get('type') in ['definition', 'person_query', 'location_query', 'temporal_query', 'explanation'] or
3317
+ has_general_rels or
3318
+ graph_type in ['knowledge', 'mixed']
3319
+ )
3320
+
3321
+ # Epistemic validation only applies to causal tasks
3322
+ if is_causal_task and not is_general_knowledge_task:
3323
+ # If no valid state variables after filtering, task is epistemically underspecified
3324
+ if not has_valid_state_variables and not has_explicit_causal_structure:
3325
+ result['epistemic_error'] = (
3326
+ "Task is epistemically underspecified. No valid causal state variables were "
3327
+ "extracted. A CRCA agent requires:\n"
3328
+ "- Explicit state variables (not action verbs like 'identify' or epistemic terms like 'policy')\n"
3329
+ "- Transition relations (causal relationships between variables)\n"
3330
+ "- Intervention hooks (variables that can be manipulated)\n"
3331
+ "- Optionally: An existing SCM, logged policy decisions, defined collapse predicates\n\n"
3332
+ "Please provide a task with explicit causal structure, not just intent statements."
3333
+ )
3334
+ logger.error(result['epistemic_error'])
3335
+ # Don't proceed with graph construction if we have no valid variables
3336
+ result['analysis'] = {
3337
+ 'variables': [],
3338
+ 'relationships': [],
3339
+ 'graph_structure': 'No valid causal structure extracted',
3340
+ 'insights': [result['epistemic_error']],
3341
+ 'epistemic_underspecified': True
3342
+ }
3343
+ return result
3344
+
3345
+ # Add edges to graph with confidence scores
3346
+ # For SCM, all relationships are already added above, so skip this section
3347
+ if not result.get('scm_parsed'):
3348
+ edges_added = 0
3349
+ for rel in relationships_extracted:
3350
+ if not rel.get('negated', False): # Skip negated relationships
3351
+ # For general knowledge, be more permissive with cleaning
3352
+ # Don't filter out entities just because they're short or not in keywords
3353
+ rel_category = rel.get('category', 'causal')
3354
+ if rel_category != 'causal':
3355
+ # For general knowledge: Just normalize, don't filter
3356
+ source_clean = self.symbolic_reasoner._normalize_variable_name(rel['source'])
3357
+ target_clean = self.symbolic_reasoner._normalize_variable_name(rel['target'])
3358
+ # Remove stop words but keep the variable
3359
+ if source_clean in self.symbolic_reasoner.stop_words:
3360
+ source_clean = None
3361
+ if target_clean in self.symbolic_reasoner.stop_words:
3362
+ target_clean = None
3363
+ else:
3364
+ # For causal: Use strict cleaning
3365
+ source_clean = self.symbolic_reasoner._clean_variable(rel['source'])
3366
+ target_clean = self.symbolic_reasoner._clean_variable(rel['target'])
3367
+
3368
+ if not source_clean or not target_clean:
3369
+ continue
3370
+
3371
+ rel_type = rel.get('type', 'causal')
3372
+
3373
+ # For causal relationships: Reject action verbs/epistemic terms (strict validation)
3374
+ if rel_category == 'causal':
3375
+ if self.symbolic_reasoner._is_action_verb(source_clean):
3376
+ logger.warning(f"Rejected causal relationship: source '{source_clean}' is an action verb")
3377
+ continue
3378
+ if self.symbolic_reasoner._is_action_verb(target_clean):
3379
+ logger.warning(f"Rejected causal relationship: target '{target_clean}' is an action verb")
3380
+ continue
3381
+ if self.symbolic_reasoner._is_epistemic_term(source_clean):
3382
+ logger.warning(f"Rejected causal relationship: source '{source_clean}' is an epistemic term")
3383
+ continue
3384
+ if self.symbolic_reasoner._is_epistemic_term(target_clean):
3385
+ logger.warning(f"Rejected causal relationship: target '{target_clean}' is an epistemic term")
3386
+ continue
3387
+
3388
+ # Only add if both are valid state variables for causal relationships
3389
+ if (source_clean in valid_variables and target_clean in valid_variables):
3390
+ # Validate causal relationship using do-calculus
3391
+ graph_state_temp = {
3392
+ 'nodes': list(valid_variables),
3393
+ 'edges': [(s, t) for s, t in self.symbolic_reasoner.graph_manager.get_edges()]
3394
+ }
3395
+ is_valid_causal, causal_error = self.symbolic_reasoner.validate_causal_relationship(
3396
+ source_clean,
3397
+ target_clean,
3398
+ graph_state_temp
3399
+ )
3400
+
3401
+ if is_valid_causal:
3402
+ self.symbolic_reasoner.graph_manager.add_relationship(
3403
+ source=source_clean,
3404
+ target=target_clean,
3405
+ strength=1.0,
3406
+ relation_type=rel_type,
3407
+ confidence=rel.get('confidence', 0.8),
3408
+ category=rel_category
3409
+ )
3410
+ edges_added += 1
3411
+ else:
3412
+ logger.debug(f"Skipping invalid causal relationship {source_clean} -> {target_clean}: {causal_error}")
3413
+
3414
+ # For general knowledge relationships: More permissive (allow entities, concepts, etc.)
3415
+ else:
3416
+ # Add to valid_variables if not already there (general knowledge can include new entities)
3417
+ if source_clean not in valid_variables:
3418
+ valid_variables.add(source_clean)
3419
+ if target_clean not in valid_variables:
3420
+ valid_variables.add(target_clean)
3421
+
3422
+ # Add relationship (no strict validation for general knowledge)
3423
+ self.symbolic_reasoner.graph_manager.add_relationship(
3424
+ source=source_clean,
3425
+ target=target_clean,
3426
+ strength=1.0,
3427
+ relation_type=rel_type,
3428
+ confidence=rel.get('confidence', 0.8),
3429
+ category=rel_category
3430
+ )
3431
+ edges_added += 1
3432
+ else:
3433
+ # For SCM, edges were already added above
3434
+ edges_added = result.get('edges_added', 0)
3435
+
3436
+ # Add direct edges
3437
+ # Skip for SCM (already added)
3438
+ if not result.get('scm_parsed'):
3439
+ for source, target in edges:
3440
+ source_clean = self.symbolic_reasoner._clean_variable(source)
3441
+ target_clean = self.symbolic_reasoner._clean_variable(target)
3442
+
3443
+ if not source_clean or not target_clean:
3444
+ continue
3445
+
3446
+ # Determine relationship category from context
3447
+ # If graph type is causal, apply strict filtering
3448
+ # If graph type is knowledge/mixed, be more permissive
3449
+ graph_type = self.symbolic_reasoner.graph_manager.graph_type
3450
+ if graph_type == 'causal':
3451
+ # For causal graphs: Reject action verbs/epistemic terms
3452
+ if (self.symbolic_reasoner._is_action_verb(source_clean) or
3453
+ self.symbolic_reasoner._is_epistemic_term(source_clean) or
3454
+ self.symbolic_reasoner._is_action_verb(target_clean) or
3455
+ self.symbolic_reasoner._is_epistemic_term(target_clean)):
3456
+ continue
3457
+
3458
+ # Only add if both are valid state variables
3459
+ if (source_clean in valid_variables and target_clean in valid_variables):
3460
+ self.symbolic_reasoner.graph_manager.add_relationship(
3461
+ source=source_clean,
3462
+ target=target_clean,
3463
+ strength=1.0,
3464
+ relation_type='causal',
3465
+ confidence=0.8,
3466
+ category='causal'
3467
+ )
3468
+ edges_added += 1
3469
+ else:
3470
+ # For knowledge/mixed graphs: More permissive
3471
+ if source_clean not in valid_variables:
3472
+ valid_variables.add(source_clean)
3473
+ if target_clean not in valid_variables:
3474
+ valid_variables.add(target_clean)
3475
+
3476
+ self.symbolic_reasoner.graph_manager.add_relationship(
3477
+ source=source_clean,
3478
+ target=target_clean,
3479
+ strength=1.0,
3480
+ relation_type='related',
3481
+ confidence=0.8,
3482
+ category='general'
3483
+ )
3484
+ edges_added += 1
3485
+
3486
+ # Track if we actually added any edges
3487
+ result['edges_added'] = edges_added
3488
+ if edges_added == 0 and len(relationships_extracted) > 0:
3489
+ result['epistemic_warning'] = (
3490
+ "No valid causal relationships were added to the graph. All extracted relationships "
3491
+ "involved action verbs or epistemic terms rather than causal state variables. "
3492
+ "Please provide explicit state variables and causal relationships."
3493
+ )
3494
+
3495
+ # Infer additional structure if needed (with context) - only for valid variables
3496
+ # Skip for SCM (structure is explicit)
3497
+ if not result.get('scm_parsed') and not edges and valid_variables:
3498
+ inferred_edges = self.symbolic_reasoner.infer_causal_structure(list(valid_variables), context=task)
3499
+ for source, target in inferred_edges:
3500
+ source_clean = self.symbolic_reasoner._clean_variable(source)
3501
+ target_clean = self.symbolic_reasoner._clean_variable(target)
3502
+ # Only add if both are valid
3503
+ if (source_clean and target_clean and
3504
+ source_clean in valid_variables and
3505
+ target_clean in valid_variables):
3506
+ self.symbolic_reasoner.graph_manager.add_relationship(
3507
+ source=source_clean,
3508
+ target=target_clean,
3509
+ strength=0.5,
3510
+ confidence=0.5
3511
+ )
3512
+
3513
+ # Validate graph with consistency checker
3514
+ if self.consistency_checker:
3515
+ graph_state = {
3516
+ 'nodes': list(valid_variables),
3517
+ 'edges': [(s, t) for s, t in self.symbolic_reasoner.graph_manager.get_edges()
3518
+ if s in valid_variables and t in valid_variables]
3519
+ }
3520
+ is_consistent, consistency_error = self.consistency_checker.verify_consistency(graph_state)
3521
+ if not is_consistent:
3522
+ logger.warning(f"Graph consistency check failed: {consistency_error}")
3523
+ # Try to correct if self_corrector available
3524
+ if self.self_corrector:
3525
+ errors = [{'type': 'inconsistency', 'message': consistency_error, 'graph': graph_state}]
3526
+ corrections = self.self_corrector.correct_errors(errors, graph_state)
3527
+ if corrections:
3528
+ logger.info(f"Applied {len(corrections)} corrections")
3529
+
3530
+ # Validate graph
3531
+ is_valid, error = self.symbolic_reasoner.validate_causal_graph()
3532
+ if not is_valid:
3533
+ logger.warning(f"Graph validation failed: {error}")
3534
+
3535
+ # Track reasoning step: Graph validation
3536
+ if self.reasoning_tracker:
3537
+ self.reasoning_tracker.add_step(
3538
+ step_type=StepType.VALIDATION,
3539
+ operation="validate_graph",
3540
+ input_state={'graph': graph_state if self.consistency_checker else {}},
3541
+ output_state={'is_valid': is_valid, 'error': error},
3542
+ conclusion="Graph validated" if is_valid else f"Graph validation failed: {error}"
3543
+ )
3544
+
3545
+ # Step 2: Statistical fitting (if data available)
3546
+ if data is not None and PANDAS_AVAILABLE:
3547
+ try:
3548
+ # Only use valid variables for statistical fitting
3549
+ graph_nodes = self.symbolic_reasoner.graph_manager.get_nodes()
3550
+ all_variables = [v for v in graph_nodes if v in valid_variables]
3551
+ if all_variables:
3552
+ self.statistical_engine.fit_from_dataframe(
3553
+ df=data,
3554
+ variables=all_variables,
3555
+ window=min(30, len(data)),
3556
+ decay_alpha=0.9
3557
+ )
3558
+
3559
+ # Quantify uncertainty
3560
+ uncertainty = self.statistical_engine.quantify_uncertainty(
3561
+ df=data,
3562
+ variables=all_variables,
3563
+ windows=min(200, len(data))
3564
+ )
3565
+
3566
+ result['statistical'] = {
3567
+ 'edge_strengths': {
3568
+ (s, t): self.statistical_engine.assess_causal_strength(s, t)
3569
+ for s, t in self.symbolic_reasoner.graph_manager.get_edges()
3570
+ },
3571
+ 'uncertainty': uncertainty,
3572
+ 'confidence_intervals': uncertainty.get('edge_intervals', {})
3573
+ }
3574
+ except Exception as e:
3575
+ logger.warning(f"Statistical fitting failed: {e}")
3576
+
3577
+ # Step 3: Build comprehensive analysis result
3578
+ # Only include relationships between valid variables
3579
+ graph_nodes_all = self.symbolic_reasoner.graph_manager.get_nodes()
3580
+ graph_edges_all = self.symbolic_reasoner.graph_manager.get_edges()
3581
+
3582
+ # Filter to only valid variables
3583
+ graph_nodes = [n for n in graph_nodes_all if n in valid_variables]
3584
+ graph_edges = [(s, t) for s, t in graph_edges_all if s in valid_variables and t in valid_variables]
3585
+
3586
+ relationships = []
3587
+ for source, target in graph_edges:
3588
+ edge_data = self.symbolic_reasoner.graph_manager.graph.get(source, {}).get(target, {})
3589
+ strength = self.statistical_engine.assess_causal_strength(source, target) if edge_data.get('category') == 'causal' else 1.0
3590
+ confidence = edge_data.get('confidence', 0.8)
3591
+ relation_type = edge_data.get('relation_type', 'causal')
3592
+ category = edge_data.get('category', 'causal') # Get category from edge metadata
3593
+ relationships.append({
3594
+ 'source': source,
3595
+ 'target': target,
3596
+ 'type': relation_type,
3597
+ 'category': category,
3598
+ 'strength': strength,
3599
+ 'confidence': confidence
3600
+ })
3601
+
3602
+ # Determine relationship type for graph structure description
3603
+ has_causal = any(rel.get('category') == 'causal' for rel in relationships)
3604
+ has_general = any(rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual'] for rel in relationships)
3605
+
3606
+ if has_general and not has_causal:
3607
+ rel_type_label = "knowledge relationships"
3608
+ elif has_causal:
3609
+ rel_type_label = "causal relationships"
3610
+ else:
3611
+ rel_type_label = "relationships"
3612
+
3613
+ # Generate graph structure description (only valid variables)
3614
+ graph_structure = f"Nodes: {', '.join(sorted(graph_nodes))}\nEdges: {len(graph_edges)} {rel_type_label}"
3615
+
3616
+ # Generate insights based on intent
3617
+ insights = []
3618
+ if relationships:
3619
+ # Strongest relationship
3620
+ strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)))
3621
+ category = strongest.get('category', 'causal')
3622
+ if category == 'causal':
3623
+ insights.append(
3624
+ f"Strongest causal relationship: {strongest['source']} -> {strongest['target']} "
3625
+ f"(strength: {strongest['strength']:.3f}, confidence: {strongest['confidence']:.2f})"
3626
+ )
3627
+ elif category == 'taxonomic':
3628
+ insights.append(
3629
+ f"Taxonomic relationship: {strongest['source']} is a type of {strongest['target']} "
3630
+ f"(confidence: {strongest['confidence']:.2f})"
3631
+ )
3632
+ else:
3633
+ insights.append(
3634
+ f"Strongest relationship: {strongest['source']} -> {strongest['target']} "
3635
+ f"(type: {category}, confidence: {strongest['confidence']:.2f})"
3636
+ )
3637
+
3638
+ # Most connected variable
3639
+ node_degrees = defaultdict(int)
3640
+ for rel in relationships:
3641
+ node_degrees[rel['source']] += 1
3642
+ node_degrees[rel['target']] += 1
3643
+ if node_degrees:
3644
+ most_connected = max(node_degrees.items(), key=lambda x: x[1])
3645
+ insights.append(
3646
+ f"Most connected variable: {most_connected[0]} ({most_connected[1]} relationships)"
3647
+ )
3648
+
3649
+ # Generate recommendations if requested
3650
+ recommendations = []
3651
+ if intent['type'] == 'recommendation' and relationships:
3652
+ # Find variables with high out-degree (causes) that could be intervened on
3653
+ out_degrees = defaultdict(int)
3654
+ for rel in relationships:
3655
+ out_degrees[rel['source']] += abs(rel['strength'])
3656
+
3657
+ if out_degrees:
3658
+ top_levers = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)[:3]
3659
+ for var, total_effect in top_levers:
3660
+ recommendations.append(
3661
+ f"Consider intervening on '{var}' - it has strong causal effects on multiple outcomes"
3662
+ )
3663
+
3664
+ # Parse extracted values to create factual state (used for both analysis and counterfactuals)
3665
+ # Both the SCM path and natural-language extraction store extracted
3666
+ # values in the extraction metadata, so one lookup covers both cases
3667
+ variables_with_values = extraction.get('metadata', {}).get('variables_with_values', {})
3670
+ extracted_values = self._parse_extracted_values(variables_with_values)
3671
+
3672
+ # Create factual state using extracted values, fallback to 0.0
3673
+ factual_state = {}
3674
+ if graph_nodes:
3675
+ for var in graph_nodes:
3676
+ # Use extracted value if available, otherwise 0.0
3677
+ # Try exact match first
3678
+ if var in extracted_values:
3679
+ factual_state[var] = extracted_values[var]
3680
+ else:
3681
+ # Try partial match (e.g., "product price" matches "product price")
3682
+ matched = False
3683
+ for extracted_var, value in extracted_values.items():
3684
+ # Normalize both for comparison
3685
+ var_normalized = var.lower().replace(' ', '')
3686
+ extracted_var_normalized = extracted_var.lower().replace(' ', '')
3687
+ if var_normalized in extracted_var_normalized or extracted_var_normalized in var_normalized:
3688
+ factual_state[var] = value
3689
+ matched = True
3690
+ break
3691
+ if not matched:
3692
+ factual_state[var] = 0.0
3693
+
3694
+ result['analysis'] = {
3695
+ 'variables': sorted(list(graph_nodes)), # Only valid variables
3696
+ 'relationships': relationships, # Only valid relationships
3697
+ 'graph_structure': graph_structure,
3698
+ 'insights': insights,
3699
+ 'extraction_metadata': extraction.get('metadata', {}),
3700
+ 'factual_state': factual_state
3701
+ }
3702
+ result['recommendations'] = recommendations
3703
+
3704
+ # Step 4: Graph-first reasoning (answer from graph state ONLY)
3705
+ if self.graph_first_reasoner and graph_nodes:
3706
+ try:
3707
+ graph_state = {
3708
+ 'nodes': graph_nodes,
3709
+ 'edges': graph_edges,
3710
+ 'edge_data': {
3711
+ (s, t): self.symbolic_reasoner.graph_manager.graph.get(s, {}).get(t, {})
3712
+ for s, t in graph_edges
3713
+ }
3714
+ }
3715
+
3716
+ # Track reasoning step: Graph-first reasoning
3717
+ if self.reasoning_tracker:
3718
+ self.reasoning_tracker.add_step(
3719
+ step_type=StepType.INFERENCE,
3720
+ operation="graph_first_reasoning",
3721
+ input_state={'graph_state': graph_state, 'query': corrected_task},
3722
+ output_state={},
3723
+ conclusion="Graph-first reasoning"
3724
+ )
3725
+
3726
+ # Reason from graph state only
3727
+ graph_answer = self.graph_first_reasoner.reason_from_graph_state(
3728
+ state=graph_state,
3729
+ query=corrected_task,
3730
+ graph_manager=self.symbolic_reasoner.graph_manager
3731
+ )
3732
+ result['graph_first_answer'] = graph_answer
3733
+
3734
+ # Track reasoning step: Graph answer
3735
+ if self.reasoning_tracker and graph_answer.get('answer'):
3736
+ self.reasoning_tracker.add_step(
3737
+ step_type=StepType.INFERENCE,
3738
+ operation="graph_answer",
3739
+ input_state={},
3740
+ output_state={'answer': graph_answer.get('answer')},
3741
+ conclusion=graph_answer.get('answer', '')
3742
+ )
3743
+
3744
+ # If graph-first reasoning provides an answer, use it
3745
+ if graph_answer.get('answer'):
3746
+ result['analysis']['graph_first_insight'] = graph_answer['answer']
3747
+ except Exception as e:
3748
+ logger.warning(f"Graph-first reasoning failed: {e}")
3749
+
3750
+ # Step 5: Generate counterfactuals (if requested or if we have a state)
3751
+ if intent['type'] == 'counterfactual' or (graph_nodes and intent['type'] != 'extraction'):
3752
+
3753
+ # Use intervention variables from intent if available
3754
+ # Prefer state variables (variables with extracted values) over question targets
3755
+ target_vars = intent.get('intervention_variables', [])
3756
+ if not target_vars:
3757
+ # Use variables that have extracted values (state variables)
3758
+ state_vars = [v for v in graph_nodes if v in extracted_values or any(v in k for k in extracted_values.keys())]
3759
+ if state_vars:
3760
+ target_vars = state_vars[:3] # Use first 3 state variables
3761
+ else:
3762
+ target_vars = [v for v in graph_nodes if 'expected' not in v.lower()][:3] # Exclude question targets
3763
+
3764
+ # Filter target_vars to only valid variables
3765
+ target_vars = [v for v in target_vars if v in graph_nodes]
3766
+
3767
+ if target_vars:
3768
+ try:
3769
+ counterfactuals = self.statistical_engine.generate_probabilistic_counterfactuals(
3770
+ factual_state=factual_state,
3771
+ target_variables=target_vars,
3772
+ n_scenarios=min(5, len(target_vars) + 2)
3773
+ )
3774
+ result['counterfactuals'] = counterfactuals
3775
+ except Exception as e:
3776
+ logger.warning(f"Counterfactual generation failed: {e}")
3777
+
3778
+ # Step 6: Apply pragmatic layer for response generation
3779
+ if self.pragmatic_compiler and result.get('analysis'):
3780
+ # Determine confidence and complexity for pragmatic decisions
3781
+ avg_confidence = sum([r.get('confidence', 0.8) for r in relationships]) / len(relationships) if relationships else 0.8
3782
+ complexity = len(graph_nodes) + len(graph_edges)
3783
+
3784
+ register = self.pragmatic_compiler.select_register(avg_confidence, complexity)
3785
+ result['pragmatic'] = {
3786
+ 'register': register,
3787
+ 'hedging': self.pragmatic_compiler.generate_hedging(avg_confidence),
3788
+ 'explicitness': self.pragmatic_compiler.adjust_explicitness(len(graph_nodes))
3789
+ }
3790
+ else:
3791
+ # Default pragmatic info if compiler not available
3792
+ result['pragmatic'] = {
3793
+ 'register': 'neutral',
3794
+ 'hedging': 'likely',
3795
+ 'explicitness': 3
3796
+ }
3797
+
3798
+ # Step 7: Self-verification and error detection
3799
+ if self.error_detector and self.reasoning_tracker and self.reasoning_tracker.current_chain:
3800
+ reasoning_chain = self.reasoning_tracker.current_chain
3801
+ graph_state = {
3802
+ 'nodes': graph_nodes,
3803
+ 'edges': graph_edges
3804
+ }
3805
+ errors = self.error_detector.detect_errors(reasoning_chain, graph_state)
3806
+ if errors:
3807
+ result['errors'] = errors
3808
+ logger.warning(f"Detected {len(errors)} errors")
3809
+
3810
+ # Try to correct errors
3811
+ if self.self_corrector:
3812
+ corrections = self.self_corrector.correct_errors(errors, graph_state)
3813
+ if corrections:
3814
+ result['corrections'] = corrections
3815
+ logger.info(f"Applied {len(corrections)} corrections")
3816
+
3817
+ # Step 8: Generate explanations
3818
+ if self.explanation_builder and self.reasoning_tracker and self.reasoning_tracker.current_chain:
3819
+ reasoning_chain = self.reasoning_tracker.current_chain
3820
+ graph_state = {
3821
+ 'nodes': graph_nodes,
3822
+ 'edges': graph_edges
3823
+ }
3824
+ explanation = self.explanation_builder.generate_explanation(reasoning_chain, graph_state)
3825
+ result['explanation'] = explanation
3826
+
3827
+ # Step 9: Transparency layer
3828
+ if self.transparency_layer and self.reasoning_tracker and self.reasoning_tracker.current_chain:
3829
+ reasoning_chain = self.reasoning_tracker.current_chain
3830
+ trace = self.transparency_layer.show_reasoning_trace(reasoning_chain)
3831
+ confidence_viz = self.transparency_layer.visualize_confidence(reasoning_chain)
3832
+ graph_explanation = self.transparency_layer.explain_graph_structure({
3833
+ 'nodes': graph_nodes,
3834
+ 'edges': graph_edges
3835
+ })
3836
+ result['transparency'] = {
3837
+ 'trace': trace,
3838
+ 'confidence': confidence_viz,
3839
+ 'graph_structure': graph_explanation
3840
+ }
3841
+
3842
+ # Store reasoning chain in result
3843
+ if self.reasoning_tracker and self.reasoning_tracker.current_chain and result:
3844
+ result['reasoning_chain'] = self.reasoning_tracker.current_chain
3845
+ # Mark chain as successful if we have results
3846
+ if result.get('analysis') or result.get('graph_first_answer'):
3847
+ self.reasoning_tracker.current_chain.success = True
3848
+ graph_answer = result.get('graph_first_answer', {})
3849
+ if graph_answer:
3850
+ self.reasoning_tracker.current_chain.final_conclusion = graph_answer.get('answer')
3851
+
3852
+ # Step 10: Validate result completeness
3853
+ self._validate_result(result)
3854
+
3855
+ return result
3856
+
3857
+ def _validate_result(self, result: Dict[str, Any]) -> None:
3858
+ """
3859
+ Validate result completeness and add warnings if needed.
3860
+
3861
+ Args:
3862
+ result: Result dictionary to validate
3863
+ """
3864
+ warnings = []
3865
+
3866
+ # Check if we have any meaningful analysis
3867
+ analysis = result.get('analysis', {})
3868
+ if not analysis.get('variables') and not analysis.get('relationships'):
3869
+ warnings.append("No variables or relationships were extracted from the input.")
3870
+
3871
+ # Check graph-first answer quality
3872
+ graph_answer = result.get('graph_first_answer', {})
3873
+ if not graph_answer or not graph_answer.get('answer'):
3874
+ warnings.append("Graph-first reasoning did not produce a clear answer.")
3875
+
3876
+ # Check counterfactuals if requested
3877
+ intent = result.get('intent', {})
3878
+ if intent.get('type') == 'counterfactual' and not result.get('counterfactuals'):
3879
+ warnings.append("Counterfactual analysis was requested but none were generated.")
3880
+
3881
+ if warnings:
3882
+ result['warnings'] = warnings
3883
+ logger.warning(f"Result validation warnings: {warnings}")
3884
+
3885
+
3886
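For reference, the `result` dictionary assembled by `reason_hybrid()` and checked by `_validate_result()` ends up with roughly the following shape. This is a sketch reconstructed only from the keys written in the code above; the values are illustrative placeholders:

    result = {
        'task': '...',                                      # original task text
        'intent': {'type': 'counterfactual'},               # parsed intent
        'analysis': {'variables': [...],
                     'relationships': [...],
                     'graph_first_insight': '...'},          # graph-first step
        'graph_first_answer': {'answer': '...'},             # graph-first step
        'counterfactuals': [...],                            # Step 5 (optional)
        'pragmatic': {'register': 'neutral',
                      'hedging': 'likely',
                      'explicitness': 3},                    # Step 6
        'errors': [...], 'corrections': [...],               # Step 7 (optional)
        'explanation': '...', 'transparency': {...},         # Steps 8-9 (optional)
        'reasoning_chain': ...,                              # stored before validation
        'warnings': ['...'],                                 # added by _validate_result
    }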
+ class HybridAgent:
+     """
+     Main hybrid agent class with graph-first reasoning architecture.
+
+     Supports both causal reasoning (CRCA) and general knowledge tasks.
+
+     New Architecture:
+         Text Input -> TextCorrector -> LanguageCompiler -> SymbolicReasoner
+         -> GraphManager -> GraphFirstReasoner -> RuleBasedNLG -> Response
+
+     No LLM dependency - pure symbolic-statistical reasoning with graph-first architecture.
+
+     Graph Types Supported:
+     - "causal": Causal relationships (default for CRCA tasks)
+     - "knowledge": General knowledge graphs (facts, definitions, taxonomic relationships)
+     - "dependency": Dependency relationships
+     - "mixed": Combination of relationship types
+     """
+
+     def __init__(
+         self,
+         graph_type: str = "causal",
+         seed: int = 42,
+         enable_graph_first: bool = True,
+         enable_compression: bool = True,
+         enable_language_compilation: bool = True,
+         enable_error_correction: bool = True,
+         enable_conversation: bool = True,
+         enable_reasoning_tracking: bool = True,
+         enable_few_shot_learning: bool = True,
+         enable_task_decomposition: bool = True,
+         enable_explanations: bool = True,
+         enable_verification: bool = True,
+         enable_consistency: bool = True
+     ):
+         """
+         Initialize hybrid agent with graph-first architecture.
+
+         Args:
+             graph_type: Type of graph (causal, knowledge, dependency, etc.)
+             seed: Random seed for reproducibility
+             enable_graph_first: Enable graph-first reasoning (answers from graph only)
+             enable_compression: Enable graph compression and abstraction
+             enable_language_compilation: Enable language compilation layers
+             enable_error_correction: Enable non-destructive text correction
+             enable_conversation: Enable conversation history and context tracking
+             enable_reasoning_tracking: Enable step-by-step reasoning chain tracking
+             enable_few_shot_learning: Enable example storage, pattern learning, and adaptive extraction
+             enable_task_decomposition: Enable task analysis, planning, and subtask execution
+             enable_explanations: Enable explanation building and the transparency layer
+             enable_verification: Enable consistency checking, error detection, and self-correction
+             enable_consistency: Enable the consistency engine
+         """
+         # Initialize core components
+         self.graph_manager = GraphManager(graph_type=graph_type)
+         self.prediction_framework = PredictionFramework(
+             graph_manager=self.graph_manager
+         )
+
+         # Initialize graph-first components
+         self.graph_first_reasoner = GraphFirstReasoner(graph_manager=self.graph_manager) if enable_graph_first else None
+         self.graph_compressor = GraphCompressor(self.graph_manager) if enable_compression else None
+
+         # Initialize language compilation components
+         if enable_language_compilation:
+             # Enable dictionary integration by default (no API key required)
+             self.lexical_compiler = LexicalCompiler(enable_dictionary=True, cache_enabled=True)
+             self.grammatical_compiler = GrammaticalCompiler()
+             self.pragmatic_compiler = PragmaticCompiler()
+         else:
+             self.lexical_compiler = None
+             self.grammatical_compiler = None
+             self.pragmatic_compiler = None
+
+         # Initialize error correction (with dictionary integration)
+         if enable_error_correction:
+             # Pass lexical compiler to text corrector for dictionary-enhanced correction
+             lexical_for_corrector = self.lexical_compiler if enable_language_compilation else None
+             self.text_corrector = TextCorrector(lexical_compiler=lexical_for_corrector)
+         else:
+             self.text_corrector = None
+
+         # Initialize few-shot learning components (needed before symbolic reasoner)
+         if enable_few_shot_learning:
+             self.example_store = ExampleStore()
+             self.pattern_learner = PatternLearner(self.example_store)
+             self.adaptive_extractor = AdaptiveExtractor(self.pattern_learner, self.example_store)
+         else:
+             self.example_store = None
+             self.pattern_learner = None
+             self.adaptive_extractor = None
+
+         # Initialize reasoning components
+         # Pass lexical compiler to symbolic reasoner for dictionary-enhanced validation
+         lexical_for_reasoner = self.lexical_compiler if enable_language_compilation else None
+         adaptive_extractor_for_reasoner = self.adaptive_extractor if enable_few_shot_learning else None
+         self.symbolic_reasoner = SymbolicReasoner(
+             self.graph_manager,
+             lexical_compiler=lexical_for_reasoner,
+             adaptive_extractor=adaptive_extractor_for_reasoner
+         )
+         self.statistical_engine = StatisticalEngine(
+             graph_manager=self.graph_manager,
+             prediction_framework=self.prediction_framework,
+             seed=seed
+         )
+         self.nlg = RuleBasedNLG()
+
+         # Initialize LLM-enhanced components
+         if enable_conversation:
+             self.conversation_history = ConversationHistory(decay_lambda=0.1)
+             self.context_tracker = ContextTracker(self.conversation_history)
+         else:
+             self.conversation_history = None
+             self.context_tracker = None
+
+         if enable_reasoning_tracking:
+             self.reasoning_tracker = ReasoningTracker()
+         else:
+             self.reasoning_tracker = None
+
+         # Note: the few-shot learning components (example_store, pattern_learner,
+         # adaptive_extractor) were already initialized above, before the symbolic
+         # reasoner; re-initializing them here would discard any stored examples.
+
+         if enable_task_decomposition:
+             self.task_analyzer = TaskAnalyzer()
+             self.subtask_executor = SubTaskExecutor()
+             self.plan_generator = PlanGenerator(self.task_analyzer)
+         else:
+             self.task_analyzer = None
+             self.subtask_executor = None
+             self.plan_generator = None
+
+         if enable_explanations:
+             self.explanation_builder = ExplanationBuilder()
+             self.transparency_layer = TransparencyLayer()
+         else:
+             self.explanation_builder = None
+             self.transparency_layer = None
+
+         if enable_verification:
+             self.consistency_checker = ConsistencyChecker()
+             self.error_detector = ErrorDetector()
+             self.self_corrector = SelfCorrector()
+         else:
+             self.consistency_checker = None
+             self.error_detector = None
+             self.self_corrector = None
+
+         if enable_consistency:
+             self.consistency_engine = ConsistencyEngine(seed=seed)
+         else:
+             self.consistency_engine = None
+
+         self.orchestrator = HybridOrchestrator(
+             symbolic_reasoner=self.symbolic_reasoner,
+             statistical_engine=self.statistical_engine,
+             nlg=self.nlg,
+             graph_first_reasoner=self.graph_first_reasoner,
+             text_corrector=self.text_corrector,
+             lexical_compiler=self.lexical_compiler,
+             grammatical_compiler=self.grammatical_compiler,
+             pragmatic_compiler=self.pragmatic_compiler,
+             reasoning_tracker=self.reasoning_tracker,
+             explanation_builder=self.explanation_builder,
+             transparency_layer=self.transparency_layer,
+             consistency_checker=self.consistency_checker,
+             error_detector=self.error_detector,
+             self_corrector=self.self_corrector,
+             consistency_engine=self.consistency_engine
+         )
+
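A minimal construction sketch. The import path is an assumption inferred from the file location (`architecture/hybrid/hybrid_agent.py`) listed in this diff and may differ from the package's public API:

    from architecture.hybrid.hybrid_agent import HybridAgent

    # All enable_* flags default to True; switching one off sets the
    # corresponding component attributes to None (see the branches above).
    agent = HybridAgent(graph_type="causal", seed=42, enable_task_decomposition=False)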
+     def run(
+         self,
+         task: str,
+         data: Optional[Any] = None,
+         response_style: str = 'conversational',
+         context: Optional[ConversationContext] = None,
+         show_reasoning: bool = False
+     ) -> str:
+         """
+         Run hybrid agent on a task with conversation context support.
+
+         Args:
+             task: Task description
+             data: Optional data for statistical inference
+             response_style: Response style ('conversational', 'brief', 'full')
+             context: Optional conversation context
+             show_reasoning: Whether to show chain-of-thought reasoning
+
+         Returns:
+             Natural language response
+         """
+         try:
+             # Validate input before it is recorded or planned against
+             if not task or not isinstance(task, str):
+                 return "I need a valid task description to analyze. Please provide a question or statement about causal relationships."
+
+             if len(task.strip()) == 0:
+                 return "Please provide a non-empty task description."
+
+             # Handle conversation context
+             if self.conversation_history and context is None:
+                 # Use existing context if available
+                 context = self.conversation_history.context
+
+             # Add user message to conversation
+             if self.conversation_history:
+                 self.conversation_history.add_message(
+                     role=MessageRole.USER,
+                     content=task
+                 )
+
+             # Resolve references in task using context
+             if self.context_tracker and context:
+                 resolved_task = self._resolve_task_references(task, context)
+             else:
+                 resolved_task = task
+
+             # Task decomposition (if enabled)
+             if self.task_analyzer and self.plan_generator:
+                 plan = self.plan_generator.generate_plan(resolved_task)
+                 if plan['estimated_steps'] > 1:
+                     # Complex task - use decomposition
+                     logger.info(f"Decomposing task into {plan['estimated_steps']} subtasks")
+                     # For now, proceed with the original task
+                     # Future: execute subtasks in parallel
+
+             # Execute hybrid reasoning with graph-first architecture
+             result = self.orchestrator.reason_hybrid(task=resolved_task, data=data, context=context)
+
+             # Validate result
+             if not result:
+                 return "I couldn't process your request. Please try rephrasing with clearer causal relationships."
+
+             # Get pragmatic information for response generation
+             pragmatic_info = result.get('pragmatic', {
+                 'register': 'neutral',
+                 'hedging': 'likely',
+                 'explicitness': 3
+             })
+
+             # Generate natural language response from graph state
+             reasoning_chain = None
+             if self.reasoning_tracker and self.reasoning_tracker.current_chain:
+                 reasoning_chain = self.reasoning_tracker.current_chain
+
+             if show_reasoning and reasoning_chain:
+                 # Include chain-of-thought reasoning
+                 if self.explanation_builder:
+                     explanation = self.explanation_builder.generate_explanation(reasoning_chain, result.get('analysis', {}))
+                     result['explanation'] = explanation
+
+                 response = self.nlg.generate_response(
+                     result,
+                     response_type=response_style,
+                     pragmatic_info=pragmatic_info,
+                     show_reasoning=True,
+                     reasoning_chain=reasoning_chain
+                 )
+             else:
+                 response = self.nlg.generate_response(
+                     result,
+                     response_type=response_style,
+                     pragmatic_info=pragmatic_info,
+                     show_reasoning=False,
+                     reasoning_chain=reasoning_chain
+                 )
+
+             # Add agent response to conversation
+             if self.conversation_history:
+                 self.conversation_history.add_message(
+                     role=MessageRole.AGENT,
+                     content=response,
+                     metadata={'result': result}
+                 )
+
+             return response
+
+         except Exception as e:
+             logger.error(f"Error in hybrid agent run: {e}", exc_info=True)
+             return f"I encountered an error processing your request: {str(e)}. Please try rephrasing."
+
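A hedged usage sketch for `run()`; the sample task is illustrative, and the flag values come only from the signature above:

    response = agent.run(
        task="If interest rates rise, what happens to housing demand?",
        response_style='brief',     # 'conversational' | 'brief' | 'full'
        show_reasoning=True,        # include the chain-of-thought trace
    )
    print(response)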
+     def _resolve_task_references(
+         self,
+         task: str,
+         context: ConversationContext
+     ) -> str:
+         """
+         Resolve references in task using conversation context.
+
+         Args:
+             task: Original task
+             context: Conversation context
+
+         Returns:
+             Task with resolved references
+         """
+         if not self.context_tracker:
+             return task
+
+         import re  # local import; module-level imports are outside this diff hunk
+
+         # Resolve common references. Match case-insensitively and on word
+         # boundaries so that, e.g., "it" does not match the "it" inside "with".
+         resolved = task
+         references = ['it', 'that', 'this', 'the price', 'the variable']
+
+         for ref in references:
+             pattern = re.compile(rf"\b{re.escape(ref)}\b", re.IGNORECASE)
+             if pattern.search(resolved):
+                 resolved_var = self.context_tracker.resolve_reference(ref, context.current_turn)
+                 if resolved_var:
+                     resolved = pattern.sub(resolved_var, resolved)
+
+         return resolved
+
+     def update_context(
+         self,
+         context: Optional[ConversationContext],
+         user_message: str,
+         agent_response: str
+     ) -> Optional[ConversationContext]:
+         """
+         Update conversation context after interaction.
+
+         Args:
+             context: Current context
+             user_message: User message (currently unused)
+             agent_response: Agent response (currently unused)
+
+         Returns:
+             Updated context, or the input context unchanged if conversation
+             history is disabled
+         """
+         if self.conversation_history:
+             return self.conversation_history.context
+         return context
+
+     def learn_from_examples(
+         self,
+         examples: List[Tuple[str, Dict[str, Any]]]
+     ) -> None:
+         """
+         Learn from examples for few-shot learning.
+
+         Args:
+             examples: List of (input_text, output_structure) tuples
+         """
+         if not self.example_store or not self.pattern_learner:
+             logger.warning("Few-shot learning not enabled")
+             return
+
+         # Add examples to store
+         for input_text, output in examples:
+             self.example_store.add_example(input_text, output)
+
+         # Learn patterns
+         self.pattern_learner.learn_from_examples(examples)
+         logger.info(f"Learned from {len(examples)} examples")
+
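A sketch of the expected input shape for `learn_from_examples()`. The keys of the output structure are an assumption that mirrors the 'variables'/'edges' keys used elsewhere in this class; the actual schema is defined by ExampleStore and PatternLearner:

    examples = [
        (
            "Higher interest rates reduce housing demand",
            {"variables": ["interest_rates", "housing_demand"],      # assumed schema
             "edges": [("interest_rates", "housing_demand")]},
        ),
    ]
    agent.learn_from_examples(examples)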
+     def _generate_brief_response(
+         self,
+         result: Dict[str, Any],
+         pragmatic_info: Dict[str, Any]
+     ) -> str:
+         """Generate brief summary response."""
+         hedging = pragmatic_info.get('hedging', 'likely')
+
+         # Try graph-first answer first
+         graph_answer = result.get('graph_first_answer', {})
+         if graph_answer and graph_answer.get('answer'):
+             return f"{hedging.capitalize()}, {graph_answer['answer']}"
+
+         # Fall back to an analysis summary
+         analysis = result.get('analysis', {})
+         variables = analysis.get('variables', [])
+         relationships = analysis.get('relationships', [])
+
+         if variables and relationships:
+             return (
+                 f"I've identified {len(variables)} variables with {len(relationships)} causal relationships. "
+                 f"{hedging.capitalize()}, the strongest relationship is between "
+                 f"'{relationships[0].get('source', '')}' and '{relationships[0].get('target', '')}'."
+             )
+
+         return "I've analyzed your request, but couldn't extract clear causal relationships. Please provide more specific information about the variables and their relationships."
+
+     def _generate_fallback_response(self, result: Dict[str, Any]) -> str:
+         """Generate fallback response when main generation fails."""
+         task = result.get('task', 'your request')
+         return f"I've processed {task}, but couldn't generate a detailed response. The analysis may need more information or clearer causal relationships."
+
+     def query_graph(self, question: str) -> Dict[str, Any]:
+         """
+         Query graph state directly (graph-first reasoning).
+
+         Args:
+             question: Question to answer from graph state
+
+         Returns:
+             Dictionary with answer derived from graph state
+         """
+         if self.graph_first_reasoner is None:
+             raise ValueError("Graph-first reasoning is not enabled")
+
+         return self.graph_first_reasoner.query_graph_state(question, self.graph_manager)
+
+     def reason_from_graph_state(self, state: Dict[str, Any], query: str) -> Dict[str, Any]:
+         """
+         Pure graph reasoning from explicit graph state.
+
+         Args:
+             state: Graph state dictionary
+             query: Query string
+
+         Returns:
+             Dictionary with reasoning results
+         """
+         if self.graph_first_reasoner is None:
+             raise ValueError("Graph-first reasoning is not enabled")
+
+         return self.graph_first_reasoner.reason_from_graph_state(state, query, self.graph_manager)
+
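A sketch of calling `reason_from_graph_state()` with an explicit state. The 'nodes'/'edges'/'edge_data' keys mirror the graph_state dictionary built in the orchestrator above; the variable names and the query are illustrative:

    state = {
        'nodes': ['rainfall', 'crop_yield'],
        'edges': [('rainfall', 'crop_yield')],
        'edge_data': {('rainfall', 'crop_yield'): {}},
    }
    answer = agent.reason_from_graph_state(state, "What affects crop_yield?")
    print(answer.get('answer'))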
+     def extract_causal_variables(self, task: str) -> Dict[str, Any]:
+         """
+         Extract causal variables from a task.
+
+         Args:
+             task: Natural language task description
+
+         Returns:
+             Dictionary with extracted variables and relationships
+         """
+         extraction = self.symbolic_reasoner.extract_variables_from_task(task)
+
+         # Add edges to graph
+         for source, target in extraction.get('edges', []):
+             self.graph_manager.add_relationship(
+                 source=source,
+                 target=target,
+                 strength=1.0,
+                 confidence=0.8
+             )
+
+         return extraction
+
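A sketch of `extract_causal_variables()`; the sample task is illustrative, and the 'variables'/'edges' keys are assumed from the keys this class itself reads from extraction dictionaries:

    extraction = agent.extract_causal_variables(
        "Smoking causes lung damage, and lung damage reduces fitness"
    )
    print(extraction.get('variables'), extraction.get('edges'))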
+     def generate_causal_analysis(
+         self,
+         variables: Dict[str, Any],
+         data: Optional[Any] = None
+     ) -> Dict[str, Any]:
+         """
+         Generate causal analysis from variables.
+
+         Args:
+             variables: Dictionary with variables and relationships
+             data: Optional pandas DataFrame for statistical inference
+
+         Returns:
+             Dictionary with causal analysis results
+         """
+         # Build graph from variables
+         var_list = variables.get('variables', [])
+         edges = variables.get('edges', [])
+
+         for source, target in edges:
+             self.graph_manager.add_relationship(
+                 source=source,
+                 target=target,
+                 strength=1.0
+             )
+
+         # Fit statistical model if data available
+         if data is not None and PANDAS_AVAILABLE:
+             try:
+                 self.statistical_engine.fit_from_dataframe(
+                     df=data,
+                     variables=var_list
+                 )
+             except Exception as e:
+                 logger.warning(f"Statistical fitting failed: {e}")
+
+         # Build analysis
+         relationships = []
+         for source, target in self.graph_manager.get_edges():
+             strength = self.statistical_engine.assess_causal_strength(source, target)
+             relationships.append({
+                 'source': source,
+                 'target': target,
+                 'strength': strength
+             })
+
+         return {
+             'variables': var_list,
+             'relationships': relationships,
+             'graph_structure': f"{len(var_list)} variables, {len(relationships)} relationships"
+         }
+
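An end-to-end sketch combining extraction with `generate_causal_analysis()`. Pandas is used because the docstring above allows a DataFrame; the column names and values are illustrative and would in practice have to match the extracted variable names:

    import pandas as pd

    variables = agent.extract_causal_variables("Rainfall increases crop yield")
    df = pd.DataFrame({'rainfall': [1.0, 2.0, 3.0],
                       'crop_yield': [2.1, 3.9, 6.2]})  # illustrative data
    analysis = agent.generate_causal_analysis(variables, data=df)
    print(analysis['graph_structure'])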
+     def generate_counterfactuals(
+         self,
+         state: Dict[str, float],
+         target_vars: List[str]
+     ) -> List[Dict[str, Any]]:
+         """Generate counterfactual scenarios.
+
+         Args:
+             state: Factual state dictionary
+             target_vars: List of variables to intervene on
+
+         Returns:
+             List of counterfactual scenario dictionaries
+         """
+         return self.statistical_engine.generate_probabilistic_counterfactuals(
+             factual_state=state,
+             target_variables=target_vars,
+             n_scenarios=5
+         )
+
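A sketch of the counterfactual API; the state values and variable names are illustrative, and the scenario schema is whatever `StatisticalEngine.generate_probabilistic_counterfactuals()` returns:

    scenarios = agent.generate_counterfactuals(
        state={'rainfall': 2.0, 'crop_yield': 3.9},
        target_vars=['rainfall'],
    )
    for scenario in scenarios:
        print(scenario)  # one dictionary per counterfactual scenario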