crca 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- CRCA.py +172 -7
- MODEL_CARD.md +53 -0
- PKG-INFO +8 -2
- RELEASE_NOTES.md +17 -0
- STABILITY.md +19 -0
- architecture/hybrid/consistency_engine.py +362 -0
- architecture/hybrid/conversation_manager.py +421 -0
- architecture/hybrid/explanation_generator.py +452 -0
- architecture/hybrid/few_shot_learner.py +533 -0
- architecture/hybrid/graph_compressor.py +286 -0
- architecture/hybrid/hybrid_agent.py +4398 -0
- architecture/hybrid/language_compiler.py +623 -0
- architecture/hybrid/main,py +0 -0
- architecture/hybrid/reasoning_tracker.py +322 -0
- architecture/hybrid/self_verifier.py +524 -0
- architecture/hybrid/task_decomposer.py +567 -0
- architecture/hybrid/text_corrector.py +341 -0
- benchmark_results/crca_core_benchmarks.json +178 -0
- branches/crca_sd/crca_sd_realtime.py +6 -2
- branches/general_agent/__init__.py +102 -0
- branches/general_agent/general_agent.py +1400 -0
- branches/general_agent/personality.py +169 -0
- branches/general_agent/utils/__init__.py +19 -0
- branches/general_agent/utils/prompt_builder.py +170 -0
- {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/METADATA +8 -2
- {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/RECORD +303 -20
- crca_core/__init__.py +35 -0
- crca_core/benchmarks/__init__.py +14 -0
- crca_core/benchmarks/synthetic_scm.py +103 -0
- crca_core/core/__init__.py +23 -0
- crca_core/core/api.py +120 -0
- crca_core/core/estimate.py +208 -0
- crca_core/core/godclass.py +72 -0
- crca_core/core/intervention_design.py +174 -0
- crca_core/core/lifecycle.py +48 -0
- crca_core/discovery/__init__.py +9 -0
- crca_core/discovery/tabular.py +193 -0
- crca_core/identify/__init__.py +171 -0
- crca_core/identify/backdoor.py +39 -0
- crca_core/identify/frontdoor.py +48 -0
- crca_core/identify/graph.py +106 -0
- crca_core/identify/id_algorithm.py +43 -0
- crca_core/identify/iv.py +48 -0
- crca_core/models/__init__.py +67 -0
- crca_core/models/provenance.py +56 -0
- crca_core/models/refusal.py +39 -0
- crca_core/models/result.py +83 -0
- crca_core/models/spec.py +151 -0
- crca_core/models/validation.py +68 -0
- crca_core/scm/__init__.py +9 -0
- crca_core/scm/linear_gaussian.py +198 -0
- crca_core/timeseries/__init__.py +6 -0
- crca_core/timeseries/pcmci.py +181 -0
- crca_llm/__init__.py +12 -0
- crca_llm/client.py +85 -0
- crca_llm/coauthor.py +118 -0
- crca_llm/orchestrator.py +289 -0
- crca_llm/types.py +21 -0
- crca_reasoning/__init__.py +16 -0
- crca_reasoning/critique.py +54 -0
- crca_reasoning/godclass.py +206 -0
- crca_reasoning/memory.py +24 -0
- crca_reasoning/rationale.py +10 -0
- crca_reasoning/react_controller.py +81 -0
- crca_reasoning/tool_router.py +97 -0
- crca_reasoning/types.py +40 -0
- crca_sd/__init__.py +15 -0
- crca_sd/crca_sd_core.py +2 -0
- crca_sd/crca_sd_governance.py +2 -0
- crca_sd/crca_sd_mpc.py +2 -0
- crca_sd/crca_sd_realtime.py +2 -0
- crca_sd/crca_sd_tui.py +2 -0
- cuda-keyring_1.1-1_all.deb +0 -0
- cuda-keyring_1.1-1_all.deb.1 +0 -0
- docs/IMAGE_ANNOTATION_USAGE.md +539 -0
- docs/INSTALL_DEEPSPEED.md +125 -0
- docs/api/branches/crca-cg.md +19 -0
- docs/api/branches/crca-q.md +27 -0
- docs/api/branches/crca-sd.md +37 -0
- docs/api/branches/general-agent.md +24 -0
- docs/api/branches/overview.md +19 -0
- docs/api/crca/agent-methods.md +62 -0
- docs/api/crca/operations.md +79 -0
- docs/api/crca/overview.md +32 -0
- docs/api/image-annotation/engine.md +52 -0
- docs/api/image-annotation/overview.md +17 -0
- docs/api/schemas/annotation.md +34 -0
- docs/api/schemas/core-schemas.md +82 -0
- docs/api/schemas/overview.md +32 -0
- docs/api/schemas/policy.md +30 -0
- docs/api/utils/conversation.md +22 -0
- docs/api/utils/graph-reasoner.md +32 -0
- docs/api/utils/overview.md +21 -0
- docs/api/utils/router.md +19 -0
- docs/api/utils/utilities.md +97 -0
- docs/architecture/causal-graphs.md +41 -0
- docs/architecture/data-flow.md +29 -0
- docs/architecture/design-principles.md +33 -0
- docs/architecture/hybrid-agent/components.md +38 -0
- docs/architecture/hybrid-agent/consistency.md +26 -0
- docs/architecture/hybrid-agent/overview.md +44 -0
- docs/architecture/hybrid-agent/reasoning.md +22 -0
- docs/architecture/llm-integration.md +26 -0
- docs/architecture/modular-structure.md +37 -0
- docs/architecture/overview.md +69 -0
- docs/architecture/policy-engine-arch.md +29 -0
- docs/branches/crca-cg/corposwarm.md +39 -0
- docs/branches/crca-cg/esg-scoring.md +30 -0
- docs/branches/crca-cg/multi-agent.md +35 -0
- docs/branches/crca-cg/overview.md +40 -0
- docs/branches/crca-q/alternative-data.md +55 -0
- docs/branches/crca-q/architecture.md +71 -0
- docs/branches/crca-q/backtesting.md +45 -0
- docs/branches/crca-q/causal-engine.md +33 -0
- docs/branches/crca-q/execution.md +39 -0
- docs/branches/crca-q/market-data.md +60 -0
- docs/branches/crca-q/overview.md +58 -0
- docs/branches/crca-q/philosophy.md +60 -0
- docs/branches/crca-q/portfolio-optimization.md +66 -0
- docs/branches/crca-q/risk-management.md +102 -0
- docs/branches/crca-q/setup.md +65 -0
- docs/branches/crca-q/signal-generation.md +61 -0
- docs/branches/crca-q/signal-validation.md +43 -0
- docs/branches/crca-sd/core.md +84 -0
- docs/branches/crca-sd/governance.md +53 -0
- docs/branches/crca-sd/mpc-solver.md +65 -0
- docs/branches/crca-sd/overview.md +59 -0
- docs/branches/crca-sd/realtime.md +28 -0
- docs/branches/crca-sd/tui.md +20 -0
- docs/branches/general-agent/overview.md +37 -0
- docs/branches/general-agent/personality.md +36 -0
- docs/branches/general-agent/prompt-builder.md +30 -0
- docs/changelog/index.md +79 -0
- docs/contributing/code-style.md +69 -0
- docs/contributing/documentation.md +43 -0
- docs/contributing/overview.md +29 -0
- docs/contributing/testing.md +29 -0
- docs/core/crcagent/async-operations.md +65 -0
- docs/core/crcagent/automatic-extraction.md +107 -0
- docs/core/crcagent/batch-prediction.md +80 -0
- docs/core/crcagent/bayesian-inference.md +60 -0
- docs/core/crcagent/causal-graph.md +92 -0
- docs/core/crcagent/counterfactuals.md +96 -0
- docs/core/crcagent/deterministic-simulation.md +78 -0
- docs/core/crcagent/dual-mode-operation.md +82 -0
- docs/core/crcagent/initialization.md +88 -0
- docs/core/crcagent/optimization.md +65 -0
- docs/core/crcagent/overview.md +63 -0
- docs/core/crcagent/time-series.md +57 -0
- docs/core/schemas/annotation.md +30 -0
- docs/core/schemas/core-schemas.md +82 -0
- docs/core/schemas/overview.md +30 -0
- docs/core/schemas/policy.md +41 -0
- docs/core/templates/base-agent.md +31 -0
- docs/core/templates/feature-mixins.md +31 -0
- docs/core/templates/overview.md +29 -0
- docs/core/templates/templates-guide.md +75 -0
- docs/core/tools/mcp-client.md +34 -0
- docs/core/tools/overview.md +24 -0
- docs/core/utils/conversation.md +27 -0
- docs/core/utils/graph-reasoner.md +29 -0
- docs/core/utils/overview.md +27 -0
- docs/core/utils/router.md +27 -0
- docs/core/utils/utilities.md +97 -0
- docs/css/custom.css +84 -0
- docs/examples/basic-usage.md +57 -0
- docs/examples/general-agent/general-agent-examples.md +50 -0
- docs/examples/hybrid-agent/hybrid-agent-examples.md +56 -0
- docs/examples/image-annotation/image-annotation-examples.md +54 -0
- docs/examples/integration/integration-examples.md +58 -0
- docs/examples/overview.md +37 -0
- docs/examples/trading/trading-examples.md +46 -0
- docs/features/causal-reasoning/advanced-topics.md +101 -0
- docs/features/causal-reasoning/counterfactuals.md +43 -0
- docs/features/causal-reasoning/do-calculus.md +50 -0
- docs/features/causal-reasoning/overview.md +47 -0
- docs/features/causal-reasoning/structural-models.md +52 -0
- docs/features/hybrid-agent/advanced-components.md +55 -0
- docs/features/hybrid-agent/core-components.md +64 -0
- docs/features/hybrid-agent/overview.md +34 -0
- docs/features/image-annotation/engine.md +82 -0
- docs/features/image-annotation/features.md +113 -0
- docs/features/image-annotation/integration.md +75 -0
- docs/features/image-annotation/overview.md +53 -0
- docs/features/image-annotation/quickstart.md +73 -0
- docs/features/policy-engine/doctrine-ledger.md +105 -0
- docs/features/policy-engine/monitoring.md +44 -0
- docs/features/policy-engine/mpc-control.md +89 -0
- docs/features/policy-engine/overview.md +46 -0
- docs/getting-started/configuration.md +225 -0
- docs/getting-started/first-agent.md +164 -0
- docs/getting-started/installation.md +144 -0
- docs/getting-started/quickstart.md +137 -0
- docs/index.md +118 -0
- docs/js/mathjax.js +13 -0
- docs/lrm/discovery_proof_notes.md +25 -0
- docs/lrm/finetune_full.md +83 -0
- docs/lrm/math_appendix.md +120 -0
- docs/lrm/overview.md +32 -0
- docs/mkdocs.yml +238 -0
- docs/stylesheets/extra.css +21 -0
- docs_generated/crca_core/CounterfactualResult.md +12 -0
- docs_generated/crca_core/DiscoveryHypothesisResult.md +13 -0
- docs_generated/crca_core/DraftSpec.md +13 -0
- docs_generated/crca_core/EstimateResult.md +13 -0
- docs_generated/crca_core/IdentificationResult.md +17 -0
- docs_generated/crca_core/InterventionDesignResult.md +12 -0
- docs_generated/crca_core/LockedSpec.md +15 -0
- docs_generated/crca_core/RefusalResult.md +12 -0
- docs_generated/crca_core/ValidationReport.md +9 -0
- docs_generated/crca_core/index.md +13 -0
- examples/general_agent_example.py +277 -0
- examples/general_agent_quickstart.py +202 -0
- examples/general_agent_simple.py +92 -0
- examples/hybrid_agent_auto_extraction.py +84 -0
- examples/hybrid_agent_dictionary_demo.py +104 -0
- examples/hybrid_agent_enhanced.py +179 -0
- examples/hybrid_agent_general_knowledge.py +107 -0
- examples/image_annotation_quickstart.py +328 -0
- examples/test_hybrid_fixes.py +77 -0
- image_annotation/__init__.py +27 -0
- image_annotation/annotation_engine.py +2593 -0
- install_cuda_wsl2.sh +59 -0
- install_deepspeed.sh +56 -0
- install_deepspeed_simple.sh +87 -0
- mkdocs.yml +252 -0
- ollama/Modelfile +8 -0
- prompts/__init__.py +2 -1
- prompts/default_crca.py +9 -1
- prompts/general_agent.py +227 -0
- prompts/image_annotation.py +56 -0
- pyproject.toml +17 -2
- requirements-docs.txt +10 -0
- requirements.txt +21 -2
- schemas/__init__.py +26 -1
- schemas/annotation.py +222 -0
- schemas/conversation.py +193 -0
- schemas/hybrid.py +211 -0
- schemas/reasoning.py +276 -0
- schemas_export/crca_core/CounterfactualResult.schema.json +108 -0
- schemas_export/crca_core/DiscoveryHypothesisResult.schema.json +113 -0
- schemas_export/crca_core/DraftSpec.schema.json +635 -0
- schemas_export/crca_core/EstimateResult.schema.json +113 -0
- schemas_export/crca_core/IdentificationResult.schema.json +145 -0
- schemas_export/crca_core/InterventionDesignResult.schema.json +111 -0
- schemas_export/crca_core/LockedSpec.schema.json +646 -0
- schemas_export/crca_core/RefusalResult.schema.json +90 -0
- schemas_export/crca_core/ValidationReport.schema.json +62 -0
- scripts/build_lrm_dataset.py +80 -0
- scripts/export_crca_core_schemas.py +54 -0
- scripts/export_hf_lrm.py +37 -0
- scripts/export_ollama_gguf.py +45 -0
- scripts/generate_changelog.py +157 -0
- scripts/generate_crca_core_docs_from_schemas.py +86 -0
- scripts/run_crca_core_benchmarks.py +163 -0
- scripts/run_full_finetune.py +198 -0
- scripts/run_lrm_eval.py +31 -0
- templates/graph_management.py +29 -0
- tests/conftest.py +9 -0
- tests/test_core.py +2 -3
- tests/test_crca_core_discovery_tabular.py +15 -0
- tests/test_crca_core_estimate_dowhy.py +36 -0
- tests/test_crca_core_identify.py +18 -0
- tests/test_crca_core_intervention_design.py +36 -0
- tests/test_crca_core_linear_gaussian_scm.py +69 -0
- tests/test_crca_core_spec.py +25 -0
- tests/test_crca_core_timeseries_pcmci.py +15 -0
- tests/test_crca_llm_coauthor.py +12 -0
- tests/test_crca_llm_orchestrator.py +80 -0
- tests/test_hybrid_agent_llm_enhanced.py +556 -0
- tests/test_image_annotation_demo.py +376 -0
- tests/test_image_annotation_operational.py +408 -0
- tests/test_image_annotation_unit.py +551 -0
- tests/test_training_moe.py +13 -0
- training/__init__.py +42 -0
- training/datasets.py +140 -0
- training/deepspeed_zero2_0_5b.json +22 -0
- training/deepspeed_zero2_1_5b.json +22 -0
- training/deepspeed_zero3_0_5b.json +28 -0
- training/deepspeed_zero3_14b.json +28 -0
- training/deepspeed_zero3_h100_3gpu.json +20 -0
- training/deepspeed_zero3_offload.json +28 -0
- training/eval.py +92 -0
- training/finetune.py +516 -0
- training/public_datasets.py +89 -0
- training_data/react_train.jsonl +7473 -0
- utils/agent_discovery.py +311 -0
- utils/batch_processor.py +317 -0
- utils/conversation.py +78 -0
- utils/edit_distance.py +118 -0
- utils/formatter.py +33 -0
- utils/graph_reasoner.py +530 -0
- utils/rate_limiter.py +283 -0
- utils/router.py +2 -2
- utils/tool_discovery.py +307 -0
- webui/__init__.py +10 -0
- webui/app.py +229 -0
- webui/config.py +104 -0
- webui/static/css/style.css +332 -0
- webui/static/js/main.js +284 -0
- webui/templates/index.html +42 -0
- tests/test_crca_excel.py +0 -166
- tests/test_data_broker.py +0 -424
- tests/test_palantir.py +0 -349
- {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/WHEEL +0 -0
- {crca-1.4.0.dist-info → crca-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,4398 @@
+"""
+Enhanced Symbolic-Statistical Hybrid AI - General Purpose LLM Replacement System.
+
+A production-ready, stable, pure symbolic-statistical reasoning agent
+that can replace LLMs entirely without actually using one.
+
+Supports both:
+- Causal Reasoning (CRCA): Causal analysis, counterfactuals, interventions
+- General Knowledge: Facts, definitions, taxonomic relationships, spatial/temporal knowledge
+
+Key Features:
+- Graph-first reasoning: All answers come from graph state, never text parsing
+- Enhanced NLU: Comprehensive pattern matching for causal AND general knowledge
+- Enhanced NLG: Natural, conversational responses with pragmatic tone adjustment
+- Non-destructive text correction: Handles spelling, abbreviations, grammar
+- Language compilation: Three-layer system (lexical, grammatical, pragmatic)
+- Multi-domain support: Causal, taxonomic, spatial, temporal, functional relationships
+- Graph compression: Composite nodes, latent factors, abstraction
+- Provenance tracking: Every edge tracks its source and confidence decay
+- Robust error handling: Graceful degradation, validation, fallback responses
+
+Relationship Types Supported:
+- Causal: affects, causes, influences, depends on, leads to
+- Taxonomic: is-a, type-of, belongs-to, classified-as
+- Meronymic: part-of, consists-of, contains, has
+- Spatial: located-in, found-in
+- Temporal: before, after, precedes, follows
+- Functional: used-for, functions-as
+- Definitional: is, means, refers-to, defined-as
+- Factual: was, became, changed-to
+
+CRITICAL: Epistemic Validation (for causal tasks)
+For causal reasoning tasks, this agent requires explicit causal structure, not intent statements. It will:
+- REJECT action verbs (identify, analyze, examine) as causal variables
+- REJECT epistemic terms (policy, task, goal, decision) as causal variables
+- WARN when structure is inferred from syntax alone
+- ERROR when task is epistemically underspecified (no valid state variables)
+
+For general knowledge tasks, the agent is more flexible and can extract:
+- Facts and definitions
+- Taxonomic relationships
+- Spatial and temporal information
+- Properties and attributes
+
+This system is designed to be stable, reliable, and capable of replacing LLMs
+for both causal reasoning AND general knowledge tasks while maintaining natural language interaction.
+"""
+
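
The epistemic-validation contract in the module docstring above reduces to a membership test against the action-verb and epistemic-term sets defined later in this file. A minimal sketch of that rule, with hypothetical names standing in for the module's internals:

```python
# Illustrative sketch of the epistemic-validation rule; the set contents and
# function name are hypothetical stand-ins, not the module's actual API.
ACTION_VERBS = {"identify", "analyze", "examine"}
EPISTEMIC_TERMS = {"policy", "task", "goal", "decision"}

def is_valid_causal_variable(candidate: str) -> bool:
    # A causal variable must name a state, not an action or an intent.
    token = candidate.strip().lower()
    return token not in ACTION_VERBS and token not in EPISTEMIC_TERMS

assert not is_valid_causal_variable("identify")   # action verb -> rejected
assert not is_valid_causal_variable("policy")     # epistemic term -> rejected
assert is_valid_causal_variable("interest rate")  # state variable -> accepted
```
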
+import re
+import json
+from typing import Dict, List, Optional, Tuple, Any, Set
+import logging
+from collections import defaultdict, deque
+
+import numpy as np
+
+# Optional dependencies
+try:
+    import pandas as pd
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
+
+# Import CRCA templates
+from templates.graph_management import GraphManager
+from templates.statistical_methods import StatisticalMethods
+from templates.prediction_framework import PredictionFramework, CounterfactualScenario
+
+# Import new graph-first components
+from utils.graph_reasoner import GraphFirstReasoner
+from architecture.hybrid.graph_compressor import GraphCompressor
+from architecture.hybrid.language_compiler import LexicalCompiler, GrammaticalCompiler, PragmaticCompiler
+from architecture.hybrid.text_corrector import TextCorrector
+from schemas.hybrid import EdgeProvenance, TemporalEdge, TemporalType
+
+# Import new LLM-enhanced components
+from architecture.hybrid.conversation_manager import ConversationHistory, ContextTracker
+from architecture.hybrid.reasoning_tracker import ReasoningTracker
+from architecture.hybrid.few_shot_learner import ExampleStore, PatternLearner, AdaptiveExtractor
+from architecture.hybrid.task_decomposer import TaskAnalyzer, SubTaskExecutor, PlanGenerator
+from architecture.hybrid.explanation_generator import ExplanationBuilder, TransparencyLayer
+from architecture.hybrid.self_verifier import ConsistencyChecker, ErrorDetector, SelfCorrector
+from architecture.hybrid.consistency_engine import ConsistencyEngine
+from schemas.conversation import ConversationContext, MessageRole
+from schemas.reasoning import ReasoningChain, StepType, InferenceRule, Evidence
+
+logger = logging.getLogger(__name__)
+
+
+class SymbolicReasoner:
+    """
+    Advanced symbolic reasoning engine for natural language understanding and causal extraction.
+
+    Uses sophisticated pattern matching, semantic analysis, and context-aware parsing
+    to extract causal variables and relationships from natural language tasks.
+
+    Enhanced Features:
+    - Action Verb Understanding: Extracts state variables from action verbs (e.g., "identify X" -> extracts "X")
+    - Epistemic Term Understanding: Extracts state variables from epistemic terms (e.g., "policy of X" -> extracts "X")
+    - Vague Language Handling: Understands vague language patterns like "what affects X", "factors influencing Y"
+    - Semantic Role Analysis: Understands that action verbs and epistemic terms are signals, not variables themselves
+    - Context-Aware Extraction: Uses context to infer relationships even from vague descriptions
+    """
+
+    def __init__(
+        self,
+        graph_manager: GraphManager,
+        lexical_compiler: Optional[Any] = None,
+        adaptive_extractor: Optional[Any] = None
+    ):
+        """
+        Initialize symbolic reasoner.
+
+        Args:
+            graph_manager: GraphManager instance for graph operations
+            lexical_compiler: Optional LexicalCompiler instance for dictionary validation
+            adaptive_extractor: Optional AdaptiveExtractor for few-shot learning
+        """
+        self.graph_manager = graph_manager
+        self.lexical_compiler = lexical_compiler
+        self.adaptive_extractor = adaptive_extractor
+
+        # Comprehensive pattern definitions for extracting causal relationships
+        # Updated patterns to handle numerical values, conditionals, and state descriptions
+        self.patterns = [
+            # Direct causal verbs (with optional numerical values)
+            (r'(\w+(?:\s+\w+)?)\s+(?:depends?\s+on|depends?\s+upon)\s+(\w+(?:\s+\w+)?)', 'depends_on', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+causes?\s+(\w+(?:\s+\w+)?)', 'causes', 0.95),
+            (r'(\w+(?:\s+\w+)?)\s+affects?\s+(\w+(?:\s+\w+)?)', 'affects', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+influences?\s+(\w+(?:\s+\w+)?)', 'influences', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+leads?\s+to\s+(\w+(?:\s+\w+)?)', 'leads_to', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+results?\s+in\s+(\w+(?:\s+\w+)?)', 'results_in', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+impacts?\s+(\w+(?:\s+\w+)?)', 'impacts', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+drives?\s+(\w+(?:\s+\w+)?)', 'drives', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+determines?\s+(\w+(?:\s+\w+)?)', 'determines', 0.95),
+            (r'(\w+(?:\s+\w+)?)\s+controls?\s+(\w+(?:\s+\w+)?)', 'controls', 0.9),
+
+            # Passive voice patterns
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:affected|influenced|determined|controlled|driven)\s+by\s+(\w+(?:\s+\w+)?)', 'affected_by', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+caused\s+by\s+(\w+(?:\s+\w+)?)', 'caused_by', 0.95),
+            (r'(\w+(?:\s+\w+)?)\s+results?\s+from\s+(\w+(?:\s+\w+)?)', 'results_from', 0.9),
+
+            # State description patterns (X is Y, X = Y, X: Y)
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:\d+[.,]?\d*|[\d%]+|[a-z]+)', 'state_description', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s*[=:]\s*(?:\d+[.,]?\d*|[\d%]+)', 'state_equals', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s+of\s+(\d+[.,]?\d*%?|\w+)', 'state_of', 0.6),
+
+            # Conditional patterns (enhanced)
+            (r'if\s+(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+(?:\d+[.,]?\d*|[\d%]+|\w+)\s*,?\s*(?:then\s+)?(?:what\s+)?(?:is|are|will|would)\s+(\w+(?:\s+\w+)?)', 'conditional_question', 0.85),
+            (r'if\s+(\w+(?:\s+\w+)?)\s+then\s+(\w+(?:\s+\w+)?)', 'conditional', 0.85),
+            (r'when\s+(\w+(?:\s+\w+)?)\s+,\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+when\s+(\w+(?:\s+\w+)?)', 'temporal_reverse', 0.8),
+
+            # Question patterns (what is X, what will X be, etc.)
+            (r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)', 'question_target', 0.8),
+            (r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)\s+(?:of|in|for|after|in\s+\d+\s+days)', 'question_target_time', 0.85),
+
+            # Arrow notation
+            (r'(\w+(?:\s+\w+)?)\s*[-->->]\s*(\w+(?:\s+\w+)?)', 'arrow', 0.95),
+            (r'(\w+(?:\s+\w+)?)\s*=>\s*(\w+(?:\s+\w+)?)', 'arrow', 0.95),
+
+            # Comparative patterns
+            (r'(\w+(?:\s+\w+)?)\s+increases?\s+(\w+(?:\s+\w+)?)', 'increases', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+decreases?\s+(\w+(?:\s+\w+)?)', 'decreases', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+raises?\s+(\w+(?:\s+\w+)?)', 'increases', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+lowers?\s+(\w+(?:\s+\w+)?)', 'decreases', 0.85),
+
+            # Correlation patterns (weaker causality)
+            (r'(\w+(?:\s+\w+)?)\s+is\s+correlated\s+with\s+(\w+(?:\s+\w+)?)', 'correlated', 0.6),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+related\s+to\s+(\w+(?:\s+\w+)?)', 'related', 0.5),
+
+            # Implicit relationships (X and Y, X with Y)
+            (r'(\w+(?:\s+\w+)?)\s+and\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|determine|control)', 'implicit_and', 0.6),
+
+            # NEW: Vague language patterns - "what affects X", "factors influencing X"
+            (r'what\s+(?:affects|influences|causes|impacts|changes)\s+(\w+(?:\s+\w+)?)', 'vague_causal', 0.6),
+            (r'factors?\s+(?:affecting|influencing|causing|impacting)\s+(\w+(?:\s+\w+)?)', 'vague_causal', 0.6),
+            (r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause|impact)', 'vague_causal', 0.6),
+            # NEW: Relationship patterns - "relationship between X and Y"
+            (r'relationship\s+(?:between|among)\s+(\w+(?:\s+\w+)?)\s+(?:and|&)\s+(\w+(?:\s+\w+)?)', 'relationship', 0.7),
+            (r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+relate\s+to\s+(\w+(?:\s+\w+)?)', 'relationship', 0.7),
+            (r'effect\s+of\s+(\w+(?:\s+\w+)?)\s+on\s+(\w+(?:\s+\w+)?)', 'causal', 0.8),
+
+            # Enhanced patterns for better coverage
+            # Temporal patterns
+            (r'(\w+(?:\s+\w+)?)\s+before\s+(\w+(?:\s+\w+)?)', 'before', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+after\s+(\w+(?:\s+\w+)?)', 'after', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+leads?\s+to\s+(\w+(?:\s+\w+)?)\s+in\s+(\d+)\s+(?:days?|hours?|weeks?|months?)', 'delayed', 0.9),
+
+            # Comparative and quantitative
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:higher|greater|larger|bigger)\s+than\s+(\w+(?:\s+\w+)?)', 'greater_than', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:lower|smaller|less)\s+than\s+(\w+(?:\s+\w+)?)', 'less_than', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s+varies?\s+with\s+(\w+(?:\s+\w+)?)', 'varies_with', 0.75),
+
+            # Question patterns (enhanced)
+            (r'what\s+(?:happens?|occurs?|results?)\s+(?:if|when)\s+(\w+(?:\s+\w+)?)', 'what_if', 0.9),
+            (r'how\s+(?:does|do|will|would)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|impact)\s+(\w+(?:\s+\w+)?)', 'how_affects', 0.9),
+            (r'why\s+(?:does|do|is|are)\s+(\w+(?:\s+\w+)?)', 'why_question', 0.8),
+
+            # Multi-variable patterns
+            (r'(\w+(?:\s+\w+)?)\s+(?:together\s+with|along\s+with|combined\s+with)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause)', 'combined_effect', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+(?:and|or)\s+(\w+(?:\s+\w+)?)\s+(?:both|together)\s+(?:affect|influence|determine)', 'joint_effect', 0.75),
+
+            # ====================================================================
+            # GENERAL KNOWLEDGE PATTERNS (Non-Causal Relationships)
+            # ====================================================================
+
+            # Taxonomic/Classification patterns (is-a, type-of)
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+type\s+of\s+(\w+(?:\s+\w+)?)', 'is_a', 0.95),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+kind\s+of\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+belongs?\s+to\s+(\w+(?:\s+\w+)?)', 'belongs_to', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+classified\s+as\s+(\w+(?:\s+\w+)?)', 'is_a', 0.9),
+
+            # Property/Has patterns
+            (r'(\w+(?:\s+\w+)?)\s+has\s+(\w+(?:\s+\w+)?)', 'has_property', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+has\s+(?:a|an)\s+(\w+(?:\s+\w+)?)', 'has_property', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+possesses?\s+(\w+(?:\s+\w+)?)', 'has_property', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+contains?\s+(\w+(?:\s+\w+)?)', 'contains', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+includes?\s+(\w+(?:\s+\w+)?)', 'includes', 0.8),
+
+            # Part-Whole patterns (meronymy)
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:a|an)\s+part\s+of\s+(\w+(?:\s+\w+)?)', 'part_of', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+part\s+of\s+(\w+(?:\s+\w+)?)', 'part_of', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+belongs?\s+to\s+(\w+(?:\s+\w+)?)', 'part_of', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+consists?\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+composed\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+made\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+made\s+up\s+of\s+(\w+(?:\s+\w+)?)', 'consists_of', 0.85),
+
+            # Location/Spatial patterns
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+located\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+found\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+resides?\s+(?:in|at|on)\s+(\w+(?:\s+\w+)?)', 'located_in', 0.8),
+
+            # Definition patterns (X is Y, X means Y, X refers to Y)
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:defined\s+as|means?|refers?\s+to)\s+(\w+(?:\s+\w+)?)', 'defined_as', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(\w+(?:\s+\w+)?)', 'is', 0.7),  # General "is" (weaker)
+            (r'(\w+(?:\s+\w+)?)\s+means?\s+(\w+(?:\s+\w+)?)', 'means', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+refers?\s+to\s+(\w+(?:\s+\w+)?)', 'refers_to', 0.85),
+
+            # Similarity/Equivalence patterns
+            (r'(\w+(?:\s+\w+)?)\s+is\s+(?:similar\s+to|like|equivalent\s+to)\s+(\w+(?:\s+\w+)?)', 'similar_to', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+the\s+same\s+as\s+(\w+(?:\s+\w+)?)', 'equivalent_to', 0.9),
+            (r'(\w+(?:\s+\w+)?)\s+equals?\s+(\w+(?:\s+\w+)?)', 'equivalent_to', 0.85),
+
+            # Temporal patterns (general knowledge)
+            (r'(\w+(?:\s+\w+)?)\s+occurs?\s+(?:before|after|during)\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+happens?\s+(?:before|after|during)\s+(\w+(?:\s+\w+)?)', 'temporal', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+precedes?\s+(\w+(?:\s+\w+)?)', 'precedes', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+follows?\s+(\w+(?:\s+\w+)?)', 'follows', 0.85),
+
+            # Purpose/Function patterns
+            (r'(\w+(?:\s+\w+)?)\s+is\s+used\s+(?:for|to)\s+(\w+(?:\s+\w+)?)', 'used_for', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+serves?\s+to\s+(\w+(?:\s+\w+)?)', 'used_for', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+functions?\s+as\s+(\w+(?:\s+\w+)?)', 'functions_as', 0.85),
+            (r'(\w+(?:\s+\w+)?)\s+is\s+for\s+(\w+(?:\s+\w+)?)', 'used_for', 0.8),
+
+            # General knowledge question patterns
+            (r'what\s+is\s+(?:a|an|the)?\s+(\w+(?:\s+\w+)?)', 'what_is', 0.9),
+            (r'what\s+are?\s+(?:a|an|the)?\s+(\w+(?:\s+\w+)?)', 'what_is', 0.9),
+            (r'who\s+is\s+(?:a|an|the)?\s+(\w+(?:\s+\w+)?)', 'who_is', 0.9),
+            (r'where\s+is\s+(?:a|an|the)?\s+(\w+(?:\s+\w+)?)', 'where_is', 0.9),
+            (r'when\s+(?:is|was|does|did)\s+(?:a|an|the)?\s+(\w+(?:\s+\w+)?)', 'when_is', 0.85),
+            (r'how\s+(?:does|do|is|are)\s+(\w+(?:\s+\w+)?)\s+work', 'how_works', 0.85),
+            (r'what\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+mean', 'what_means', 0.9),
+
+            # Factual statement patterns
+            (r'(\w+(?:\s+\w+)?)\s+was\s+(\w+(?:\s+\w+)?)', 'factual', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s+were\s+(\w+(?:\s+\w+)?)', 'factual', 0.7),
+            (r'(\w+(?:\s+\w+)?)\s+became\s+(\w+(?:\s+\w+)?)', 'became', 0.8),
+            (r'(\w+(?:\s+\w+)?)\s+changed\s+to\s+(\w+(?:\s+\w+)?)', 'changed_to', 0.8),
+        ]
+
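
Each entry in `self.patterns` pairs a regex with a relation label and a prior confidence; extraction is a straight `re.finditer` pass over the lowercased text. A self-contained replay of how the `causes` entry fires:

```python
import re

# One tuple from self.patterns, applied by hand.
pattern, rel_type, confidence = (
    r'(\w+(?:\s+\w+)?)\s+causes?\s+(\w+(?:\s+\w+)?)',
    'causes',
    0.95,
)
text = "higher prices cause lower demand"
for m in re.finditer(pattern, text.lower(), re.IGNORECASE):
    print(f"{m.group(1)} --[{rel_type}, {confidence}]--> {m.group(2)}")
    # higher prices --[causes, 0.95]--> lower demand
```
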
+        # Extended keywords for identifying variables (domain-agnostic)
+        self.variable_keywords = [
+            # General terms
+            'variable', 'factor', 'metric', 'indicator', 'measure', 'parameter',
+            'dimension', 'attribute', 'feature', 'component', 'element',
+            'concept', 'entity', 'object', 'item', 'thing', 'subject', 'topic',
+
+            # Business/Economics
+            'price', 'demand', 'supply', 'sales', 'revenue', 'cost', 'profit',
+            'margin', 'growth', 'market', 'customer', 'product', 'service',
+
+            # Quality/Performance
+            'satisfaction', 'quality', 'performance', 'efficiency', 'effectiveness',
+            'productivity', 'output', 'throughput', 'latency', 'speed',
+
+            # Social/Psychological
+            'happiness', 'wellbeing', 'stress', 'motivation', 'engagement',
+            'retention', 'turnover', 'loyalty', 'trust',
+
+            # General Knowledge entities
+            'person', 'place', 'location', 'country', 'city', 'organization',
+            'company', 'institution', 'event', 'date', 'time', 'period',
+            'category', 'type', 'class', 'group', 'species', 'genre',
+
+            # Technical
+            'temperature', 'pressure', 'voltage', 'current', 'frequency',
+            'bandwidth', 'capacity', 'utilization', 'availability',
+        ]
+
+        # Stop words to filter out
+        self.stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+            'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this',
+            'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'where',
+            'when', 'why', 'how', 'all', 'each', 'every', 'some', 'any', 'no',
+            'not', 'only', 'just', 'also', 'too', 'very', 'more', 'most', 'less',
+            'least', 'many', 'much', 'few', 'little', 'other', 'another', 'same',
+            'different', 'such', 'own', 'so', 'than', 'then', 'there', 'here',
+        }
+
+        # Causal verb synonyms for better matching
+        self.causal_verbs = {
+            'cause', 'causes', 'caused', 'affect', 'affects', 'affected',
+            'influence', 'influences', 'influenced', 'impact', 'impacts', 'impacted',
+            'determine', 'determines', 'determined', 'control', 'controls', 'controlled',
+            'drive', 'drives', 'driven', 'lead', 'leads', 'led', 'result', 'results',
+            'increase', 'increases', 'increased', 'decrease', 'decreases', 'decreased',
+            'raise', 'raises', 'raised', 'lower', 'lowers', 'lowered',
+        }
+
+        # Negation words
+        self.negation_words = {'not', 'no', 'never', 'none', 'nothing', 'nobody', 'nowhere', 'neither', 'nor'}
+
+        # Quantifier words
+        self.quantifier_words = {'all', 'some', 'many', 'most', 'few', 'several', 'each', 'every', 'any'}
+
+        # Action verbs that should NEVER be treated as causal variables
+        # These are epistemic/intentional actions, not state variables
+        self.action_verbs = {
+            'identify', 'analyze', 'examine', 'study', 'investigate', 'explore',
+            'determine', 'find', 'discover', 'detect', 'recognize', 'understand',
+            'explain', 'describe', 'define', 'specify', 'clarify', 'elucidate',
+            'predict', 'forecast', 'estimate', 'calculate', 'compute', 'measure',
+            'evaluate', 'assess', 'judge', 'compare', 'contrast', 'differentiate',
+            'recommend', 'suggest', 'propose', 'advise', 'counsel', 'guide',
+            'implement', 'execute', 'perform', 'conduct', 'carry', 'out',
+            'create', 'generate', 'produce', 'make', 'build', 'construct',
+            'modify', 'change', 'alter', 'adjust', 'update', 'revise',
+            'remove', 'delete', 'eliminate', 'exclude', 'omit', 'skip',
+            'add', 'include', 'insert', 'append', 'attach', 'incorporate',
+            'process', 'handle', 'manage', 'control', 'operate', 'run',
+            'check', 'verify', 'validate', 'confirm', 'test', 'trial',
+            'show', 'display', 'present', 'demonstrate', 'illustrate', 'reveal',
+            'report', 'document', 'record', 'log', 'track', 'monitor',
+            'request', 'ask', 'query', 'question', 'inquire', 'interrogate',
+            'provide', 'supply', 'deliver', 'offer', 'give', 'send',
+            'receive', 'obtain', 'acquire', 'get', 'fetch', 'retrieve',
+            'use', 'utilize', 'employ', 'apply', 'leverage', 'exploit',
+            'consider', 'think', 'contemplate', 'reflect', 'ponder', 'muse',
+            'decide', 'choose', 'select', 'pick', 'opt', 'prefer',
+            'plan', 'design', 'scheme', 'devise', 'formulate', 'develop',
+            'solve', 'resolve', 'fix', 'repair', 'correct', 'rectify',
+            'learn', 'teach', 'train', 'educate', 'instruct', 'coach',
+            'help', 'assist', 'aid', 'support', 'facilitate', 'enable'
+        }
+
+        # Epistemic/intentional terms that indicate tasks, not causal variables
+        self.epistemic_terms = {
+            'task', 'goal', 'objective', 'aim', 'purpose', 'intent', 'intention',
+            'requirement', 'specification', 'criteria', 'standard', 'benchmark',
+            'policy', 'strategy', 'approach', 'method', 'technique', 'procedure',
+            'process', 'workflow', 'pipeline', 'system', 'framework', 'model',
+            'analysis', 'study', 'research', 'investigation', 'examination',
+            'result', 'outcome', 'consequence', 'effect', 'impact', 'influence',
+            'finding', 'discovery', 'insight', 'observation', 'conclusion',
+            'recommendation', 'suggestion', 'advice', 'guidance', 'direction',
+            'decision', 'choice', 'selection', 'option', 'alternative',
+            'problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'barrier',
+            'solution', 'answer', 'resolution', 'fix', 'remedy', 'cure',
+            'question', 'query', 'inquiry', 'request', 'demand', 'need'
+        }
+
+    def _normalize_variable_name(self, var: str) -> str:
+        """
+        Normalize variable name by cleaning and standardizing.
+
+        Args:
+            var: Raw variable name
+
+        Returns:
+            Normalized variable name
+        """
+        if not var:
+            return ''
+
+        # Remove extra whitespace
+        var = ' '.join(var.split())
+
+        # Remove common articles and prepositions at start
+        words = var.split()
+        while words and words[0].lower() in {'the', 'a', 'an', 'of', 'for', 'in', 'on', 'at', 'to', 'from'}:
+            words = words[1:]
+
+        var = ' '.join(words)
+
+        # Convert to lowercase for consistency
+        return var.lower().strip()
+
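
Replaying `_normalize_variable_name` on a sample phrase makes the three steps concrete (whitespace collapse, leading-article strip, lowercase):

```python
var = "  The   Market Demand "
var = ' '.join(var.split())              # collapse whitespace -> "The Market Demand"
words = var.split()
while words and words[0].lower() in {'the', 'a', 'an', 'of', 'for',
                                     'in', 'on', 'at', 'to', 'from'}:
    words = words[1:]                    # strip leading article -> ['Market', 'Demand']
print(' '.join(words).lower().strip())   # market demand
```
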
+    def _extract_noun_phrases(self, text: str) -> List[str]:
+        """
+        Extract noun phrases from text using pattern matching.
+
+        Args:
+            text: Input text
+
+        Returns:
+            List of noun phrases
+        """
+        noun_phrases = []
+
+        # Pattern: adjective* noun+
+        pattern = r'\b(?:[a-z]+(?:\s+[a-z]+)*\s+)?(?:[a-z]+(?:ing|ed|tion|sion|ment|ness|ity|ance|ence)?)\b'
+        matches = re.finditer(pattern, text.lower())
+
+        for match in matches:
+            phrase = match.group(0).strip()
+            # Filter out stop words and very short phrases
+            words = phrase.split()
+            if len(words) >= 1 and not all(w in self.stop_words for w in words):
+                # Remove stop words from beginning/end
+                while words and words[0] in self.stop_words:
+                    words = words[1:]
+                while words and words[-1] in self.stop_words:
+                    words = words[:-1]
+                if words:
+                    noun_phrases.append(' '.join(words))
+
+        return list(set(noun_phrases))
+
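
Note that the noun-phrase regex above is permissive: its optional leading group greedily absorbs a whole lowercase run, so a short clause usually comes back as a single span that the stop-word trimming then prunes at the edges. A quick replay:

```python
import re

pattern = r'\b(?:[a-z]+(?:\s+[a-z]+)*\s+)?(?:[a-z]+(?:ing|ed|tion|sion|ment|ness|ity|ance|ence)?)\b'
text = "the price affects demand"
print([m.group(0).strip() for m in re.finditer(pattern, text.lower())])
# ['the price affects demand'] -- one greedy span; the method's stop-word
# trimming then drops the leading "the".
```
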
+    def _detect_negation(self, text: str, start_pos: int, end_pos: int) -> bool:
+        """
+        Detect if a phrase is negated.
+
+        Args:
+            text: Full text
+            start_pos: Start position of phrase
+            end_pos: End position of phrase
+
+        Returns:
+            True if negated
+        """
+        # Check before the phrase
+        before = text[max(0, start_pos-20):start_pos].lower()
+        for neg_word in self.negation_words:
+            if neg_word in before:
+                return True
+        return False
+
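
`_detect_negation` only inspects the 20 characters immediately before the matched phrase, using substring containment, so any negation word inside that window flips the flag. Replayed by hand:

```python
text = "price does not affect demand"
start_pos = text.index("affect")                   # start of the matched phrase
before = text[max(0, start_pos - 20):start_pos].lower()
print(repr(before))                                # 'price does not '
print(any(neg in before for neg in {"not", "no", "never"}))  # True -> negated
# Substring containment is coarse: a window containing "know" would also
# match "no".
```
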
+    def _extract_state_variables_from_action_verbs(self, text: str) -> Set[str]:
+        """
+        Extract state variables from action verbs by finding what they refer to.
+
+        Example: "identify past policy" -> extract "policy" (but filter if epistemic)
+        Example: "analyze price trends" -> extract "price", "trends"
+        Example: "determine demand level" -> extract "demand", "level"
+
+        Args:
+            text: Input text
+
+        Returns:
+            Set of extracted state variable names
+        """
+        extracted_vars = set()
+        text_lower = text.lower()
+
+        # Pattern: action_verb + (optional adverb) + noun_phrase
+        # Match: "identify X", "analyze the X", "determine X", etc.
+        for action_verb in self.action_verbs:
+            # Pattern 1: "action_verb [the/a/an] noun_phrase"
+            pattern1 = rf'\b{action_verb}\s+(?:the|a|an)?\s*(\w+(?:\s+\w+)?)'
+            matches = re.finditer(pattern1, text_lower, re.IGNORECASE)
+            for match in matches:
+                noun_phrase = match.group(1).strip()
+                # Clean and validate
+                cleaned = self._normalize_variable_name(noun_phrase)
+                if cleaned and not self._is_action_verb(cleaned):
+                    # Check if it's an epistemic term - if so, try to extract what it refers to
+                    if self._is_epistemic_term(cleaned):
+                        # Try to find what the epistemic term refers to
+                        # E.g., "past policy" -> look for what policy refers to
+                        # This is harder, so we'll skip for now and let other methods handle it
+                        continue
+                    # Only add if it's not an epistemic term itself
+                    if not self._is_epistemic_term(cleaned):
+                        extracted_vars.add(cleaned)
+
+            # Pattern 2: "action_verb [what/which/how] noun_phrase"
+            pattern2 = rf'\b{action_verb}\s+(?:what|which|how)\s+(\w+(?:\s+\w+)?)'
+            matches = re.finditer(pattern2, text_lower, re.IGNORECASE)
+            for match in matches:
+                noun_phrase = match.group(1).strip()
+                cleaned = self._normalize_variable_name(noun_phrase)
+                if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                    extracted_vars.add(cleaned)
+
+        return extracted_vars
+
+    def _extract_state_variables_from_epistemic_terms(self, text: str) -> Set[str]:
+        """
+        Extract state variables from epistemic terms by finding what they refer to.
+
+        Example: "past policy" -> if we can find what policy refers to, extract that
+        Example: "task goal" -> extract the underlying state variable the goal refers to
+        Example: "policy decision" -> extract what the decision affects
+
+        Args:
+            text: Input text
+
+        Returns:
+            Set of extracted state variable names
+        """
+        extracted_vars = set()
+        text_lower = text.lower()
+
+        # Pattern: epistemic_term + "of" + noun_phrase
+        # E.g., "policy of X", "goal of Y"
+        for epistemic_term in self.epistemic_terms:
+            pattern1 = rf'\b{epistemic_term}\s+of\s+(\w+(?:\s+\w+)?)'
+            matches = re.finditer(pattern1, text_lower, re.IGNORECASE)
+            for match in matches:
+                noun_phrase = match.group(1).strip()
+                cleaned = self._normalize_variable_name(noun_phrase)
+                if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                    extracted_vars.add(cleaned)
+
+        # Pattern: adjective + epistemic_term -> extract what it modifies
+        # E.g., "past policy" -> look for what policy affects
+        # This is harder - we'll use context clues
+        epistemic_patterns = [
+            r'past\s+(\w+)',  # "past X" -> X might be a state variable if not epistemic
+            r'(\w+)\s+policy',  # "X policy" -> X might be what policy affects
+            r'(\w+)\s+decision',  # "X decision" -> X might be what decision affects
+        ]
+
+        for pattern in epistemic_patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                noun_phrase = match.group(1).strip()
+                cleaned = self._normalize_variable_name(noun_phrase)
+                # Only add if it's not an action verb or epistemic term itself
+                if (cleaned and
+                        not self._is_action_verb(cleaned) and
+                        not self._is_epistemic_term(cleaned) and
+                        cleaned not in self.stop_words):
+                    extracted_vars.add(cleaned)
+
+        return extracted_vars
+
+    def _extract_variables_from_vague_language(self, text: str) -> Set[str]:
+        """
+        Extract state variables from vague language using semantic understanding.
+
+        Handles patterns like:
+        - "what affects X" -> extract X and what affects it
+        - "how does X relate to Y" -> extract X, Y
+        - "the relationship between X and Y" -> extract X, Y
+        - "factors influencing X" -> extract X and factors
+
+        Args:
+            text: Input text
+
+        Returns:
+            Set of extracted state variable names
+        """
+        extracted_vars = set()
+        text_lower = text.lower()
+
+        # Pattern: "what affects/influences/causes X"
+        affect_patterns = [
+            r'what\s+(?:affects|influences|causes|impacts|changes)\s+(\w+(?:\s+\w+)?)',
+            r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+(?:affect|influence|cause|impact)',
+            r'factors?\s+(?:affecting|influencing|causing|impacting)\s+(\w+(?:\s+\w+)?)',
+        ]
+
+        for pattern in affect_patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                noun_phrase = match.group(1).strip()
+                cleaned = self._normalize_variable_name(noun_phrase)
+                if cleaned and not self._is_action_verb(cleaned) and not self._is_epistemic_term(cleaned):
+                    extracted_vars.add(cleaned)
+
+        # Pattern: "relationship between X and Y"
+        relationship_pattern = r'relationship\s+(?:between|among)\s+(\w+(?:\s+\w+)?)\s+(?:and|&)\s+(\w+(?:\s+\w+)?)'
+        matches = re.finditer(relationship_pattern, text_lower, re.IGNORECASE)
+        for match in matches:
+            var1 = self._normalize_variable_name(match.group(1).strip())
+            var2 = self._normalize_variable_name(match.group(2).strip())
+            if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                extracted_vars.add(var1)
+            if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                extracted_vars.add(var2)
+
+        # Pattern: "how does X relate to Y"
+        relate_pattern = r'how\s+(?:does|do)\s+(\w+(?:\s+\w+)?)\s+relate\s+to\s+(\w+(?:\s+\w+)?)'
+        matches = re.finditer(relate_pattern, text_lower, re.IGNORECASE)
+        for match in matches:
+            var1 = self._normalize_variable_name(match.group(1).strip())
+            var2 = self._normalize_variable_name(match.group(2).strip())
+            if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                extracted_vars.add(var1)
+            if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                extracted_vars.add(var2)
+
+        # Pattern: "the effect of X on Y"
+        effect_pattern = r'effect\s+of\s+(\w+(?:\s+\w+)?)\s+on\s+(\w+(?:\s+\w+)?)'
+        matches = re.finditer(effect_pattern, text_lower, re.IGNORECASE)
+        for match in matches:
+            var1 = self._normalize_variable_name(match.group(1).strip())
+            var2 = self._normalize_variable_name(match.group(2).strip())
+            if var1 and not self._is_action_verb(var1) and not self._is_epistemic_term(var1):
+                extracted_vars.add(var1)
+            if var2 and not self._is_action_verb(var2) and not self._is_epistemic_term(var2):
+                extracted_vars.add(var2)
+
+        return extracted_vars
+
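
These vague-language extractors all share one shape: run a regex, normalize each captured group, and keep it only if it survives the action-verb and epistemic-term filters. The "relationship between X and Y" pattern, replayed standalone:

```python
import re

pattern = r'relationship\s+(?:between|among)\s+(\w+(?:\s+\w+)?)\s+(?:and|&)\s+(\w+(?:\s+\w+)?)'
m = re.search(pattern, "what is the relationship between interest rates and inflation")
print(m.group(1), "|", m.group(2))   # interest rates | inflation
```
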
+    def _extract_with_context(self, text: str) -> List[Dict[str, Any]]:
+        """
+        Extract variables and relationships with context awareness.
+        Enhanced to handle numerical values, conditionals, questions, action verbs, and epistemic terms.
+
+        Args:
+            text: Input text
+
+        Returns:
+            List of extracted relationships with context
+        """
+        relationships = []
+        text_lower = text.lower()
+
+        # Extract using all patterns
+        for pattern, rel_type, confidence in self.patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                # Handle patterns with 1 or 2 groups
+                if match.lastindex >= 2:
+                    source_raw = match.group(1).strip()
+                    target_raw = match.group(2).strip()
+                elif match.lastindex == 1:
+                    # Single group patterns (like question_target)
+                    source_raw = match.group(1).strip()
+                    target_raw = None
+                else:
+                    continue
+
+                # Normalize variable names (remove numerical values and percentages)
+                source = self._normalize_variable_name(source_raw)
+                if target_raw:
+                    target = self._normalize_variable_name(target_raw)
+                else:
+                    target = None
+
+                # Skip if too short or stop words
+                if not source or len(source.split()) == 0:
+                    continue
+                if source in self.stop_words:
+                    continue
+
+                # For single-group patterns (questions), extract target from context
+                if not target and rel_type in ['question_target', 'question_target_time', 'state_description', 'state_equals']:
+                    # Try to find what the question is about
+                    # Look for "what is X" -> X is the target variable
+                    if 'what' in text_lower or 'which' in text_lower:
+                        # Extract all variables mentioned before the question
+                        # This is a heuristic - the question target is usually mentioned earlier
+                        pass  # Will be handled by standalone variable extraction
+
+                # For state descriptions, infer relationships
+                if rel_type in ['state_description', 'state_equals', 'state_of'] and target:
+                    # State descriptions like "X is Y" don't create causal edges directly
+                    # But we can infer that variables mentioned together might be related
+                    continue
+
+                # Skip if target is invalid
+                if target and (len(target.split()) == 0 or target in self.stop_words):
+                    continue
+
+                # Check for negation
+                start_pos = match.start()
+                end_pos = match.end()
+                is_negated = self._detect_negation(text, start_pos, end_pos)
+
+                # Adjust confidence for negation
+                if is_negated:
+                    confidence *= 0.3  # Much lower confidence for negated relationships
+
+                # Only add if we have both source and target (or it's a question pattern)
+                if target or rel_type in ['question_target', 'question_target_time', 'what_is', 'who_is', 'where_is', 'when_is', 'how_works', 'what_means']:
+                    # Determine relationship category
+                    relationship_category = 'causal'  # default
+                    if rel_type in ['is_a', 'belongs_to', 'is', 'defined_as', 'means', 'refers_to', 'equivalent_to', 'similar_to']:
+                        relationship_category = 'taxonomic'
+                    elif rel_type in ['has_property', 'contains', 'includes', 'part_of', 'consists_of']:
+                        relationship_category = 'meronymic'
+                    elif rel_type in ['located_in', 'found_in']:
+                        relationship_category = 'spatial'
+                    elif rel_type in ['used_for', 'functions_as']:
+                        relationship_category = 'functional'
+                    elif rel_type in ['temporal', 'precedes', 'follows', 'before', 'after', 'delayed']:
+                        relationship_category = 'temporal'
+                    elif rel_type in ['what_is', 'who_is', 'where_is', 'when_is', 'how_works', 'what_means']:
+                        relationship_category = 'definitional'
+                    elif rel_type in ['factual', 'became', 'changed_to']:
+                        relationship_category = 'factual'
+                    elif rel_type in ['causes', 'affects', 'influences', 'depends_on', 'leads_to', 'results_in', 'impacts', 'drives', 'determines', 'controls', 'caused_by', 'affected_by', 'results_from', 'increases', 'decreases']:
+                        relationship_category = 'causal'
+
+                    relationships.append({
+                        'source': source,
+                        'target': target or source,  # For questions, use source as both
+                        'type': rel_type,
+                        'category': relationship_category,
+                        'confidence': confidence,
+                        'negated': is_negated,
+                        'raw_source': source_raw,
+                        'raw_target': target_raw or source_raw
+                    })
+
+        # Post-process: For conditional questions, infer relationships between mentioned variables
+        if 'if' in text_lower and 'what' in text_lower:
+            # Extract all variables mentioned
+            all_vars = self._extract_standalone_variables(text)
+            var_list = sorted(list(all_vars))
+
+            # If we have multiple variables, infer they might be related
+            if len(var_list) >= 2:
+                # Common pattern: "If X is Y, what is Z?" -> X might affect Z
+                for i in range(len(var_list) - 1):
+                    relationships.append({
+                        'source': var_list[i],
+                        'target': var_list[-1],  # Last variable is usually the question target
+                        'type': 'inferred_from_question',
+                        'confidence': 0.5,
+                        'negated': False,
+                        'raw_source': var_list[i],
+                        'raw_target': var_list[-1]
+                    })
+
+        # NEW: Extract state variables from action verbs and epistemic terms
+        # This helps handle vague language like "identify past policy" or "analyze the system"
+        action_verb_vars = self._extract_state_variables_from_action_verbs(text)
+        epistemic_vars = self._extract_state_variables_from_epistemic_terms(text)
+        vague_language_vars = self._extract_variables_from_vague_language(text)
+
+        # Add relationships for extracted variables (if we can infer them)
+        # For action verbs: if we have "identify X" and "determine Y", infer X might affect Y
+        all_extracted = action_verb_vars | epistemic_vars | vague_language_vars
+        if len(all_extracted) >= 2:
+            # Create inferred relationships between extracted variables
+            extracted_list = sorted(list(all_extracted))
+            for i in range(len(extracted_list) - 1):
+                # Only add if not already in relationships
+                already_exists = any(
+                    r['source'] == extracted_list[i] and r['target'] == extracted_list[i+1]
+                    for r in relationships
+                )
+                if not already_exists:
+                    relationships.append({
+                        'source': extracted_list[i],
+                        'target': extracted_list[i+1],
+                        'type': 'inferred_from_action_verb',
+                        'confidence': 0.4,  # Lower confidence for inferred relationships
+                        'negated': False,
+                        'raw_source': extracted_list[i],
+                        'raw_target': extracted_list[i+1]
+                    })
+
+        return relationships
+
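
Every hit from `_extract_with_context` becomes a plain dict. For the sentence "price affects demand", the `affects` pattern (confidence 0.9, category `causal`) yields a record of this shape:

```python
record = {
    'source': 'price',
    'target': 'demand',
    'type': 'affects',
    'category': 'causal',
    'confidence': 0.9,
    'negated': False,
    'raw_source': 'price',
    'raw_target': 'demand',
}
```
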
+    def _extract_variables_with_values(self, text: str) -> Dict[str, Any]:
+        """
+        Extract variables that have numerical values attached.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Dictionary mapping variables to their values
+        """
+        variables_with_values = {}
+        text_lower = text.lower()
+
+        # Pattern: "variable is value" or "variable = value" or "variable: value"
+        patterns = [
+            r'(\w+(?:\s+\w+)?)\s+is\s+(\d+[.,]?\d*%?|\d+[.,]?\d*\s*[a-z]+)',
+            r'(\w+(?:\s+\w+)?)\s*[=:]\s*(\d+[.,]?\d*%?|\d+[.,]?\d*\s*[a-z]+)',
+            r'(\w+(?:\s+\w+)?)\s+of\s+(\d+[.,]?\d*%?)',
+        ]
+
+        for pattern in patterns:
+            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
+            for match in matches:
+                var = self._normalize_variable_name(match.group(1))
+                value = match.group(2).strip()
+                if var and var not in self.stop_words:
+                    variables_with_values[var] = value
+
+        return variables_with_values
+
808
|
+
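    # Usage sketch (assuming _normalize_variable_name leaves these tokens
    # unchanged and default stop words; the exact keys may differ at runtime):
    #
    #   >>> self._extract_variables_with_values("inflation is 3.5%. rate = 2")
    #   {'inflation': '3.5%', 'rate': '2'}
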
    def _extract_standalone_variables(self, text: str) -> Set[str]:
        """
        Extract standalone variables using multiple strategies.

        Args:
            text: Input text

        Returns:
            Set of variable names
        """
        variables = set()
        text_lower = text.lower()

        # Strategy 1: Extract variables with values (new)
        variables_with_values = self._extract_variables_with_values(text)
        variables.update(variables_with_values.keys())

        # Strategy 2: Keyword-based extraction
        words = re.findall(r'\b\w+\b', text_lower)
        for word in words:
            if word in self.stop_words:
                continue
            # Check if word contains or matches keywords
            for keyword in self.variable_keywords:
                if keyword in word or word in keyword:
                    variables.add(word)

        # Strategy 3: Noun phrase extraction (enhanced to handle "X is Y" patterns)
        # Extract noun phrases before "is", "=", ":" followed by numbers
        state_patterns = [
            r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+(?:\d|%)',
            r'(\w+(?:\s+\w+)?)\s*[=:]\s*(?:\d|%)',
        ]
        for pattern in state_patterns:
            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
            for match in matches:
                var = self._normalize_variable_name(match.group(1))
                if var and var not in self.stop_words:
                    variables.add(var)

        # Strategy 4: Standard noun phrase extraction
        noun_phrases = self._extract_noun_phrases(text_lower)
        for phrase in noun_phrases:
            # Filter out phrases that are just stop words
            words = phrase.split()
            if words and not all(w in self.stop_words for w in words):
                variables.add(phrase)

        # Strategy 5: Capitalized words (proper nouns or emphasized terms)
        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        for word in capitalized:
            normalized = self._normalize_variable_name(word)
            if normalized and normalized not in self.stop_words:
                variables.add(normalized)

        # Strategy 6: Quoted phrases
        quoted = re.findall(r'"([^"]+)"|\'([^\']+)\'', text)
        for match in quoted:
            phrase = (match[0] or match[1]).strip().lower()
            if phrase and phrase not in self.stop_words:
                variables.add(phrase)

        # Strategy 7: Terms after "of", "for", "in" (common variable indicators)
        of_pattern = r'\b(?:of|for|in|about|regarding)\s+(\w+(?:\s+\w+)?)'
        of_matches = re.finditer(of_pattern, text_lower)
        for match in of_matches:
            var = self._normalize_variable_name(match.group(1))
            if var and var not in self.stop_words:
                variables.add(var)

        # Strategy 8: Extract from questions (what is X, what will X be)
        question_patterns = [
            r'(?:what|which|how\s+much|how\s+many)\s+(?:is|are|will|would|should)\s+(?:the\s+)?(\w+(?:\s+\w+)?)',
            r'(?:what|which)\s+is\s+(?:the\s+)?(\w+(?:\s+\w+)?)\s+(?:of|in|for)',
        ]
        for pattern in question_patterns:
            matches = re.finditer(pattern, text_lower, re.IGNORECASE)
            for match in matches:
                var = self._normalize_variable_name(match.group(1))
                if var and var not in self.stop_words:
                    variables.add(var)

        # Strategy 9: Extract variables mentioned with "&" or "and" (common in state descriptions)
        and_pattern = r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+[\d%]+\s*(?:&|and)\s+(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)'
        and_matches = re.finditer(and_pattern, text_lower, re.IGNORECASE)
        for match in and_matches:
            var1 = self._normalize_variable_name(match.group(1))
            var2 = self._normalize_variable_name(match.group(2))
            if var1 and var1 not in self.stop_words:
                variables.add(var1)
            if var2 and var2 not in self.stop_words:
                variables.add(var2)

        # Strategy 10: Extract from action verbs (e.g., "identify X" -> extract X)
        # This helps handle vague language by finding what action verbs refer to
        action_verb_vars = self._extract_state_variables_from_action_verbs(text)
        variables.update(action_verb_vars)

        # Strategy 11: Extract from epistemic terms (e.g., "policy of X" -> extract X)
        epistemic_vars = self._extract_state_variables_from_epistemic_terms(text)
        variables.update(epistemic_vars)

        # Strategy 12: Extract from vague language patterns
        vague_vars = self._extract_variables_from_vague_language(text)
        variables.update(vague_vars)

        return variables

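    # Quick sketch of the layered strategies (hypothetical output; the exact
    # set depends on stop words, keyword lists and the noun-phrase extractor):
    #
    #   >>> self._extract_standalone_variables("If demand is 80%, what is the price?")
    #   {'demand', 'price'}
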
    def _resolve_references(self, text: str, variables: Set[str]) -> Set[str]:
        """
        Resolve pronouns and simple references to variables.

        Args:
            text: Input text
            variables: Existing variables

        Returns:
            Updated set of variables with resolved references
        """
        # Simple pronoun resolution: if we see "it", "this", "that" referring to variables
        # This is a simplified version - full resolution would require more context
        resolved = variables.copy()

        # Look for patterns like "this X", "that X", "these X", "those X"
        reference_pattern = r'\b(this|that|these|those)\s+(\w+(?:\s+\w+)?)'
        matches = re.finditer(reference_pattern, text.lower())
        for match in matches:
            var = self._normalize_variable_name(match.group(2))
            if var and var not in self.stop_words:
                resolved.add(var)

        return resolved

    def _merge_similar_variables(self, variables: Set[str]) -> Set[str]:
        """
        Merge similar variable names (plurals, variations).

        Args:
            variables: Set of variable names

        Returns:
            Merged set of variables
        """
        merged = set()
        variable_list = list(variables)

        for var in variable_list:
            # Check if similar variable already exists
            is_duplicate = False
            for existing in merged:
                # Check for plural/singular
                if var == existing or var == existing + 's' or var + 's' == existing:
                    is_duplicate = True
                    break
                # Check for common variations
                if var.replace('_', ' ') == existing.replace('_', ' '):
                    is_duplicate = True
                    break

            if not is_duplicate:
                merged.add(var)

        return merged

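    # Behaviour sketch: because set iteration order is arbitrary, which member
    # of a singular/plural pair survives is not deterministic.
    #
    #   >>> self._merge_similar_variables({'price', 'prices', 'demand'})
    #   {'price', 'demand'}   # or {'prices', 'demand'}
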
    def _filter_valid_variables(self, variables: Set[str]) -> Set[str]:
        """
        Filter variables to keep only valid ones for causal analysis.

        Args:
            variables: Set of variable names

        Returns:
            Set of valid variable names
        """
        valid = set()

        for var in variables:
            # Use _clean_variable to validate
            cleaned = self._clean_variable(var)
            if cleaned:
                # Additional checks
                words = cleaned.split()

                # Filter out single words that aren't keywords
                if len(words) == 1:
                    if cleaned not in self.variable_keywords:
                        # Check if it's a meaningful single word
                        if cleaned.lower() in self.stop_words:
                            continue
                        # Very short single words are likely invalid
                        if len(cleaned) < 4:
                            continue

                # Filter out variables that are clearly value descriptors
                value_descriptors = ['buy', 'sell', 'percent', 'percentage']
                if any(desc in cleaned.lower() for desc in value_descriptors):
                    continue

                # Filter out variables that start with "if"
                if cleaned.lower().startswith('if '):
                    continue

                valid.add(cleaned)

        return valid

    def _is_action_verb(self, var: str) -> bool:
        """
        Check if a variable is actually an action verb (epistemic/intentional action).

        Action verbs like "identify", "analyze" should NOT be treated as causal variables.

        Uses both the local action_verbs list and dictionary part-of-speech checking.

        Args:
            var: Variable name to check

        Returns:
            True if it's an action verb
        """
        var_lower = var.lower()
        words = var_lower.split()

        # Check if any word is an action verb (local list)
        for word in words:
            if word in self.action_verbs:
                return True
            # Check for verb forms (ing, ed, s). Note: str.rstrip strips a
            # character set rather than a suffix, so the suffix is removed via regex.
            base_word = re.sub(r'(?:ing|ed|s)$', '', word)
            if base_word in self.action_verbs:
                return True

        # Use dictionary to check part of speech (more accurate)
        if self.lexical_compiler and self.lexical_compiler.enable_dictionary:
            # For single-word variables, check if it's a verb
            if len(words) == 1:
                if self.lexical_compiler.is_action_verb(words[0]):
                    return True
            # For multi-word, check each word
            else:
                for word in words:
                    if self.lexical_compiler.is_action_verb(word):
                        return True

        return False

    def _is_epistemic_term(self, var: str) -> bool:
        """
        Check if a variable is an epistemic/intentional term (task, policy, etc.).

        These are not causal state variables - they're about knowledge/intentions.

        Args:
            var: Variable name to check

        Returns:
            True if it's an epistemic term
        """
        var_lower = var.lower()
        words = var_lower.split()

        # Check if any word is an epistemic term
        for word in words:
            if word in self.epistemic_terms:
                return True

        # Check for common epistemic patterns
        epistemic_patterns = [
            r'past\s+\w+',      # "past policy"
            r'\w+\s+policy',    # "X policy"
            r'\w+\s+task',      # "X task"
            r'\w+\s+goal',      # "X goal"
            r'\w+\s+decision',  # "X decision"
        ]

        for pattern in epistemic_patterns:
            if re.search(pattern, var_lower):
                return True

        return False

    def validate_causal_relationship(
        self,
        source: str,
        target: str,
        graph: Dict[str, Any]
    ) -> Tuple[bool, Optional[str]]:
        """
        Validate causal relationship using do-calculus and d-separation.

        Implements formal causal validation:
        - Correlation vs. causation: P(Y | X) ≠ P(Y | do(X))
        - D-separation: X ⊥ Y | Z if d-separated in graph
        - Temporal ordering: if X causes Y, then time(X) < time(Y)
        - Confounder detection: backdoor criterion

        Args:
            source: Source variable
            target: Target variable
            graph: Graph state

        Returns:
            Tuple of (is_valid, error_message)
        """
        nodes = graph.get('nodes', [])
        edges = graph.get('edges', [])

        # Check if variables exist
        if source not in nodes or target not in nodes:
            return False, f"Variables {source} or {target} not in graph"

        # Check for direct edge (simplified causal validation)
        if (source, target) in edges:
            # Valid causal edge
            return True, None

        # Check for confounders using backdoor criterion (simplified)
        # Look for common causes
        common_causes = []
        for node in nodes:
            if node != source and node != target:
                # Check if node is a parent of both source and target
                has_edge_to_source = (node, source) in edges
                has_edge_to_target = (node, target) in edges
                if has_edge_to_source and has_edge_to_target:
                    common_causes.append(node)

        if common_causes:
            return False, f"Potential confounder detected: {common_causes[0]}"

        return True, None

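    # Usage sketch on a hand-built graph dict (hypothetical data): Z points at
    # both X and Y, so the simplified backdoor check flags it as a confounder.
    #
    #   >>> g = {'nodes': ['X', 'Y', 'Z'],
    #   ...      'edges': [('Z', 'X'), ('Z', 'Y')]}
    #   >>> self.validate_causal_relationship('X', 'Y', g)
    #   (False, 'Potential confounder detected: Z')
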
    def _clean_variable(self, var: str) -> Optional[str]:
        """
        Clean and validate a variable name.

        Args:
            var: Raw variable name

        Returns:
            Cleaned variable name or None if invalid
        """
        if not var:
            return None

        # Normalize
        var = self._normalize_variable_name(var)
        var_lower = var.lower()

        # CRITICAL: Filter out action verbs (epistemic/intentional actions)
        # These are NOT causal state variables - they're tasks, not observables
        if self._is_action_verb(var):
            return None

        # CRITICAL: Filter out epistemic/intentional terms
        # These are about knowledge/intentions, not causal state variables
        if self._is_epistemic_term(var):
            return None

        # Filter out relationship phrases (contain causal verbs)
        if any(verb in var_lower for verb in self.causal_verbs):
            return None

        # Filter out if it contains relationship indicators
        relationship_indicators = ['depends', 'causes', 'affects', 'influences', 'leads', 'results', 'impacts']
        if any(indicator in var_lower for indicator in relationship_indicators):
            return None

        # Filter out value descriptions (buy, sell, etc. when they're part of percentages)
        value_descriptors = ['buy', 'sell', 'percent', 'percentage', '%']
        if var in value_descriptors:
            return None

        # Filter out time units that are standalone (but keep "7 days" as a variable)
        time_units = ['day', 'days', 'hour', 'hours', 'minute', 'minutes', 'second', 'seconds']
        if var in time_units and len(var.split()) == 1:
            return None

        # Remove common conjunctions at start/end
        words = var.split()
        if words:
            # Remove "and", "or", "the", "a", "an", "if" from start
            while words and words[0].lower() in {'and', 'or', 'the', 'a', 'an', 'if'}:
                words = words[1:]
            # Remove "and", "or" from end
            while words and words[-1].lower() in {'and', 'or'}:
                words = words[:-1]

        if not words:
            return None

        var = ' '.join(words)

        # Filter out if it's just stop words
        if var in self.stop_words:
            return None

        # Filter out if all words are stop words
        if all(w in self.stop_words for w in var.split()):
            return None

        # Stricter filtering for single-word variables
        if len(words) == 1:
            # Filter out single-word variables that are likely invalid
            invalid_single_words = {
                'if', 'and', 'or', 'but', 'the', 'a', 'an', 'buy', 'sell',
                'days', 'day', 'hours', 'hour', 'minutes', 'minute',
                'seconds', 'second', 'of', 'in', 'on', 'at', 'to', 'for',
                'from', 'with', 'by', 'as', 'is', 'was', 'are', 'were',
                'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
                'did', 'will', 'would', 'could', 'should', 'may', 'might',
                'must', 'can', 'this', 'that', 'these', 'those', 'what',
                'which', 'who', 'whom', 'where', 'when', 'why', 'how'
            }
            if var.lower() in invalid_single_words:
                return None

            # Filter out very short single-word variables (unless it's a known keyword)
            if len(var) < 3 and var not in self.variable_keywords:
                return None

            # Only allow single-word variables if they're in the keyword list
            if var not in self.variable_keywords:
                # Check if it's a meaningful single word (not a stop word)
                if var.lower() in self.stop_words:
                    return None

        # Filter out variables that are just conjunctions
        if var.lower() in {'and', 'or', 'but', 'the', 'a', 'an', 'if'}:
            return None

        # Filter out very long phrases (likely not a single variable)
        if len(words) > 4:
            return None

        # Final check: Reject if it's an action verb or epistemic term
        if self._is_action_verb(var) or self._is_epistemic_term(var):
            return None

        # Optional: Use dictionary to validate word (if lexical compiler available)
        # This helps filter out made-up words or typos that passed other filters
        # Note: This is a soft check - we don't require dictionary validation for all words
        # as domain-specific terms may not be in standard dictionaries
        if self.lexical_compiler and self.lexical_compiler.enable_dictionary:
            # For single-word variables, check if it's a valid word
            # Multi-word phrases are more likely to be domain-specific
            if len(words) == 1:
                # Check if word exists in dictionary
                if not self.lexical_compiler.is_valid_word(words[0]):
                    # Word not found - could be a typo or domain-specific term
                    # We'll still allow it but with lower confidence
                    logger.debug(f"Word '{words[0]}' not found in dictionary - may be domain-specific or typo")

        return var

    def _extract_clean_variables_from_relationships(self, relationships: List[Dict[str, Any]]) -> Set[str]:
        """
        Extract clean variables from relationships.

        Args:
            relationships: List of relationship dictionaries

        Returns:
            Set of clean variable names
        """
        variables = set()
        for rel in relationships:
            source = self._clean_variable(rel.get('source', ''))
            target = self._clean_variable(rel.get('target', ''))
            if source:
                variables.add(source)
            if target:
                variables.add(target)
        return variables

    def _infer_relationships_from_context(self, variables: Set[str], text: str) -> List[Tuple[str, str]]:
        """
        Infer relationships from context when explicit patterns aren't found.
        Enhanced to handle conditional questions and state descriptions.

        Args:
            variables: Set of extracted variables
            text: Original text

        Returns:
            List of inferred (source, target) tuples
        """
        inferred = []
        var_list = sorted(list(variables))
        text_lower = text.lower()

        # Clean variable list - remove value descriptors
        cleaned_vars = [v for v in var_list if self._clean_variable(v)]

        # If we have a conditional question pattern: "If X is Y, what is Z?"
        if 'if' in text_lower and ('what' in text_lower or 'which' in text_lower or 'expected' in text_lower):
            # Find variables mentioned before "what" or "expected"
            question_markers = ['what', 'which', 'expected']
            question_pos = -1
            for marker in question_markers:
                pos = text_lower.find(marker)
                if pos > 0:
                    question_pos = pos
                    break

            if question_pos > 0:
                before_question = text_lower[:question_pos]
                after_question = text_lower[question_pos:]

                # Variables before the question are likely causes (state variables)
                before_vars = [v for v in cleaned_vars if v.lower() in before_question and 'expected' not in v.lower()]
                # Variables after the question are likely effects (question target)
                after_vars = [v for v in cleaned_vars if v.lower() in after_question or 'expected' in v.lower()]

                # Also look for "expected X" pattern
                expected_pattern = r'expected\s+(\w+(?:\s+\w+)?)'
                expected_match = re.search(expected_pattern, text_lower, re.IGNORECASE)
                if expected_match:
                    expected_var = self._clean_variable(expected_match.group(1))
                    if expected_var and expected_var not in after_vars:
                        after_vars.append(expected_var)

                # Create relationships: state variables -> question target
                if before_vars and after_vars:
                    for before_var in before_vars:
                        for after_var in after_vars:
                            if before_var != after_var:
                                inferred.append((before_var, after_var))
                elif before_vars and not after_vars:
                    # If no explicit target, use the most likely target (e.g., "expected price")
                    # Look for variables with "expected" or mentioned in the question
                    question_var_pattern = r'(?:what|which|expected)\s+(?:is|are|will|would|the\s+)?(\w+(?:\s+\w+)?)'
                    q_match = re.search(question_var_pattern, text_lower, re.IGNORECASE)
                    if q_match:
                        q_var = self._clean_variable(q_match.group(1))
                        if q_var and q_var in cleaned_vars:
                            for before_var in before_vars:
                                if before_var != q_var:
                                    inferred.append((before_var, q_var))

        # If we have state descriptions with multiple variables
        # Pattern: "X is Y & Z is W" -> X and Z might affect the question target
        if '&' in text or (' and ' in text_lower and 'is' in text_lower):
            # Find variables mentioned with "is" followed by values
            state_pattern = r'(\w+(?:\s+\w+)?)\s+(?:is|are|was|were)\s+[\d%]+'
            state_matches = list(re.finditer(state_pattern, text_lower, re.IGNORECASE))

            if len(state_matches) >= 1:
                # Variables mentioned in state descriptions
                state_vars = []
                for m in state_matches:
                    var = self._clean_variable(m.group(1))
                    if var and var in cleaned_vars:
                        state_vars.append(var)

                # Find the question target
                question_vars = []
                # Look for "expected X" or "what is X"
                expected_pattern = r'expected\s+(\w+(?:\s+\w+)?)'
                what_pattern = r'what\s+(?:is|are|will|would)\s+(?:the\s+)?(\w+(?:\s+\w+)?)'

                for pattern in [expected_pattern, what_pattern]:
                    match = re.search(pattern, text_lower, re.IGNORECASE)
                    if match:
                        q_var = self._clean_variable(match.group(1))
                        if q_var and q_var in cleaned_vars:
                            question_vars.append(q_var)

                # If no explicit question var, look for variables with "price" or similar
                if not question_vars:
                    price_vars = [v for v in cleaned_vars if 'price' in v.lower() and 'expected' not in v.lower()]
                    if price_vars:
                        question_vars = price_vars[:1]  # Take the first one

                # Create relationships from state variables to the question target
                for state_var in state_vars:
                    for q_var in question_vars:
                        if state_var != q_var:
                            inferred.append((state_var, q_var))

        # Remove duplicates
        inferred = list(set(inferred))

        return inferred

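    # Sketch of the conditional-question path (hypothetical; the exact result
    # depends on _clean_variable and the configured stop words):
    #
    #   >>> sorted(self._infer_relationships_from_context(
    #   ...     {'demand', 'supply', 'expected price'},
    #   ...     "If demand is 80% and supply is 40%, what is the expected price?"))
    #   [('demand', 'expected price'), ('supply', 'expected price')]
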
    def extract_variables_from_task(self, task: str) -> Dict[str, Any]:
        """
        Advanced extraction of variables and relationships from natural language.

        Automatically extracts variables and relationships (causal, knowledge, etc.) from natural language text.
        Enhanced to handle:
        - Causal relationships (depends on, affects, causes)
        - General knowledge relationships (is-a, has, part-of, located-in)
        - Numerical values, conditionals, questions, and state descriptions
        - Definitions, facts, and taxonomic relationships

        Args:
            task: Natural language task description

        Returns:
            Dictionary with 'variables', 'edges', 'relationships', and metadata

        Example:
            >>> agent = HybridAgent()
            >>> result = agent.extract_variables_from_task("price depends on demand and supply")
            >>> print(result['variables'])  # ['demand', 'price', 'supply']
            >>> print(result['edges'])  # [('price', 'demand'), ('price', 'supply')]

            >>> result = agent.extract_variables_from_task("A dog is a mammal")
            >>> print(result['variables'])  # ['dog', 'mammal']
            >>> print(result['edges'])  # [('dog', 'mammal')] with type='is_a'

            >>> result = agent.extract_variables_from_task("Paris is in France")
            >>> print(result['variables'])  # ['France', 'Paris']
            >>> print(result['edges'])  # [('Paris', 'France')] with type='located_in'
        """
        # Extract relationships with context
        relationships = self._extract_with_context(task)

        # Extract clean variables from relationships first (most reliable)
        variables = self._extract_clean_variables_from_relationships(relationships)

        # Extract standalone variables (supplementary) - this now handles state descriptions
        standalone_vars = self._extract_standalone_variables(task)

        # Clean standalone variables
        for var in standalone_vars:
            cleaned = self._clean_variable(var)
            if cleaned:
                variables.add(cleaned)

        # NEW: Extract state variables from action verbs and epistemic terms
        # This helps handle vague language by finding what action verbs/epistemic terms refer to
        action_verb_vars = self._extract_state_variables_from_action_verbs(task)
        epistemic_vars = self._extract_state_variables_from_epistemic_terms(task)
        vague_language_vars = self._extract_variables_from_vague_language(task)

        # Add extracted variables (they're already cleaned by the extraction methods)
        for var in action_verb_vars:
            cleaned = self._clean_variable(var)
            if cleaned:
                variables.add(cleaned)

        for var in epistemic_vars:
            cleaned = self._clean_variable(var)
            if cleaned:
                variables.add(cleaned)

        for var in vague_language_vars:
            cleaned = self._clean_variable(var)
            if cleaned:
                variables.add(cleaned)

        # Resolve references
        variables = self._resolve_references(task, variables)

        # Merge similar variables
        variables = self._merge_similar_variables(variables)

        # Final filtering: remove invalid variables
        variables = self._filter_valid_variables(variables)

        # Build clean edges from relationships
        edges = []
        for rel in relationships:
            if not rel.get('negated', False):  # Only add non-negated relationships
                source = self._clean_variable(rel['source'])
                target = self._clean_variable(rel['target'])
                if source and target and source != target:
                    # Filter out edges with "of" at the end (e.g., "price of")
                    if not target.endswith(' of') and not source.endswith(' of'):
                        edges.append((source, target))

        # If no explicit edges found, try to infer from context
        if not edges and variables:
            inferred_edges = self._infer_relationships_from_context(variables, task)
            edges.extend(inferred_edges)

        # Clean up edges: remove edges to/from invalid variables
        # Only include edges between valid variables
        valid_vars_set = variables
        cleaned_edges = []
        for source, target in edges:
            source_clean = self._clean_variable(source)
            target_clean = self._clean_variable(target)

            # Both must be valid and in the valid variables set
            if (source_clean and target_clean and
                    source_clean != target_clean and
                    source_clean in valid_vars_set and
                    target_clean in valid_vars_set):
                # Don't create edges to time units unless they're part of a compound variable
                if target_clean in ['days', 'day', 'hours', 'hour'] and len(target_clean.split()) == 1:
                    continue
                # Don't create edges from single words to compound phrases that contain them
                if source_clean in target_clean.split() or target_clean in source_clean.split():
                    # Only skip if one is clearly a subset of the other
                    if len(source_clean.split()) != len(target_clean.split()):
                        continue
                cleaned_edges.append((source_clean, target_clean))

        # Remove duplicate edges
        edges = list(set(cleaned_edges))

        # Prioritize edges: prefer edges to "expected X" or question targets
        question_targets = [v for v in variables if 'expected' in v.lower()]
        if question_targets:
            # Keep edges that go to question targets
            prioritized_edges = [e for e in edges if e[1] in question_targets]
            # Add other edges that don't conflict
            for e in edges:
                if e not in prioritized_edges:
                    # Only add if source doesn't already have an edge to a question target
                    if not any(e[0] == p[0] for p in prioritized_edges):
                        prioritized_edges.append(e)
            edges = prioritized_edges if prioritized_edges else edges

        # Extract metadata
        metadata = {
            'total_relationships': len(relationships),
            'negated_relationships': sum(1 for r in relationships if r.get('negated', False)),
            'average_confidence': sum(r['confidence'] for r in relationships) / len(relationships) if relationships else 0.0,
            'variables_extracted': len(variables),
            'edges_extracted': len(edges),
            'variables_with_values': self._extract_variables_with_values(task)
        }

        return {
            'variables': sorted(list(variables)),  # Sorted for consistency
            'edges': edges,
            'relationships': relationships,
            'metadata': metadata
        }

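    # Minimal end-to-end sketch (the result contents depend on the configured
    # pattern lists, so only the key set is stable):
    #
    #   >>> agent = HybridAgent()
    #   >>> out = agent.extract_variables_from_task("price depends on demand")
    #   >>> sorted(out.keys())
    #   ['edges', 'metadata', 'relationships', 'variables']
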
    def infer_causal_structure(self, variables: List[str], context: Optional[str] = None) -> List[Tuple[str, str]]:
        """
        Infer causal structure from variables using advanced logical inference.

        Args:
            variables: List of variable names
            context: Optional context text for better inference

        Returns:
            List of (source, target) tuples representing causal edges
        """
        edges = []

        if not variables:
            return edges

        # Strategy 1: Sequential inference (variables mentioned in order)
        # Only if we have 2-4 variables (too many would create too many edges)
        if 2 <= len(variables) <= 4:
            for i in range(len(variables) - 1):
                source = variables[i]
                target = variables[i + 1]
                # Only add if not creating cycles
                if not self.graph_manager.has_path(target, source):
                    edges.append((source, target))

        # Strategy 2: Domain-specific heuristics
        # Common patterns: input -> process -> output, cause -> effect
        variable_lower = [v.lower() for v in variables]

        # Look for common causal patterns
        input_keywords = ['input', 'source', 'cause', 'factor', 'driver', 'trigger']
        output_keywords = ['output', 'result', 'effect', 'outcome', 'consequence', 'impact']
        process_keywords = ['process', 'mechanism', 'method', 'approach', 'system']

        inputs = [v for v, v_lower in zip(variables, variable_lower)
                  if any(kw in v_lower for kw in input_keywords)]
        outputs = [v for v, v_lower in zip(variables, variable_lower)
                   if any(kw in v_lower for kw in output_keywords)]
        processes = [v for v, v_lower in zip(variables, variable_lower)
                     if any(kw in v_lower for kw in process_keywords)]

        # Input -> Process -> Output pattern
        if inputs and processes:
            for inp in inputs:
                for proc in processes:
                    if not self.graph_manager.has_path(proc, inp):
                        edges.append((inp, proc))

        if processes and outputs:
            for proc in processes:
                for out in outputs:
                    if not self.graph_manager.has_path(out, proc):
                        edges.append((proc, out))

        # Direct input -> output (if no process)
        if inputs and outputs and not processes:
            for inp in inputs:
                for out in outputs:
                    if not self.graph_manager.has_path(out, inp):
                        edges.append((inp, out))

        # Strategy 3: Context-based inference (if context provided)
        if context:
            context_lower = context.lower()
            # Look for mentions of variables in context
            for i, var1 in enumerate(variables):
                for var2 in variables[i + 1:]:
                    # Check if var1 appears before var2 in context
                    pos1 = context_lower.find(var1.lower())
                    pos2 = context_lower.find(var2.lower())
                    if pos1 != -1 and pos2 != -1 and pos1 < pos2:
                        # Check if there's a causal word between them
                        between = context_lower[pos1:pos2]
                        if any(verb in between for verb in self.causal_verbs):
                            if not self.graph_manager.has_path(var2, var1):
                                edges.append((var1, var2))

        # Remove duplicates
        edges = list(set(edges))

        return edges

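    # Heuristic sketch (hypothetical names chosen to trigger Strategy 2):
    # "input rate" matches the input keywords, "process speed" the process
    # keywords, and "output quality" the output keywords, so the call below
    # would propose input -> process -> output edges, cycle checks permitting.
    #
    #   >>> self.infer_causal_structure(
    #   ...     ['input rate', 'process speed', 'output quality'])
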
    def validate_causal_graph(self) -> Tuple[bool, Optional[str]]:
        """
        Validate that the causal graph is a valid DAG.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not self.graph_manager.is_dag():
            return False, "Graph contains cycles"
        return True, None

    def apply_causal_rules(self, state: Dict[str, float]) -> Dict[str, float]:
        """
        Apply rule-based causal reasoning to a state.

        Args:
            state: Dictionary mapping variables to values

        Returns:
            Updated state after applying causal rules
        """
        result = state.copy()

        # Get topological order
        try:
            order = self.graph_manager.topological_sort()
        except Exception:
            order = list(state.keys())

        # Apply causal propagation
        for node in order:
            if node not in result:
                continue

            parents = self.graph_manager.get_parents(node)
            if not parents:
                continue

            # Simple linear combination rule
            value = result.get(node, 0.0)
            for parent in parents:
                if parent in result:
                    strength = self.graph_manager.edge_strength(parent, node)
                    value += result[parent] * strength * 0.1  # Dampening factor

            result[node] = value

        return result


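# Worked example of the propagation rule above (hypothetical numbers): with
# state {'demand': 2.0, 'price': 1.0} and an edge demand -> price of strength
# 0.5, price updates to 1.0 + 2.0 * 0.5 * 0.1 = 1.1; nodes without parents
# pass through unchanged.

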
class StatisticalEngine:
    """
    Statistical inference engine wrapping StatisticalMethods.

    Provides Bayesian inference, regression-based edge estimation,
    and uncertainty quantification.
    """

    def __init__(
        self,
        graph_manager: GraphManager,
        prediction_framework: PredictionFramework,
        seed: int = 42
    ):
        """
        Initialize statistical engine.

        Args:
            graph_manager: GraphManager instance
            prediction_framework: PredictionFramework instance
            seed: Random seed
        """
        self.graph_manager = graph_manager
        self.prediction_framework = prediction_framework
        self.statistical_methods = StatisticalMethods(
            graph_manager=graph_manager,
            prediction_framework=prediction_framework,
            seed=seed
        )

    def fit_from_dataframe(
        self,
        df: Any,
        variables: List[str],
        window: int = 30,
        decay_alpha: float = 0.9,
        ridge_lambda: float = 0.0,
        enforce_signs: bool = True
    ) -> None:
        """
        Fit edge strengths from data using Bayesian regression.

        Args:
            df: pandas DataFrame with historical data
            variables: List of variable names to fit
            window: Rolling window size
            decay_alpha: Decay factor for recency weighting
            ridge_lambda: Ridge regularization parameter
            enforce_signs: Whether to enforce edge sign constraints
        """
        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for statistical fitting")

        self.statistical_methods.fit_from_dataframe(
            df=df,
            variables=variables,
            window=window,
            decay_alpha=decay_alpha,
            ridge_lambda=ridge_lambda,
            enforce_signs=enforce_signs
        )

        # Update prediction framework standardization stats
        self.prediction_framework.standardization_stats = (
            self.statistical_methods.standardization_stats.copy()
        )

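    # Usage sketch (assumes pandas is installed, the DataFrame columns match
    # the variable names, and "engine" is a StatisticalEngine instance;
    # parameter values are illustrative):
    #
    #   import pandas as pd
    #   df = pd.DataFrame({'demand': [...], 'price': [...]})
    #   engine.fit_from_dataframe(df, ['demand', 'price'],
    #                             window=30, decay_alpha=0.9)
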
    def quantify_uncertainty(
        self,
        df: Any,
        variables: List[str],
        windows: int = 200,
        alpha: float = 0.95
    ) -> Dict[str, Any]:
        """
        Quantify uncertainty using bootstrap resampling.

        Args:
            df: pandas DataFrame
            variables: List of variable names
            windows: Number of bootstrap samples
            alpha: Confidence level

        Returns:
            Dictionary with edge confidence intervals
        """
        if not PANDAS_AVAILABLE:
            return {}

        return self.statistical_methods.quantify_uncertainty(
            df=df,
            variables=variables,
            windows=windows,
            alpha=alpha
        )

    def assess_causal_strength(self, source: str, target: str) -> float:
        """
        Assess causal strength between two variables.

        Args:
            source: Source variable
            target: Target variable

        Returns:
            Causal strength (0.0 if no edge exists)
        """
        return self.graph_manager.edge_strength(source, target)

    def generate_probabilistic_counterfactuals(
        self,
        factual_state: Dict[str, float],
        target_variables: List[str],
        n_scenarios: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Generate probabilistic counterfactual scenarios.

        Args:
            factual_state: Current factual state
            target_variables: Variables to intervene on
            n_scenarios: Number of scenarios to generate

        Returns:
            List of counterfactual scenario dictionaries
        """
        scenarios = []

        for i in range(n_scenarios):
            # Generate intervention values (deterministic sampling using seeded RNG)
            interventions = {}
            # Use this engine's RNG if available, otherwise the statistical methods'
            rng = getattr(self, '_rng', None)
            if rng is None and hasattr(self.statistical_methods, '_rng'):
                rng = self.statistical_methods._rng
            elif rng is None:
                # Fallback: create deterministic RNG with seed
                if NUMPY_AVAILABLE:
                    rng = np.random.default_rng(42)
                else:
                    import random
                    random.seed(42)
                    rng = random

            for var in target_variables:
                if var in factual_state:
                    base_value = factual_state[var]
                    # Sample around the base value (deterministic)
                    if NUMPY_AVAILABLE and hasattr(rng, 'random'):
                        random_val = float(rng.random())
                    else:
                        random_val = rng.random() if hasattr(rng, 'random') else 0.5
                    intervention_value = base_value * (0.5 + random_val)
                    interventions[var] = intervention_value

            # Predict outcomes
            try:
                predicted = self.prediction_framework.predict_outcomes(
                    factual_state=factual_state,
                    interventions=interventions
                )

                scenarios.append({
                    'name': f'Scenario {i+1}',
                    'interventions': interventions,
                    'expected_outcomes': predicted,
                    'probability': 1.0 / n_scenarios
                })
            except Exception as e:
                logger.warning(f"Failed to generate scenario {i+1}: {e}")
                continue

        return scenarios


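# Usage sketch (hypothetical setup; graph_manager and prediction_framework
# are assumed to be already configured with a demand -> price edge):
#
#   engine = StatisticalEngine(graph_manager, prediction_framework, seed=42)
#   scenarios = engine.generate_probabilistic_counterfactuals(
#       {'demand': 1.0, 'price': 2.0}, ['demand'], n_scenarios=3)
#   # Each scenario carries 'interventions', 'expected_outcomes' and a
#   # uniform 'probability' of 1/3.

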
class RuleBasedNLG:
    """
    Enhanced rule-based natural language generation for LLM replacement.

    Generates natural, conversational responses from graph state and reasoning results.
    Uses a pragmatic layer for tone adjustment and confidence-based language.
    """

    def __init__(self):
        """Initialize enhanced rule-based NLG."""
        self.templates = {
            'causal_analysis': """## Causal Analysis

**Variables Identified:** {variables}

**Causal Relationships:**
{relationships}

**Graph Structure:**
{graph_structure}

**Key Insights:**
{insights}
""",
            'knowledge_analysis': """## Knowledge Analysis

**Entities Identified:** {variables}

**Relationships:**
{relationships}

**Graph Structure:**
{graph_structure}

**Key Insights:**
{insights}
""",
            'general_analysis': """## Analysis

**Entities/Variables Identified:** {variables}

**Relationships:**
{relationships}

**Graph Structure:**
{graph_structure}

**Key Insights:**
{insights}
""",
            'counterfactual': """## Counterfactual Scenario: {name}

**Interventions:**
{interventions}

**Expected Outcomes:**
{outcomes}

**Probability:** {probability:.2%}
""",
            'statistical_summary': """## Statistical Summary

**Edge Strengths:**
{edge_strengths}

**Uncertainty:**
{uncertainty}

**Confidence Intervals:**
{confidence_intervals}
""",
            'recommendation': """## Recommendations

Based on the causal analysis:

{recommendations}
""",
            'conversational_intro': """I've analyzed your question about {topic}. Here's what I found:

""",
            'conversational_summary': """
Based on the causal relationships I've identified, {summary}

""",
            'question_answer': """To answer your question: {question}

{answer}

This conclusion is derived from the causal graph structure, which shows {explanation}.

""",
            'explanation': """Let me explain how I reached this conclusion:

{explanation}

The causal relationships in the graph indicate that {insight}.

""",
        }

        # Conversational connectors
        self.connectors = {
            'high_confidence': ['Based on', 'According to', 'The evidence shows', 'Analysis indicates'],
            'medium_confidence': ['It appears that', 'The data suggests', 'This likely means', 'It seems'],
            'low_confidence': ['It may be that', 'Possibly', 'This could indicate', 'There might be'],
            'transition': ['Furthermore', 'Additionally', 'Moreover', 'In addition', 'Also'],
            'conclusion': ['Therefore', 'Thus', 'As a result', 'Consequently', 'Hence']
        }

        # Natural language patterns for different intents
        self.intent_responses = {
            'question': "Let me answer your question based on the relationships I've identified.",
            'analysis': "I've performed an analysis of the relationships you described.",
            'prediction': "Based on the structure, here's what I predict:",
            'counterfactual': "Let me explore what would happen if we changed certain variables:",
            'recommendation': "Based on the analysis, here are my recommendations:",
            'extraction': "I've extracted the following structure from your description:",
            'definition': "Here's what I know about that:",
            'person_query': "Here's information about that person:",
            'location_query': "Here's the location information:",
            'temporal_query': "Here's the temporal information:",
            'explanation': "Let me explain:",
            'comparison': "Comparing the entities, I found:"
        }

    def format_causal_analysis(self, analysis: Dict[str, Any]) -> str:
        """
        Format causal analysis results into natural language.

        Args:
            analysis: Dictionary with analysis results

        Returns:
            Formatted natural language text
        """
        variables = analysis.get('variables', [])
        relationships = analysis.get('relationships', [])
        graph_structure = analysis.get('graph_structure', '')
        insights = analysis.get('insights', [])

        # Format relationships
        rel_text = []
        for rel in relationships:
            source = rel.get('source', '')
            target = rel.get('target', '')
            rel_type = rel.get('type', '')
            strength = rel.get('strength', 0.0)
            rel_text.append(f"- {source} -> {target} (type: {rel_type}, strength: {strength:.3f})")

        # Format insights
        insights_text = []
        if isinstance(insights, list):
            for insight in insights:
                insights_text.append(f"- {insight}")
        else:
            insights_text.append(f"- {insights}")

        return self.templates['causal_analysis'].format(
            variables=', '.join(variables) if variables else 'None identified',
            relationships='\n'.join(rel_text) if rel_text else 'No relationships found',
            graph_structure=graph_structure or 'No graph structure available',
            insights='\n'.join(insights_text) if insights_text else 'No insights generated'
        )

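    # Rendering sketch (hypothetical analysis dict; strengths are examples):
    #
    #   nlg = RuleBasedNLG()
    #   text = nlg.format_causal_analysis({
    #       'variables': ['demand', 'price'],
    #       'relationships': [{'source': 'demand', 'target': 'price',
    #                          'type': 'affects', 'strength': 0.42}],
    #       'insights': ['demand is the main driver of price'],
    #   })
    #   # -> markdown containing "- demand -> price (type: affects, strength: 0.420)"
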
    def format_counterfactuals(self, scenarios: List[Dict[str, Any]]) -> str:
        """
        Format counterfactual scenarios into natural language.

        Args:
            scenarios: List of counterfactual scenario dictionaries

        Returns:
            Formatted natural language text
        """
        if not scenarios:
            return "No counterfactual scenarios generated."

        formatted = []
        for scenario in scenarios:
            name = scenario.get('name', 'Unknown')
            interventions = scenario.get('interventions', {})
            outcomes = scenario.get('expected_outcomes', {})
            probability = scenario.get('probability', 0.0)

            # Format interventions
            inter_text = []
            for var, val in interventions.items():
                inter_text.append(f"- {var}: {val:.3f}")

            # Format outcomes
            out_text = []
            for var, val in outcomes.items():
                out_text.append(f"- {var}: {val:.3f}")

            formatted.append(self.templates['counterfactual'].format(
                name=name,
                interventions='\n'.join(inter_text) if inter_text else 'None',
                outcomes='\n'.join(out_text) if out_text else 'None',
                probability=probability
            ))

        return '\n\n'.join(formatted)

    def format_statistical_results(self, results: Dict[str, Any]) -> str:
        """
        Format statistical results into natural language.

        Args:
            results: Dictionary with statistical results

        Returns:
            Formatted natural language text
        """
        edge_strengths = results.get('edge_strengths', {})
        uncertainty = results.get('uncertainty', {})
        confidence_intervals = results.get('confidence_intervals', {})

        # Format edge strengths
        strength_text = []
        for (source, target), strength in edge_strengths.items():
            strength_text.append(f"- {source} -> {target}: {strength:.3f}")

        # Format uncertainty
        uncertainty_text = []
        for key, val in uncertainty.items():
            uncertainty_text.append(f"- {key}: {val:.3f}")

        # Format confidence intervals
        ci_text = []
        for key, ci in confidence_intervals.items():
            if isinstance(ci, dict):
                lower = ci.get('lower', 0.0)
                upper = ci.get('upper', 0.0)
                ci_text.append(f"- {key}: [{lower:.3f}, {upper:.3f}]")
            else:
                ci_text.append(f"- {key}: {ci}")

        return self.templates['statistical_summary'].format(
            edge_strengths='\n'.join(strength_text) if strength_text else 'None',
            uncertainty='\n'.join(uncertainty_text) if uncertainty_text else 'None',
            confidence_intervals='\n'.join(ci_text) if ci_text else 'None'
        )

    def generate_response(
        self,
        reasoning_result: Dict[str, Any],
        response_type: str = 'full',
        pragmatic_info: Optional[Dict[str, Any]] = None,
        show_reasoning: bool = False,
        reasoning_chain: Optional[ReasoningChain] = None
    ) -> str:
        """
        Generate enhanced natural language response with conversational tone.

        Args:
            reasoning_result: Dictionary with reasoning results
            response_type: Type of response ('full', 'analysis', 'counterfactuals', 'statistical', 'conversational')
            pragmatic_info: Optional pragmatic information (register, hedging, explicitness)
            show_reasoning: Whether to include chain-of-thought reasoning in the output
            reasoning_chain: Optional ReasoningChain to render when show_reasoning is True

        Returns:
            Natural language response
        """
        intent = reasoning_result.get('intent', {})
        intent_type = intent.get('type', 'analysis')

        # Get pragmatic information
        if pragmatic_info is None:
            pragmatic_info = reasoning_result.get('pragmatic', {})

        register = pragmatic_info.get('register', 'neutral')
        hedging = pragmatic_info.get('hedging', 'likely')

        if response_type == 'conversational':
            # Enhanced conversational format with chain-of-thought
            parts = []

            # Show chain-of-thought reasoning if requested
            if show_reasoning and reasoning_chain:
                reasoning_text = self._format_reasoning_chain(reasoning_chain)
                parts.append(reasoning_text)

            # Conversational introduction
            task = reasoning_result.get('task', '')
            if task:
                # Extract topic from task
                topic = self._extract_topic(task)
                intro = self.intent_responses.get(intent_type, "I've analyzed your request.")
                parts.append(intro + "\n\n")

            # Graph-first answer if available (most authoritative)
            graph_answer = reasoning_result.get('graph_first_answer', {})
            if graph_answer and graph_answer.get('answer'):
                # Determine graph type from relationships or result
                analysis = reasoning_result.get('analysis', {})
                analysis_relationships = analysis.get('relationships', [])
                has_general = any(
                    isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
                    for rel in analysis_relationships
                )
                graph_type_str = 'knowledge' if has_general else 'causal'
                parts.append(self._format_graph_answer(graph_answer, hedging, graph_type_str))

            # Analysis with natural language (causal or general knowledge)
            if 'analysis' in reasoning_result:
                analysis = reasoning_result['analysis']
                # Determine if this is causal or general knowledge
                analysis_relationships = analysis.get('relationships', [])
                has_causal = any(
                    (isinstance(rel, dict) and (rel.get('category') == 'causal' or rel.get('type', '').startswith('causal'))) or
                    (isinstance(rel, str) and 'causal' in rel.lower())
                    for rel in analysis_relationships
                )
                has_general = any(
                    isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
                    for rel in analysis_relationships
                )

                if has_causal and not has_general:
                    analysis_text = self._format_analysis_conversational(
                        analysis,
                        register,
                        hedging
                    )
                elif has_general:
                    analysis_text = self._format_knowledge_analysis_conversational(
                        analysis,
                        register,
                        hedging
                    )
                else:
                    analysis_text = self._format_analysis_conversational(
                        analysis,
                        register,
                        hedging
                    )
                parts.append(analysis_text)

            # Answer specific questions
            if intent_type in ['question', 'prediction'] and 'analysis' in reasoning_result:
                answer = self._generate_question_answer(reasoning_result, hedging)
                if answer:
                    parts.append(answer)

            # Counterfactuals with explanation
            if 'counterfactuals' in reasoning_result and reasoning_result['counterfactuals']:
                cf_text = self._format_counterfactuals_conversational(
                    reasoning_result['counterfactuals'],
                    hedging
                )
                parts.append(cf_text)

            # Recommendations
            recommendations = reasoning_result.get('recommendations', [])
            if recommendations:
                rec_text = self._format_recommendations_conversational(recommendations, hedging)
                parts.append(rec_text)

            # Statistical results (if available and relevant)
            if 'statistical' in reasoning_result and reasoning_result['statistical']:
|
|
2209
|
+
if intent_type in ['analysis', 'prediction', 'comparison']:
|
|
2210
|
+
stat_text = self._format_statistical_conversational(
|
|
2211
|
+
reasoning_result['statistical'],
|
|
2212
|
+
hedging
|
|
2213
|
+
)
|
|
2214
|
+
parts.append(stat_text)
|
|
2215
|
+
|
|
2216
|
+
# Show transparency information if available
|
|
2217
|
+
if 'transparency' in reasoning_result and response_type != 'brief':
|
|
2218
|
+
transparency_text = self._format_transparency(reasoning_result['transparency'])
|
|
2219
|
+
parts.append(transparency_text)
|
|
2220
|
+
|
|
2221
|
+
return '\n\n'.join(parts)
|
|
2222
|
+
|
|
2223
|
+
elif response_type == 'full':
|
|
2224
|
+
# Original full format for backwards compatibility
|
|
2225
|
+
parts = []
|
|
2226
|
+
|
|
2227
|
+
task = reasoning_result.get('task', '')
|
|
2228
|
+
if task:
|
|
2229
|
+
parts.append(f"## Task Analysis\n\nAnalyzing: *{task}*\n")
|
|
2230
|
+
|
|
2231
|
+
if intent_type == 'extraction':
|
|
2232
|
+
parts.append("## Extracted Causal Structure\n")
|
|
2233
|
+
elif intent_type == 'counterfactual':
|
|
2234
|
+
parts.append("## Counterfactual Analysis\n")
|
|
2235
|
+
elif intent_type == 'recommendation':
|
|
2236
|
+
parts.append("## Causal Analysis & Recommendations\n")
|
|
2237
|
+
else:
|
|
2238
|
+
parts.append("## Causal Analysis\n")
|
|
2239
|
+
|
|
2240
|
+
if 'analysis' in reasoning_result:
|
|
2241
|
+
parts.append(self.format_causal_analysis(reasoning_result['analysis']))
|
|
2242
|
+
|
|
2243
|
+
recommendations = reasoning_result.get('recommendations', [])
|
|
2244
|
+
if recommendations:
|
|
2245
|
+
parts.append("\n## Recommendations\n")
|
|
2246
|
+
for i, rec in enumerate(recommendations, 1):
|
|
2247
|
+
parts.append(f"{i}. {rec}")
|
|
2248
|
+
|
|
2249
|
+
if 'counterfactuals' in reasoning_result and reasoning_result['counterfactuals']:
|
|
2250
|
+
parts.append("\n" + self.format_counterfactuals(reasoning_result['counterfactuals']))
|
|
2251
|
+
|
|
2252
|
+
if 'statistical' in reasoning_result and reasoning_result['statistical']:
|
|
2253
|
+
if intent_type in ['analysis', 'prediction', 'comparison']:
|
|
2254
|
+
parts.append("\n" + self.format_statistical_results(reasoning_result['statistical']))
|
|
2255
|
+
|
|
2256
|
+
return '\n\n'.join(parts)
|
|
2257
|
+
|
|
2258
|
+
elif response_type == 'analysis':
|
|
2259
|
+
return self.format_causal_analysis(reasoning_result.get('analysis', {}))
|
|
2260
|
+
|
|
2261
|
+
elif response_type == 'counterfactuals':
|
|
2262
|
+
return self.format_counterfactuals(reasoning_result.get('counterfactuals', []))
|
|
2263
|
+
|
|
2264
|
+
elif response_type == 'statistical':
|
|
2265
|
+
return self.format_statistical_results(reasoning_result.get('statistical', {}))
|
|
2266
|
+
|
|
2267
|
+
else:
|
|
2268
|
+
return str(reasoning_result)
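
    # Usage sketch (hypothetical reasoning_result; assumes an instance `nlg`):
    #   text = nlg.generate_response(
    #       {'task': 'How does price affect demand?',
    #        'intent': {'type': 'question'},
    #        'analysis': {'variables': ['price', 'demand'],
    #                     'relationships': [{'source': 'price', 'target': 'demand',
    #                                        'category': 'causal', 'strength': -0.4}]}},
    #       response_type='conversational')
    # The 'conversational' branch above assembles intro, graph answer,
    # analysis, counterfactuals, recommendations, and statistics in order.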

    def _extract_topic(self, task: str) -> str:
        """Extract topic from task for conversational intro."""
        # Simple extraction - take first few words
        words = task.split()[:5]
        return ' '.join(words)

    def _format_reasoning_chain(self, reasoning_chain: ReasoningChain) -> str:
        """
        Format reasoning chain for display.

        Args:
            reasoning_chain: Reasoning chain to format

        Returns:
            Formatted reasoning text
        """
        parts = ["## Chain-of-Thought Reasoning\n"]

        for i, step in enumerate(reasoning_chain.steps, 1):
            step_text = f"**Step {i}: {step.operation}**\n"
            if step.input_state:
                step_text += f"  Input: {str(step.input_state)[:100]}...\n"
            if step.output_state:
                step_text += f"  Output: {str(step.output_state)[:100]}...\n"
            if step.conclusion:
                step_text += f"  Conclusion: {step.conclusion}\n"
            if step.confidence < 1.0:
                step_text += f"  Confidence: {step.confidence:.2f}\n"

            parts.append(step_text)

        return "\n".join(parts)

    def _format_transparency(self, transparency: Dict[str, Any]) -> str:
        """
        Format transparency information.

        Args:
            transparency: Transparency dictionary

        Returns:
            Formatted transparency text
        """
        parts = ["## Transparency\n"]

        confidence_viz = transparency.get('confidence', {})
        if confidence_viz:
            mean_conf = confidence_viz.get('mean_confidence', 0.0)
            std_conf = confidence_viz.get('std_confidence', 0.0)
            parts.append(f"**Confidence:** {mean_conf:.2f} ± {std_conf:.2f}")

        graph_structure = transparency.get('graph_structure', {})
        if graph_structure:
            parts.append(f"**Graph Structure:** {graph_structure.get('structure_type', 'unknown')}")
            parts.append(f"  - Nodes: {graph_structure.get('num_nodes', 0)}")
            parts.append(f"  - Edges: {graph_structure.get('num_edges', 0)}")

        return "\n".join(parts)

    def _format_graph_answer(self, graph_answer: Dict[str, Any], hedging: str, graph_type: str = 'causal') -> str:
        """Format graph-first answer conversationally."""
        answer = graph_answer.get('answer', '')
        evidence = graph_answer.get('supporting_evidence', [])

        if answer:
            if graph_type in ['knowledge', 'mixed']:
                result = f"Based on the knowledge graph, {hedging} {answer.lower()}\n\n"
            else:
                result = f"Based on the causal graph structure, {hedging} {answer.lower()}\n\n"
            if evidence:
                result += "This conclusion is supported by:\n"
                for ev in evidence[:3]:  # Limit to 3 pieces of evidence
                    ev_type = ev.get('type', 'evidence')
                    result += f"- {ev_type}: {str(ev)[:100]}\n"
            return result
        return ""

    def _format_knowledge_analysis_conversational(
        self,
        analysis: Dict[str, Any],
        register: str,
        hedging: str
    ) -> str:
        """Format general knowledge analysis in conversational style."""
        parts = []
        variables = analysis.get('variables', [])
        relationships = analysis.get('relationships', [])

        if variables:
            parts.append(f"I've identified {len(variables)} entities: {', '.join(variables[:5])}")
            if len(variables) > 5:
                parts.append(f" and {len(variables) - 5} more")
            parts.append(".\n\n")

        if relationships:
            parts.append("Here are the relationships I found:\n\n")
            for rel in relationships[:5]:
                if isinstance(rel, dict):
                    source = rel.get('source', '')
                    target = rel.get('target', '')
                    rel_type = rel.get('type', 'related')
                    category = rel.get('category', 'general')

                    # Format based on relationship type
                    if category == 'taxonomic':
                        parts.append(f"- {source} is a type of {target}\n")
                    elif category == 'meronymic':
                        if rel_type == 'part_of':
                            parts.append(f"- {source} is part of {target}\n")
                        elif rel_type == 'has_property':
                            parts.append(f"- {source} has {target}\n")
                        elif rel_type == 'contains':
                            parts.append(f"- {source} contains {target}\n")
                        else:
                            # Fall back to generic phrasing so unknown
                            # meronymic types are not silently dropped
                            parts.append(f"- {source} is related to {target}\n")
                    elif category == 'spatial':
                        parts.append(f"- {source} is located in {target}\n")
                    elif category == 'functional':
                        parts.append(f"- {source} is used for {target}\n")
                    elif category == 'definitional':
                        parts.append(f"- {source} is {target}\n")
                    else:
                        parts.append(f"- {source} is related to {target}\n")

            if len(relationships) > 5:
                parts.append(f"\n... and {len(relationships) - 5} more relationships.\n")

        insights = analysis.get('insights', [])
        if insights:
            parts.append("\n**Key Insights:**\n")
            for insight in insights[:3]:
                parts.append(f"- {insight}\n")

        return ''.join(parts)
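
    # Example (hypothetical relationship dicts): a 'taxonomic' rel
    # {'source': 'sparrow', 'target': 'bird', 'type': 'is_a', 'category': 'taxonomic'}
    # renders as "- sparrow is a type of bird"; a 'meronymic'/'part_of' rel
    # renders as "- wing is part of bird".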

    def _format_analysis_conversational(
        self,
        analysis: Dict[str, Any],
        register: str,
        hedging: str
    ) -> str:
        """Format analysis (causal or general knowledge) in conversational style."""
        variables = analysis.get('variables', [])
        relationships = analysis.get('relationships', [])
        insights = analysis.get('insights', [])

        parts = []

        # Determine if this is causal or general knowledge
        has_causal = any(
            isinstance(rel, dict) and rel.get('category') == 'causal'
            for rel in relationships
        )
        has_general = any(
            isinstance(rel, dict) and rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual']
            for rel in relationships
        )

        if variables:
            if has_general and not has_causal:
                var_text = ', '.join(variables[:5])
                if len(variables) > 5:
                    var_text += f", and {len(variables) - 5} more"
                parts.append(f"I've identified {len(variables)} entities: {var_text}.\n")
            else:
                var_text = ', '.join(variables[:5])
                if len(variables) > 5:
                    var_text += f", and {len(variables) - 5} more"
                parts.append(f"I've identified {len(variables)} key variables: {var_text}.\n")

        if relationships:
            if has_general and not has_causal:
                parts.append("Here are the relationships I found:\n\n")
                for rel in relationships[:5]:
                    if isinstance(rel, dict):
                        source = rel.get('source', '')
                        target = rel.get('target', '')
                        category = rel.get('category', 'general')
                        rel_type = rel.get('type', 'related')

                        if category == 'taxonomic':
                            parts.append(f"- {source} is a type of {target}\n")
                        elif category == 'meronymic':
                            if rel_type == 'part_of':
                                parts.append(f"- {source} is part of {target}\n")
                            elif rel_type == 'has_property':
                                parts.append(f"- {source} has {target}\n")
                            else:
                                parts.append(f"- {source} -> {target}\n")
                        elif category == 'spatial':
                            parts.append(f"- {source} is located in {target}\n")
                        elif category == 'functional':
                            parts.append(f"- {source} is used for {target}\n")
                        else:
                            parts.append(f"- {source} is related to {target}\n")

                if len(relationships) > 5:
                    parts.append(f"\n... and {len(relationships) - 5} more relationships.\n")
            else:
                parts.append(f"These variables are connected through {len(relationships)} causal relationships.\n")

                # Highlight strongest relationship
                if relationships:
                    strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)) if isinstance(x, dict) else 0)
                    if isinstance(strongest, dict):
                        source = strongest.get('source', '')
                        target = strongest.get('target', '')
                        strength = strongest.get('strength', 0)
                        confidence = strongest.get('confidence', 0.8)

                        connector = self.connectors.get('high_confidence' if confidence > 0.7 else 'medium_confidence', ['It appears'])[0]
                        parts.append(
                            f"{connector}, the strongest relationship is between '{source}' and '{target}' "
                            f"(strength: {strength:.2f}, confidence: {confidence:.1%}).\n"
                        )

        if insights:
            parts.append("\nKey insights:\n")
            for insight in insights[:3]:  # Limit to 3 insights
                parts.append(f"- {insight}\n")

        return ''.join(parts)
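
    # Example (hypothetical causal analysis): with relationships
    # [{'source': 'price', 'target': 'demand', 'category': 'causal',
    #   'strength': -0.4, 'confidence': 0.9}]
    # the causal branch reports the relationship count and then highlights
    # "the strongest relationship is between 'price' and 'demand'
    # (strength: -0.40, confidence: 90.0%)".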

    def _generate_question_answer(
        self,
        reasoning_result: Dict[str, Any],
        hedging: str
    ) -> str:
        """Generate direct answer to a question."""
        task = reasoning_result.get('task', '')
        analysis = reasoning_result.get('analysis', {})
        graph_answer = reasoning_result.get('graph_first_answer', {})

        # Try to extract answer from graph-first reasoning first
        if graph_answer and graph_answer.get('answer'):
            return f"**Answer:** {graph_answer['answer']}\n"

        # Fallback to analysis-based answer
        variables = analysis.get('variables', [])
        relationships = analysis.get('relationships', [])

        if 'what' in task.lower() or 'which' in task.lower():
            if variables:
                return f"**Answer:** The key variables involved are: {', '.join(variables[:3])}.\n"

        if 'how' in task.lower() or 'why' in task.lower():
            if relationships:
                # Guard against non-dict entries, mirroring _format_analysis_conversational
                strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)) if isinstance(x, dict) else 0)
                if isinstance(strongest, dict):
                    source = strongest.get('source', '')
                    target = strongest.get('target', '')
                    return f"**Answer:** {hedging.capitalize()}, '{source}' affects '{target}' through a causal relationship.\n"

        return ""

    def _format_counterfactuals_conversational(
        self,
        counterfactuals: List[Dict[str, Any]],
        hedging: str
    ) -> str:
        """Format counterfactuals conversationally."""
        if not counterfactuals:
            return ""

        parts = ["## Exploring Alternative Scenarios\n\n"]
        # Announce the number of scenarios actually shown (at most three)
        parts.append(f"Let me explore {min(len(counterfactuals), 3)} alternative scenarios:\n\n")

        for i, scenario in enumerate(counterfactuals[:3], 1):  # Limit to 3 scenarios
            name = scenario.get('name', f'Scenario {i}')
            interventions = scenario.get('interventions', {})
            outcomes = scenario.get('expected_outcomes', {})
            probability = scenario.get('probability', 0.0)

            parts.append(f"**{name}** ({probability:.1%} probability):\n")

            if interventions:
                parts.append("If we change:\n")
                for var, val in list(interventions.items())[:3]:
                    parts.append(f"- {var} to {val:.2f}\n")

            if outcomes:
                parts.append("Then we would expect:\n")
                for var, val in list(outcomes.items())[:3]:
                    parts.append(f"- {var}: {val:.2f}\n")

            parts.append("\n")

        return ''.join(parts)
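
    # Example (hypothetical scenario): a counterfactual
    # {'name': 'Lower price', 'interventions': {'price': 0.8},
    #  'expected_outcomes': {'demand': 1.15}, 'probability': 0.6}
    # renders as "**Lower price** (60.0% probability):" with
    # "If we change: - price to 0.80" and "Then we would expect: - demand: 1.15".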

    def _format_recommendations_conversational(
        self,
        recommendations: List[str],
        hedging: str
    ) -> str:
        """Format recommendations conversationally."""
        if not recommendations:
            return ""

        parts = ["## Recommendations\n\n"]
        parts.append("Based on my analysis, here's what I recommend:\n\n")

        for i, rec in enumerate(recommendations[:5], 1):  # Limit to 5 recommendations
            parts.append(f"{i}. {rec}\n")

        return ''.join(parts)

    def _format_statistical_conversational(
        self,
        statistical: Dict[str, Any],
        hedging: str
    ) -> str:
        """Format statistical results conversationally."""
        parts = ["## Statistical Analysis\n\n"]

        edge_strengths = statistical.get('edge_strengths', {})
        if edge_strengths:
            parts.append(f"The statistical analysis reveals {len(edge_strengths)} causal relationships with quantified strengths.\n")

            # Highlight strongest edges
            sorted_edges = sorted(edge_strengths.items(), key=lambda x: abs(x[1]), reverse=True)
            if sorted_edges:
                parts.append("The strongest relationships are:\n")
                for (source, target), strength in sorted_edges[:3]:
                    parts.append(f"- {source} -> {target}: {strength:.3f}\n")

        uncertainty = statistical.get('uncertainty', {})
        if uncertainty:
            parts.append(f"\nUncertainty analysis indicates {hedging} confidence in these relationships.\n")

        return ''.join(parts)


class HybridOrchestrator:
    """
    Orchestrates hybrid reasoning with all LLM-enhanced components.

    Integrates:
    - Reasoning tracking for chain-of-thought
    - Explanation generation
    - Self-verification
    - Consistency guarantees
    """

    def __init__(
        self,
        symbolic_reasoner: SymbolicReasoner,
        statistical_engine: StatisticalEngine,
        nlg: RuleBasedNLG,
        graph_first_reasoner: Optional[GraphFirstReasoner] = None,
        text_corrector: Optional[TextCorrector] = None,
        lexical_compiler: Optional[LexicalCompiler] = None,
        grammatical_compiler: Optional[GrammaticalCompiler] = None,
        pragmatic_compiler: Optional[PragmaticCompiler] = None,
        reasoning_tracker: Optional[ReasoningTracker] = None,
        explanation_builder: Optional[ExplanationBuilder] = None,
        transparency_layer: Optional[TransparencyLayer] = None,
        consistency_checker: Optional[ConsistencyChecker] = None,
        error_detector: Optional[ErrorDetector] = None,
        self_corrector: Optional[SelfCorrector] = None,
        consistency_engine: Optional[ConsistencyEngine] = None
    ):
        """
        Initialize hybrid orchestrator.

        Args:
            symbolic_reasoner: Symbolic reasoner instance
            statistical_engine: Statistical engine instance
            nlg: Natural language generator
            graph_first_reasoner: Optional graph-first reasoner
            text_corrector: Optional text corrector
            lexical_compiler: Optional lexical compiler
            grammatical_compiler: Optional grammatical compiler
            pragmatic_compiler: Optional pragmatic compiler
            reasoning_tracker: Optional reasoning tracker for chain-of-thought
            explanation_builder: Optional explanation builder
            transparency_layer: Optional transparency layer
            consistency_checker: Optional consistency checker
            error_detector: Optional error detector
            self_corrector: Optional self corrector
            consistency_engine: Optional consistency engine
        """
        self.symbolic_reasoner = symbolic_reasoner
        self.statistical_engine = statistical_engine
        self.nlg = nlg
        self.graph_first_reasoner = graph_first_reasoner
        self.text_corrector = text_corrector
        self.lexical_compiler = lexical_compiler
        self.grammatical_compiler = grammatical_compiler
        self.pragmatic_compiler = pragmatic_compiler
        self.reasoning_tracker = reasoning_tracker
        self.explanation_builder = explanation_builder
        self.transparency_layer = transparency_layer
        self.consistency_checker = consistency_checker
        self.error_detector = error_detector
        self.self_corrector = self_corrector
        self.consistency_engine = consistency_engine
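
    # Construction sketch (assumes the component classes named in the
    # signature above are importable from this package; wiring is hypothetical):
    #   orchestrator = HybridOrchestrator(
    #       symbolic_reasoner=SymbolicReasoner(),
    #       statistical_engine=StatisticalEngine(),
    #       nlg=RuleBasedNLG(),
    #       reasoning_tracker=ReasoningTracker(),
    #   )
    # All other collaborators default to None and are skipped by the pipeline.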

    def _parse_task_intent(self, task: str) -> Dict[str, Any]:
        """
        Parse task to understand user intent and extract query type.

        Args:
            task: Natural language task

        Returns:
            Dictionary with intent information
        """
        task_lower = task.lower()
        intent = {
            'type': 'analysis',  # default
            'question_type': None,
            'target_variables': [],
            'intervention_variables': [],
            'comparison_requested': False,
        }

        # Question type detection
        if any(word in task_lower for word in ['what', 'which', 'who']):
            intent['question_type'] = 'what'
        elif any(word in task_lower for word in ['how', 'why']):
            intent['question_type'] = 'how'
        elif any(word in task_lower for word in ['when', 'where']):
            intent['question_type'] = 'when_where'
        elif '?' in task:
            intent['question_type'] = 'general_question'

        # Intent type detection
        if any(word in task_lower for word in ['extract', 'identify', 'find', 'list']):
            intent['type'] = 'extraction'
        elif any(word in task_lower for word in ['analyze', 'examine', 'study']):
            intent['type'] = 'analysis'
        elif any(word in task_lower for word in ['predict', 'forecast', 'estimate']):
            intent['type'] = 'prediction'
        elif any(word in task_lower for word in ['compare', 'versus', 'vs', 'difference']):
            intent['type'] = 'comparison'
            intent['comparison_requested'] = True
        elif any(word in task_lower for word in ['what if', 'if', 'suppose', 'assume']):
            intent['type'] = 'counterfactual'
        elif any(word in task_lower for word in ['recommend', 'suggest', 'should', 'best']):
            intent['type'] = 'recommendation'
        elif any(word in task_lower for word in ['what is', 'what are', 'define', 'definition', 'meaning']):
            intent['type'] = 'definition'
        elif any(word in task_lower for word in ['who is', 'who are']):
            intent['type'] = 'person_query'
        elif any(word in task_lower for word in ['where is', 'where are', 'location']):
            intent['type'] = 'location_query'
        elif any(word in task_lower for word in ['when is', 'when was', 'when did', 'date', 'time']):
            intent['type'] = 'temporal_query'
        elif any(word in task_lower for word in ['explain', 'describe', 'tell me about']):
            intent['type'] = 'explanation'

        # Extract target variables (what user wants to know about)
        target_patterns = [
            r'(?:about|regarding|for|of)\s+(\w+(?:\s+\w+)?)',
            r'(?:affecting|impacting|influencing)\s+(\w+(?:\s+\w+)?)',
            r'(?:on|in)\s+(\w+(?:\s+\w+)?)',
        ]
        for pattern in target_patterns:
            matches = re.finditer(pattern, task_lower)
            for match in matches:
                var = self.symbolic_reasoner._normalize_variable_name(match.group(1))
                if var and var not in self.symbolic_reasoner.stop_words:
                    intent['target_variables'].append(var)

        # Extract intervention variables (what user wants to change)
        intervention_patterns = [
            r'(?:if|when|suppose)\s+(\w+(?:\s+\w+)?)\s+(?:changes?|increases?|decreases?)',
            r'(?:change|modify|adjust)\s+(\w+(?:\s+\w+)?)',
        ]
        for pattern in intervention_patterns:
            matches = re.finditer(pattern, task_lower)
            for match in matches:
                var = self.symbolic_reasoner._normalize_variable_name(match.group(1))
                if var and var not in self.symbolic_reasoner.stop_words:
                    intent['intervention_variables'].append(var)

        return intent
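
    # Example (hypothetical tasks): "Compare revenue versus cost" yields
    # {'type': 'comparison', 'comparison_requested': True, ...};
    # "What if price increases?" matches the counterfactual keywords and
    # also records 'price' as an intervention variable via the regex above.
    # Note that matching is by substring, so short keywords can fire inside
    # longer words.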

    def _parse_extracted_values(self, variables_with_values: Dict[str, str]) -> Dict[str, float]:
        """
        Parse extracted string values into float values.

        Handles:
        - "20000" -> 20000.0
        - "61%" -> 0.61
        - "61% buy, 39% sell" -> extracts main percentage (61% -> 0.61)

        Args:
            variables_with_values: Dictionary mapping variable names to string values

        Returns:
            Dictionary mapping variable names to float values
        """
        parsed = {}

        for var, value_str in variables_with_values.items():
            if not value_str:
                continue

            try:
                # Remove whitespace
                value_str = value_str.strip()

                # Handle percentages
                if '%' in value_str:
                    # Extract first percentage if multiple (e.g., "61% buy, 39% sell" -> 61%)
                    percent_match = re.search(r'(\d+[.,]?\d*)\s*%', value_str)
                    if percent_match:
                        percent_value = float(percent_match.group(1).replace(',', '.'))
                        # Convert percentage to decimal (61% -> 0.61)
                        parsed[var] = percent_value / 100.0
                    else:
                        # Try to extract any number before %
                        num_match = re.search(r'(\d+[.,]?\d*)', value_str)
                        if num_match:
                            parsed[var] = float(num_match.group(1).replace(',', '.')) / 100.0
                else:
                    # Regular number
                    # Remove any non-numeric characters except decimal point and comma
                    clean_value = re.sub(r'[^\d.,-]', '', value_str)
                    if clean_value:
                        # Handle comma as decimal separator (European format)
                        if ',' in clean_value and '.' not in clean_value:
                            clean_value = clean_value.replace(',', '.')
                        # Handle comma as thousands separator
                        elif ',' in clean_value and '.' in clean_value:
                            # Assume last comma/period is decimal separator
                            parts = clean_value.replace(',', ' ').replace('.', ' ').split()
                            if len(parts) > 1:
                                clean_value = '.'.join(parts)
                            else:
                                clean_value = clean_value.replace(',', '')

                        parsed[var] = float(clean_value)
            except (ValueError, AttributeError) as e:
                logger.debug(f"Failed to parse value '{value_str}' for variable '{var}': {e}")
                continue

        return parsed
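
    # Example (hypothetical inputs): {'budget': '20000', 'share': '61% buy, 39% sell',
    # 'rate': '20,5'} parses to {'budget': 20000.0, 'share': 0.61, 'rate': 20.5};
    # unparseable strings are logged at debug level and skipped.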

    def _detect_and_parse_json_scm(self, task: str) -> Optional[Dict[str, Any]]:
        """
        Detect and parse JSON SCM (Structural Causal Model) from task.

        Args:
            task: Task string that may contain JSON SCM

        Returns:
            Parsed SCM dictionary or None if not detected
        """
        # Try to find JSON in the task, handling cases where there is text
        # before/after the JSON. The non-greedy pattern only captures
        # non-nested objects, so the greedy outermost span is also tried to
        # cover nested SCM structures.
        json_matches = list(re.finditer(r'\{[\s\S]*?\}', task))
        outermost = re.search(r'\{[\s\S]*\}', task)
        if outermost and outermost.group(0) not in [m.group(0) for m in json_matches]:
            json_matches.append(outermost)
        if not json_matches:
            return None

        # Try the largest match first (most likely to be complete JSON)
        json_matches_sorted = sorted(json_matches, key=lambda m: len(m.group(0)), reverse=True)

        for json_match in json_matches_sorted:
            json_str = json_match.group(0)

            try:
                scm_data = json.loads(json_str)

                # Validate it's an SCM structure
                if not isinstance(scm_data, dict):
                    continue

                # Check for SCM indicators
                has_variables = 'variables' in scm_data
                has_equations = 'equations' in scm_data
                has_roles = any(
                    isinstance(v, dict) and 'role' in v
                    for v in scm_data.get('variables', [])
                )

                if has_variables and (has_equations or has_roles):
                    logger.info(f"Detected SCM structure with {len(scm_data.get('variables', []))} variables")
                    return scm_data

            except json.JSONDecodeError:
                # Try to extract and fix JSON
                try:
                    # Remove comments and fix common issues
                    json_str_clean = re.sub(r'//.*?$', '', json_str, flags=re.MULTILINE)
                    json_str_clean = re.sub(r'/\*.*?\*/', '', json_str_clean, flags=re.DOTALL)
                    scm_data = json.loads(json_str_clean)

                    if isinstance(scm_data, dict) and 'variables' in scm_data:
                        logger.info(f"Detected SCM structure (after cleaning) with {len(scm_data.get('variables', []))} variables")
                        return scm_data
                except json.JSONDecodeError:
                    continue

        return None
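
    # Example (hypothetical minimal SCM that passes the checks above):
    #   {"variables": [{"id": "S", "role": "state"},
    #                  {"id": "U", "role": "intervention"}],
    #    "equations": [{"id": "e1", "defines": "S[t+1]",
    #                   "parents": ["S[t]", "U[t]"], "expr": "S + U"}]}
    # It has 'variables' plus 'equations', so it is returned as SCM data.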

    def _parse_scm_to_graph(self, scm_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse structured SCM JSON into causal graph.

        Args:
            scm_data: Parsed SCM dictionary

        Returns:
            Dictionary with extracted variables, edges, and metadata
        """
        variables = []
        edges = []
        relationships = []
        variables_with_values = {}

        # Extract variables
        var_list = scm_data.get('variables', [])
        for var_def in var_list:
            if isinstance(var_def, dict):
                var_id = var_def.get('id', '')
                var_role = var_def.get('role', '')
                if var_id:
                    variables.append(var_id)
                    # Store role information
                    if var_role:
                        variables_with_values[f"{var_id}_role"] = var_role

        # Extract relationships from equations
        equations = scm_data.get('equations', [])
        for eq in equations:
            if isinstance(eq, dict):
                defines = eq.get('defines', '')
                parents = eq.get('parents', [])

                # Extract variable name from defines (e.g., "S[t+1]" -> "S")
                defines_var = re.sub(r'\[.*?\]', '', defines).strip()

                # Create edges from parents to defined variable
                for parent in parents:
                    # Extract variable name from parent (e.g., "S[t]" -> "S", "alpha" -> "alpha")
                    parent_var = re.sub(r'\[.*?\]', '', str(parent)).strip()

                    if defines_var and parent_var and defines_var != parent_var:
                        # Only create edges between state variables (not parameters/constants)
                        parent_role = None
                        defines_role = None

                        # Find roles
                        for var_def in var_list:
                            if isinstance(var_def, dict):
                                if var_def.get('id') == parent_var:
                                    parent_role = var_def.get('role', '')
                                if var_def.get('id') == defines_var:
                                    defines_role = var_def.get('role', '')

                        # Only create causal edges (not parameter/constant relationships)
                        # State variables can cause other state variables
                        # Interventions can affect state variables
                        # Exogenous can affect state variables
                        if (defines_role in ['state', 'derived'] and
                                parent_role in ['state', 'intervention', 'exogenous']):
                            edges.append((parent_var, defines_var))
                            relationships.append({
                                'source': parent_var,
                                'target': defines_var,
                                'type': 'causal',
                                'confidence': 1.0,  # High confidence for explicit SCM
                                'negated': False,
                                'raw_source': parent_var,
                                'raw_target': defines_var,
                                'from_equation': eq.get('id', ''),
                                'equation': eq.get('expr', '')
                            })

        # Extract initial state values
        given = scm_data.get('given', {})
        initial_state = given.get('initial_state', {})
        for var, value in initial_state.items():
            var_name = re.sub(r'\[.*?\]', '', var).strip()
            if var_name in variables:
                variables_with_values[var_name] = str(value)

        # Extract parameter values
        parameters = given.get('parameters', {})
        for param, value in parameters.items():
            if param in variables:
                variables_with_values[param] = str(value)

        return {
            'variables': variables,
            'edges': edges,
            'relationships': relationships,
            'metadata': {
                'variables_with_values': variables_with_values,
                'scm_structure': True,
                'task_id': scm_data.get('task_id', ''),
                'equations_count': len(equations),
                'variables_count': len(variables)
            }
        }
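
    # Example (continuing the hypothetical SCM above): "S[t+1]" and "S[t]"
    # both normalize to "S", so that self-loop is skipped by the
    # defines_var != parent_var check; "U[t]" -> "S" survives because U has
    # role 'intervention' and S has role 'state', giving edges == [('U', 'S')].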
|
|
3018
|
+
|
|
3019
|
+
def reason_hybrid(
|
|
3020
|
+
self,
|
|
3021
|
+
task: str,
|
|
3022
|
+
data: Optional[Any] = None,
|
|
3023
|
+
context: Optional[ConversationContext] = None
|
|
3024
|
+
) -> Dict[str, Any]:
|
|
3025
|
+
"""
|
|
3026
|
+
Execute graph-first hybrid reasoning pipeline with reasoning tracking.
|
|
3027
|
+
|
|
3028
|
+
Pipeline:
|
|
3029
|
+
1. JSON/SCM detection and parsing (if structured input)
|
|
3030
|
+
2. Text correction (non-destructive)
|
|
3031
|
+
3. Language compilation (lexical -> grammatical -> pragmatic)
|
|
3032
|
+
4. Symbolic extraction (to graph with provenance)
|
|
3033
|
+
5. Graph-first reasoning (answer from graph state ONLY)
|
|
3034
|
+
6. Natural language generation (from graph state)
|
|
3035
|
+
7. Self-verification and error correction
|
|
3036
|
+
8. Explanation generation
|
|
3037
|
+
|
|
3038
|
+
Args:
|
|
3039
|
+
task: Natural language task description or JSON SCM
|
|
3040
|
+
data: Optional pandas DataFrame for statistical inference
|
|
3041
|
+
context: Optional conversation context
|
|
3042
|
+
|
|
3043
|
+
Returns:
|
|
3044
|
+
Dictionary with reasoning results derived from graph state
|
|
3045
|
+
"""
|
|
3046
|
+
# Create reasoning chain if tracking enabled
|
|
3047
|
+
if self.reasoning_tracker:
|
|
3048
|
+
self.reasoning_tracker.create_chain()
|
|
3049
|
+
|
|
3050
|
+
result = {
|
|
3051
|
+
'task': task,
|
|
3052
|
+
'intent': {},
|
|
3053
|
+
'analysis': {},
|
|
3054
|
+
'counterfactuals': [],
|
|
3055
|
+
'statistical': {},
|
|
3056
|
+
'graph_structure': '',
|
|
3057
|
+
'recommendations': [],
|
|
3058
|
+
'graph_first_answer': None,
|
|
3059
|
+
'scm_parsed': False,
|
|
3060
|
+
'reasoning_chain': None,
|
|
3061
|
+
'explanation': None
|
|
3062
|
+
}
|
|
3063
|
+
|
|
3064
|
+
# Track reasoning step: SCM detection
|
|
3065
|
+
if self.reasoning_tracker:
|
|
3066
|
+
self.reasoning_tracker.add_step(
|
|
3067
|
+
step_type=StepType.EXTRACTION,
|
|
3068
|
+
operation="detect_json_scm",
|
|
3069
|
+
input_state={'task': task},
|
|
3070
|
+
output_state={},
|
|
3071
|
+
conclusion="SCM detection"
|
|
3072
|
+
)
|
|
3073
|
+
|
|
3074
|
+
# Step 0: Detect and parse JSON SCM if present
|
|
3075
|
+
scm_data = self._detect_and_parse_json_scm(task)
|
|
3076
|
+
if scm_data:
|
|
3077
|
+
logger.info("Detected structured JSON SCM - parsing directly")
|
|
3078
|
+
result['scm_parsed'] = True
|
|
3079
|
+
result['scm_data'] = scm_data
|
|
3080
|
+
|
|
3081
|
+
# Track reasoning step: SCM parsing
|
|
3082
|
+
if self.reasoning_tracker:
|
|
3083
|
+
self.reasoning_tracker.add_step(
|
|
3084
|
+
step_type=StepType.EXTRACTION,
|
|
3085
|
+
operation="parse_scm",
|
|
3086
|
+
input_state={'scm_data': scm_data},
|
|
3087
|
+
output_state={},
|
|
3088
|
+
conclusion="SCM parsed"
|
|
3089
|
+
)
|
|
3090
|
+
|
|
3091
|
+
# Parse SCM to graph structure
|
|
3092
|
+
scm_extraction = self._parse_scm_to_graph(scm_data)
|
|
3093
|
+
result['scm_extraction'] = scm_extraction
|
|
3094
|
+
|
|
3095
|
+
# Use SCM extraction instead of natural language extraction
|
|
3096
|
+
variables = scm_extraction.get('variables', [])
|
|
3097
|
+
relationships_extracted = scm_extraction.get('relationships', [])
|
|
3098
|
+
edges = scm_extraction.get('edges', [])
|
|
3099
|
+
|
|
3100
|
+
# All SCM variables are valid (they're explicitly defined)
|
|
3101
|
+
valid_variables = set(variables)
|
|
3102
|
+
|
|
3103
|
+
# Add edges directly from SCM (high confidence - explicit structure)
|
|
3104
|
+
edges_added = 0
|
|
3105
|
+
for rel in relationships_extracted:
|
|
3106
|
+
source = rel['source']
|
|
3107
|
+
target = rel['target']
|
|
3108
|
+
|
|
3109
|
+
self.symbolic_reasoner.graph_manager.add_relationship(
|
|
3110
|
+
source=source,
|
|
3111
|
+
target=target,
|
|
3112
|
+
strength=1.0,
|
|
3113
|
+
confidence=rel.get('confidence', 1.0),
|
|
3114
|
+
from_scm=True,
|
|
3115
|
+
equation_id=rel.get('from_equation', ''),
|
|
3116
|
+
equation_expr=rel.get('equation', '')
|
|
3117
|
+
)
|
|
3118
|
+
edges_added += 1
|
|
3119
|
+
|
|
3120
|
+
result['edges_added'] = edges_added
|
|
3121
|
+
result['scm_parsing_success'] = True
|
|
3122
|
+
|
|
3123
|
+
# Track reasoning step: Graph construction from SCM
|
|
3124
|
+
if self.reasoning_tracker:
|
|
3125
|
+
self.reasoning_tracker.add_step(
|
|
3126
|
+
step_type=StepType.TRANSFORMATION,
|
|
3127
|
+
operation="build_graph_from_scm",
|
|
3128
|
+
input_state={'variables': variables, 'relationships': relationships_extracted},
|
|
3129
|
+
output_state={'edges_added': edges_added},
|
|
3130
|
+
conclusion=f"Graph built with {edges_added} edges"
|
|
3131
|
+
)
|
|
3132
|
+
|
|
3133
|
+
# Skip natural language extraction for SCM
|
|
3134
|
+
# Continue to graph-first reasoning and analysis
|
|
3135
|
+
corrected_task = task # Keep original for display
|
|
3136
|
+
else:
|
|
3137
|
+
# Step 0: Text correction (non-destructive) - only for natural language
|
|
3138
|
+
corrected_task = task
|
|
3139
|
+
corrected_tokens = None
|
|
3140
|
+
if self.text_corrector:
|
|
3141
|
+
correction_result = self.text_corrector.correct_text(task)
|
|
3142
|
+
corrected_task = correction_result['corrected_text']
|
|
3143
|
+
corrected_tokens = correction_result['corrected_tokens']
|
|
3144
|
+
result['correction'] = {
|
|
3145
|
+
'original': task,
|
|
3146
|
+
'corrected': corrected_task,
|
|
3147
|
+
'confidence': correction_result['confidence']
|
|
3148
|
+
}
|
|
3149
|
+
|
|
3150
|
+
# Step 0.5: Language compilation (lexical -> grammatical -> pragmatic)
|
|
3151
|
+
# Skip for SCM (already parsed)
|
|
3152
|
+
if not result.get('scm_parsed'):
|
|
3153
|
+
if self.lexical_compiler and self.grammatical_compiler:
|
|
3154
|
+
# Normalize terms using lexical compiler
|
|
3155
|
+
if corrected_tokens:
|
|
3156
|
+
for token in corrected_tokens:
|
|
3157
|
+
normalized = self.lexical_compiler.normalize_term(token.normalized_form)
|
|
3158
|
+
if normalized != token.normalized_form:
|
|
3159
|
+
token.normalized_form = normalized
|
|
3160
|
+
|
|
3161
|
+
# Optional: Validate word using dictionary (helps filter invalid terms)
|
|
3162
|
+
# This is a soft check - we don't reject words that aren't in dictionary
|
|
3163
|
+
# as domain-specific terms may not be in standard dictionaries
|
|
3164
|
+
if self.lexical_compiler.enable_dictionary:
|
|
3165
|
+
word_info = self.lexical_compiler.get_word_info(token.normalized_form)
|
|
3166
|
+
if word_info:
|
|
3167
|
+
# Store word info for later use (part of speech, synonyms, etc.)
|
|
3168
|
+
token.metadata = word_info
|
|
3169
|
+
|
|
3170
|
+
# Parse grammatical structure
|
|
3171
|
+
if self.grammatical_compiler:
|
|
3172
|
+
parse_tree = self.grammatical_compiler.parse_dependencies(corrected_task)
|
|
3173
|
+
causal_structure = self.grammatical_compiler.extract_causal_structure(parse_tree)
|
|
3174
|
+
if causal_structure:
|
|
3175
|
+
result['causal_structure'] = {
|
|
3176
|
+
'cause': causal_structure.cause,
|
|
3177
|
+
'effect': causal_structure.effect,
|
|
3178
|
+
'relation_type': causal_structure.relation_type,
|
|
3179
|
+
'confidence': causal_structure.confidence
|
|
3180
|
+
}
|
|
3181
|
+
|
|
3182
|
+
# Step 0.6: Parse task intent
|
|
3183
|
+
intent = self._parse_task_intent(corrected_task)
|
|
3184
|
+
result['intent'] = intent
|
|
3185
|
+
|
|
3186
|
+
# Step 1: Advanced symbolic extraction (use corrected task) OR use SCM extraction
|
|
3187
|
+
if result.get('scm_parsed'):
|
|
3188
|
+
# Use SCM extraction (already done above and stored in result)
|
|
3189
|
+
extraction = result.get('scm_extraction', {})
|
|
3190
|
+
variables = extraction.get('variables', [])
|
|
3191
|
+
relationships_extracted = extraction.get('relationships', [])
|
|
3192
|
+
edges = extraction.get('edges', [])
|
|
3193
|
+
valid_variables = set(variables) # All SCM variables are valid
|
|
3194
|
+
logger.info(f"Using SCM extraction: {len(variables)} variables, {len(relationships_extracted)} relationships")
|
|
3195
|
+
else:
|
|
3196
|
+
# Natural language extraction with few-shot learning (if enabled)
|
|
3197
|
+
# Check if we have learned patterns to use
|
|
3198
|
+
if hasattr(self.symbolic_reasoner, 'adaptive_extractor') and self.symbolic_reasoner.adaptive_extractor:
|
|
3199
|
+
# Try adaptive extraction first
|
|
3200
|
+
try:
|
|
3201
|
+
adaptive_result = self.symbolic_reasoner.adaptive_extractor.adapt_extraction(corrected_task)
|
|
3202
|
+
if adaptive_result.get('variables'):
|
|
3203
|
+
# Use adaptive extraction result
|
|
3204
|
+
extraction = {
|
|
3205
|
+
'variables': adaptive_result.get('variables', []),
|
|
3206
|
+
'edges': adaptive_result.get('edges', []),
|
|
3207
|
+
'relationships': []
|
|
3208
|
+
}
|
|
3209
|
+
# Convert edges to relationships format
|
|
3210
|
+
for source, target in adaptive_result.get('edges', []):
|
|
3211
|
+
extraction['relationships'].append({
|
|
3212
|
+
'source': source,
|
|
3213
|
+
'target': target,
|
|
3214
|
+
'type': 'causal',
|
|
3215
|
+
'confidence': adaptive_result.get('confidence', 0.8)
|
|
3216
|
+
})
|
|
3217
|
+
else:
|
|
3218
|
+
# Fall back to standard extraction
|
|
3219
|
+
extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
|
|
3220
|
+
except Exception as e:
|
|
3221
|
+
logger.warning(f"Adaptive extraction failed: {e}, falling back to standard extraction")
|
|
3222
|
+
extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
|
|
3223
|
+
else:
|
|
3224
|
+
# Standard extraction
|
|
3225
|
+
extraction = self.symbolic_reasoner.extract_variables_from_task(corrected_task)
|
|
3226
|
+
|
|
3227
|
+
variables = extraction.get('variables', [])
|
|
3228
|
+
relationships_extracted = extraction.get('relationships', [])
|
|
3229
|
+
edges = extraction.get('edges', [])
|
|
3230
|
+
|
|
3231
|
+
# Track reasoning step: Variable extraction
|
|
3232
|
+
if self.reasoning_tracker:
|
|
3233
|
+
self.reasoning_tracker.add_step(
|
|
3234
|
+
step_type=StepType.EXTRACTION,
|
|
3235
|
+
operation="extract_variables",
|
|
3236
|
+
input_state={'task': corrected_task},
|
|
3237
|
+
output_state={'variables': variables, 'relationships': len(relationships_extracted)},
|
|
3238
|
+
conclusion=f"Extracted {len(variables)} variables"
|
|
3239
|
+
)
|
|
3240
|
+
|
|
3241
|
+
# valid_variables will be set below after filtering
|
|
3242
|
+
|
|
3243
|
+
# CRITICAL: Filter variables before graph construction - only use valid variables
|
|
3244
|
+
# This prevents treating action verbs and epistemic terms as causal variables
|
|
3245
|
+
# BUT: Skip filtering for SCM (all SCM variables are explicitly defined and valid)
|
|
3246
|
+
if not result.get('scm_parsed'):
|
|
3247
|
+
valid_variables = {v for v in variables if self.symbolic_reasoner._clean_variable(v)}
|
|
3248
|
+
# For general knowledge tasks, be more permissive with filtering
|
|
3249
|
+
current_graph_type = self.symbolic_reasoner.graph_manager.graph_type
|
|
3250
|
+
if current_graph_type in ['knowledge', 'mixed']:
|
|
3251
|
+
# For general knowledge: Only filter action verbs/epistemic terms, keep entities
|
|
3252
|
+
filtered = set()
|
|
3253
|
+
for var in valid_variables:
|
|
3254
|
+
if not self.symbolic_reasoner._is_action_verb(var) and not self.symbolic_reasoner._is_epistemic_term(var):
|
|
3255
|
+
cleaned = self.symbolic_reasoner._clean_variable(var)
|
|
3256
|
+
if cleaned and cleaned not in self.symbolic_reasoner.stop_words:
|
|
3257
|
+
filtered.add(cleaned)
|
|
3258
|
+
valid_variables = filtered
|
|
3259
|
+
else:
|
|
3260
|
+
# For causal tasks: Strict filtering
|
|
3261
|
+
valid_variables = self.symbolic_reasoner._filter_valid_variables(valid_variables)
|
|
3262
|
+
|
|
3263
|
+
# Enhanced epistemic validation with GroundingValidator
|
|
3264
|
+
# Check epistemic grounding: ∀v ∈ V, ∃ path from observables O to v
|
|
3265
|
+
observable_variables = valid_variables.copy() # For now, all valid variables are considered observable
|
|
3266
|
+
if self.consistency_checker:
|
|
3267
|
+
graph_state_temp = {
|
|
3268
|
+
'nodes': list(valid_variables),
|
|
3269
|
+
'edges': edges
|
|
3270
|
+
}
|
|
3271
|
+
all_grounded, ungrounded = self.consistency_checker.verify_epistemic_grounding(
|
|
3272
|
+
graph_state_temp,
|
|
3273
|
+
observable_variables
|
|
3274
|
+
)
|
|
3275
|
+
if not all_grounded and ungrounded:
|
|
3276
|
+
logger.warning(f"Ungrounded variables detected: {ungrounded}")
|
|
3277
|
+
|
|
3278
|
+
# Epistemic validation: Check if we have sufficient grounding
|
|
3279
|
+
# If task is too vague (no explicit causal relationships, only action verbs/epistemic terms),
|
|
3280
|
+
# we should warn or reject
|
|
3281
|
+
has_explicit_causal_structure = len(relationships_extracted) > 0 or len(edges) > 0
|
|
3282
|
+
has_valid_state_variables = len(valid_variables) > 0
|
|
3283
|
+
|
|
3284
|
+
# Check if variables are mostly action verbs/epistemic terms (bad sign)
|
|
3285
|
+
action_verb_count = sum(1 for v in variables if self.symbolic_reasoner._is_action_verb(v))
|
|
3286
|
+
epistemic_term_count = sum(1 for v in variables if self.symbolic_reasoner._is_epistemic_term(v))
|
|
3287
|
+
total_vars = len(variables)
|
|
3288
|
+
|
|
3289
|
+
if total_vars > 0:
|
|
3290
|
+
action_epistemic_ratio = (action_verb_count + epistemic_term_count) / total_vars
|
|
3291
|
+
if action_epistemic_ratio > 0.5 and not has_explicit_causal_structure:
|
|
3292
|
+
result['epistemic_warning'] = (
|
|
3293
|
+
f"Warning: Task appears to contain mostly action verbs or epistemic terms "
|
|
3294
|
+
f"({action_verb_count + epistemic_term_count}/{total_vars} variables), "
|
|
3295
|
+
f"not causal state variables. Causal relationships cannot be inferred from "
|
|
3296
|
+
f"intent statements alone. Please provide explicit state variables and "
|
|
3297
|
+
f"causal relationships, or an existing SCM with logged policy decisions."
|
|
3298
|
+
)
|
|
3299
|
+
logger.warning(result['epistemic_warning'])
|
|
3300
|
+
|
|
3301
|
+
# Determine task type (causal vs general knowledge)
|
|
3302
|
+
task_intent = self._parse_task_intent(task)
|
|
3303
|
+
|
|
3304
|
+
# Check relationships_extracted for categories (if available)
|
|
3305
|
+
relationships_categories = [rel.get('category', 'causal') for rel in relationships_extracted if isinstance(rel, dict)]
|
|
3306
|
+
has_causal_rels = any(cat == 'causal' for cat in relationships_categories)
|
|
3307
|
+
has_general_rels = any(cat in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual'] for cat in relationships_categories)
|
|
3308
|
+
|
|
3309
|
+
graph_type = self.symbolic_reasoner.graph_manager.graph_type
|
|
3310
|
+
is_causal_task = (
|
|
3311
|
+
task_intent.get('type') in ['analysis', 'prediction', 'counterfactual', 'comparison'] or
|
|
3312
|
+
has_causal_rels or
|
|
3313
|
+
graph_type == 'causal'
|
|
3314
|
+
)
|
|
3315
|
+
is_general_knowledge_task = (
|
|
3316
|
+
task_intent.get('type') in ['definition', 'person_query', 'location_query', 'temporal_query', 'explanation'] or
|
|
3317
|
+
has_general_rels or
|
|
3318
|
+
graph_type in ['knowledge', 'mixed']
|
|
3319
|
+
)
|
|
3320
|
+
|
|
3321
|
+
# Epistemic validation only applies to causal tasks
|
|
3322
|
+
if is_causal_task and not is_general_knowledge_task:
|
|
3323
|
+
# If no valid state variables after filtering, task is epistemically underspecified
|
|
3324
|
+
if not has_valid_state_variables and not has_explicit_causal_structure:
|
|
3325
|
+
result['epistemic_error'] = (
|
|
3326
|
+
"Task is epistemically underspecified. No valid causal state variables were "
|
|
3327
|
+
"extracted. A CRCA agent requires:\n"
|
|
3328
|
+
"- Explicit state variables (not action verbs like 'identify' or epistemic terms like 'policy')\n"
|
|
3329
|
+
"- Transition relations (causal relationships between variables)\n"
|
|
3330
|
+
"- Intervention hooks (variables that can be manipulated)\n"
|
|
3331
|
+
"- Optionally: An existing SCM, logged policy decisions, defined collapse predicates\n\n"
|
|
3332
|
+
"Please provide a task with explicit causal structure, not just intent statements."
|
|
3333
|
+
)
|
|
3334
|
+
logger.error(result['epistemic_error'])
|
|
3335
|
+
# Don't proceed with graph construction if we have no valid variables
|
|
3336
|
+
result['analysis'] = {
|
|
3337
|
+
'variables': [],
|
|
3338
|
+
'relationships': [],
|
|
3339
|
+
'graph_structure': 'No valid causal structure extracted',
|
|
3340
|
+
'insights': [result['epistemic_error']],
|
|
3341
|
+
'epistemic_underspecified': True
|
|
3342
|
+
}
|
|
3343
|
+
return result
|
|
3344
|
+
|
|
3345
|
+
```python
        # Add edges to graph with confidence scores
        # For SCM, all relationships are already added above, so skip this section
        if not result.get('scm_parsed'):
            edges_added = 0
            for rel in relationships_extracted:
                if not rel.get('negated', False):  # Skip negated relationships
                    # For general knowledge, be more permissive with cleaning:
                    # don't filter out entities just because they're short or not in keywords
                    rel_category = rel.get('category', 'causal')
                    if rel_category != 'causal':
                        # For general knowledge: just normalize, don't filter
                        source_clean = self.symbolic_reasoner._normalize_variable_name(rel['source'])
                        target_clean = self.symbolic_reasoner._normalize_variable_name(rel['target'])
                        # Remove stop words but keep the variable
                        if source_clean in self.symbolic_reasoner.stop_words:
                            source_clean = None
                        if target_clean in self.symbolic_reasoner.stop_words:
                            target_clean = None
                    else:
                        # For causal: use strict cleaning
                        source_clean = self.symbolic_reasoner._clean_variable(rel['source'])
                        target_clean = self.symbolic_reasoner._clean_variable(rel['target'])

                    if not source_clean or not target_clean:
                        continue

                    rel_type = rel.get('type', 'causal')

                    # For causal relationships: reject action verbs/epistemic terms (strict validation)
                    if rel_category == 'causal':
                        if self.symbolic_reasoner._is_action_verb(source_clean):
                            logger.warning(f"Rejected causal relationship: source '{source_clean}' is an action verb")
                            continue
                        if self.symbolic_reasoner._is_action_verb(target_clean):
                            logger.warning(f"Rejected causal relationship: target '{target_clean}' is an action verb")
                            continue
                        if self.symbolic_reasoner._is_epistemic_term(source_clean):
                            logger.warning(f"Rejected causal relationship: source '{source_clean}' is an epistemic term")
                            continue
                        if self.symbolic_reasoner._is_epistemic_term(target_clean):
                            logger.warning(f"Rejected causal relationship: target '{target_clean}' is an epistemic term")
                            continue

                        # Only add if both are valid state variables for causal relationships
                        if (source_clean in valid_variables and target_clean in valid_variables):
                            # Validate causal relationship using do-calculus
                            graph_state_temp = {
                                'nodes': list(valid_variables),
                                'edges': [(s, t) for s, t in self.symbolic_reasoner.graph_manager.get_edges()]
                            }
                            is_valid_causal, causal_error = self.symbolic_reasoner.validate_causal_relationship(
                                source_clean,
                                target_clean,
                                graph_state_temp
                            )

                            if is_valid_causal:
                                self.symbolic_reasoner.graph_manager.add_relationship(
                                    source=source_clean,
                                    target=target_clean,
                                    strength=1.0,
                                    relation_type=rel_type,
                                    confidence=rel.get('confidence', 0.8),
                                    category=rel_category
                                )
                                edges_added += 1
                            else:
                                logger.debug(f"Skipping invalid causal relationship {source_clean} -> {target_clean}: {causal_error}")

                    # For general knowledge relationships: more permissive (allow entities, concepts, etc.)
                    else:
                        # Add to valid_variables if not already there (general knowledge can include new entities)
                        if source_clean not in valid_variables:
                            valid_variables.add(source_clean)
                        if target_clean not in valid_variables:
                            valid_variables.add(target_clean)

                        # Add relationship (no strict validation for general knowledge)
                        self.symbolic_reasoner.graph_manager.add_relationship(
                            source=source_clean,
                            target=target_clean,
                            strength=1.0,
                            relation_type=rel_type,
                            confidence=rel.get('confidence', 0.8),
                            category=rel_category
                        )
                        edges_added += 1
        else:
            # For SCM, edges were already added above
            edges_added = result.get('edges_added', 0)

        # Add direct edges
        # Skip for SCM (already added)
        if not result.get('scm_parsed'):
            for source, target in edges:
                source_clean = self.symbolic_reasoner._clean_variable(source)
                target_clean = self.symbolic_reasoner._clean_variable(target)

                if not source_clean or not target_clean:
                    continue

                # Determine relationship category from context:
                # if the graph type is causal, apply strict filtering;
                # if the graph type is knowledge/mixed, be more permissive
                graph_type = self.symbolic_reasoner.graph_manager.graph_type
                if graph_type == 'causal':
                    # For causal graphs: reject action verbs/epistemic terms
                    if (self.symbolic_reasoner._is_action_verb(source_clean) or
                            self.symbolic_reasoner._is_epistemic_term(source_clean) or
                            self.symbolic_reasoner._is_action_verb(target_clean) or
                            self.symbolic_reasoner._is_epistemic_term(target_clean)):
                        continue

                    # Only add if both are valid state variables
                    if (source_clean in valid_variables and target_clean in valid_variables):
                        self.symbolic_reasoner.graph_manager.add_relationship(
                            source=source_clean,
                            target=target_clean,
                            strength=1.0,
                            relation_type='causal',
                            confidence=0.8,
                            category='causal'
                        )
                        edges_added += 1
                else:
                    # For knowledge/mixed graphs: more permissive
                    if source_clean not in valid_variables:
                        valid_variables.add(source_clean)
                    if target_clean not in valid_variables:
                        valid_variables.add(target_clean)

                    self.symbolic_reasoner.graph_manager.add_relationship(
                        source=source_clean,
                        target=target_clean,
                        strength=1.0,
                        relation_type='related',
                        confidence=0.8,
                        category='general'
                    )
                    edges_added += 1

        # Track whether we actually added any edges
        result['edges_added'] = edges_added
        if edges_added == 0 and len(relationships_extracted) > 0:
            result['epistemic_warning'] = (
                "No valid causal relationships were added to the graph. All extracted relationships "
                "involved action verbs or epistemic terms rather than causal state variables. "
                "Please provide explicit state variables and causal relationships."
            )
```
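Every edge that survives the filtering above is stored with per-edge metadata (`strength`, `relation_type`, `confidence`, `category`) that later steps read back when building the analysis. A minimal sketch of seeding a graph through the same `add_relationship` keyword signature used above (the variable names are hypothetical):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
agent.graph_manager.add_relationship(
    source="ad_spend",        # hypothetical variable names
    target="sales",
    strength=1.0,
    relation_type="causal",
    confidence=0.9,
    category="causal",
)
print(agent.graph_manager.get_edges())  # expected: [('ad_spend', 'sales')]
```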
```python
        # Infer additional structure if needed (with context) - only for valid variables
        # Skip for SCM (structure is explicit)
        if not result.get('scm_parsed') and not edges and valid_variables:
            inferred_edges = self.symbolic_reasoner.infer_causal_structure(list(valid_variables), context=task)
            for source, target in inferred_edges:
                source_clean = self.symbolic_reasoner._clean_variable(source)
                target_clean = self.symbolic_reasoner._clean_variable(target)
                # Only add if both are valid
                if (source_clean and target_clean and
                        source_clean in valid_variables and
                        target_clean in valid_variables):
                    self.symbolic_reasoner.graph_manager.add_relationship(
                        source=source_clean,
                        target=target_clean,
                        strength=0.5,
                        confidence=0.5
                    )

        # Validate graph with consistency checker
        if self.consistency_checker:
            graph_state = {
                'nodes': list(valid_variables),
                'edges': [(s, t) for s, t in self.symbolic_reasoner.graph_manager.get_edges()
                          if s in valid_variables and t in valid_variables]
            }
            is_consistent, consistency_error = self.consistency_checker.verify_consistency(graph_state)
            if not is_consistent:
                logger.warning(f"Graph consistency check failed: {consistency_error}")
                # Try to correct if self_corrector is available
                if self.self_corrector:
                    errors = [{'type': 'inconsistency', 'message': consistency_error, 'graph': graph_state}]
                    corrections = self.self_corrector.correct_errors(errors, graph_state)
                    if corrections:
                        logger.info(f"Applied {len(corrections)} corrections")

        # Validate graph
        is_valid, error = self.symbolic_reasoner.validate_causal_graph()
        if not is_valid:
            logger.warning(f"Graph validation failed: {error}")

        # Track reasoning step: graph validation
        if self.reasoning_tracker:
            self.reasoning_tracker.add_step(
                step_type=StepType.VALIDATION,
                operation="validate_graph",
                input_state={'graph': graph_state if self.consistency_checker else {}},
                output_state={'is_valid': is_valid, 'error': error},
                conclusion="Graph validated" if is_valid else f"Graph validation failed: {error}"
            )

        # Step 2: Statistical fitting (if data available)
        if data is not None and PANDAS_AVAILABLE:
            try:
                # Only use valid variables for statistical fitting
                graph_nodes = self.symbolic_reasoner.graph_manager.get_nodes()
                all_variables = [v for v in graph_nodes if v in valid_variables]
                if all_variables:
                    self.statistical_engine.fit_from_dataframe(
                        df=data,
                        variables=all_variables,
                        window=min(30, len(data)),
                        decay_alpha=0.9
                    )

                    # Quantify uncertainty
                    uncertainty = self.statistical_engine.quantify_uncertainty(
                        df=data,
                        variables=all_variables,
                        windows=min(200, len(data))
                    )

                    result['statistical'] = {
                        'edge_strengths': {
                            (s, t): self.statistical_engine.assess_causal_strength(s, t)
                            for s, t in self.symbolic_reasoner.graph_manager.get_edges()
                        },
                        'uncertainty': uncertainty,
                        'confidence_intervals': uncertainty.get('edge_intervals', {})
                    }
            except Exception as e:
                logger.warning(f"Statistical fitting failed: {e}")
```
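Statistical fitting only runs when a DataFrame is supplied and pandas is importable; the `window` and `decay_alpha` arguments above cap the fit at the most recent rows with exponential down-weighting. A sketch of supplying data through the public entry point (column names must match the graph's variable names; the numbers are made up):

```python
import pandas as pd

from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
df = pd.DataFrame({  # made-up observations; columns must match the extracted variables
    "ad_spend": [1.0, 2.0, 3.0, 4.0],
    "sales": [2.1, 3.9, 6.2, 8.1],
})
print(agent.run("Ad spend increases sales.", data=df))
```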
```python
        # Step 3: Build comprehensive analysis result
        # Only include relationships between valid variables
        graph_nodes_all = self.symbolic_reasoner.graph_manager.get_nodes()
        graph_edges_all = self.symbolic_reasoner.graph_manager.get_edges()

        # Filter to only valid variables
        graph_nodes = [n for n in graph_nodes_all if n in valid_variables]
        graph_edges = [(s, t) for s, t in graph_edges_all if s in valid_variables and t in valid_variables]

        relationships = []
        for source, target in graph_edges:
            edge_data = self.symbolic_reasoner.graph_manager.graph.get(source, {}).get(target, {})
            strength = self.statistical_engine.assess_causal_strength(source, target) if edge_data.get('category') == 'causal' else 1.0
            confidence = edge_data.get('confidence', 0.8)
            relation_type = edge_data.get('relation_type', 'causal')
            category = edge_data.get('category', 'causal')  # Get category from edge metadata
            relationships.append({
                'source': source,
                'target': target,
                'type': relation_type,
                'category': category,
                'strength': strength,
                'confidence': confidence
            })

        # Determine the relationship label for the graph structure description
        has_causal = any(rel.get('category') == 'causal' for rel in relationships)
        has_general = any(rel.get('category') in ['taxonomic', 'meronymic', 'spatial', 'functional', 'definitional', 'factual'] for rel in relationships)

        if has_general and not has_causal:
            rel_type_label = "knowledge relationships"
        elif has_causal:
            rel_type_label = "causal relationships"
        else:
            rel_type_label = "relationships"

        # Generate graph structure description (only valid variables)
        graph_structure = f"Nodes: {', '.join(sorted(graph_nodes))}\nEdges: {len(graph_edges)} {rel_type_label}"

        # Generate insights based on intent
        insights = []
        if relationships:
            # Strongest relationship
            strongest = max(relationships, key=lambda x: abs(x.get('strength', 0)))
            category = strongest.get('category', 'causal')
            if category == 'causal':
                insights.append(
                    f"Strongest causal relationship: {strongest['source']} -> {strongest['target']} "
                    f"(strength: {strongest['strength']:.3f}, confidence: {strongest['confidence']:.2f})"
                )
            elif category == 'taxonomic':
                insights.append(
                    f"Taxonomic relationship: {strongest['source']} is a type of {strongest['target']} "
                    f"(confidence: {strongest['confidence']:.2f})"
                )
            else:
                insights.append(
                    f"Strongest relationship: {strongest['source']} -> {strongest['target']} "
                    f"(type: {category}, confidence: {strongest['confidence']:.2f})"
                )

            # Most connected variable
            node_degrees = defaultdict(int)
            for rel in relationships:
                node_degrees[rel['source']] += 1
                node_degrees[rel['target']] += 1
            if node_degrees:
                most_connected = max(node_degrees.items(), key=lambda x: x[1])
                insights.append(
                    f"Most connected variable: {most_connected[0]} ({most_connected[1]} relationships)"
                )

        # Generate recommendations if requested
        recommendations = []
        if intent['type'] == 'recommendation' and relationships:
            # Find variables with high out-degree (causes) that could be intervened on
            out_degrees = defaultdict(int)
            for rel in relationships:
                out_degrees[rel['source']] += abs(rel['strength'])

            if out_degrees:
                top_levers = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)[:3]
                for var, total_effect in top_levers:
                    recommendations.append(
                        f"Consider intervening on '{var}' - it has strong causal effects on multiple outcomes"
                    )

        # Parse extracted values to create the factual state (used for both analysis and counterfactuals).
        # For SCM input the values come from the parsed SCM structure, but they are surfaced through
        # the same extraction metadata, so both paths read the same field.
        variables_with_values = extraction.get('metadata', {}).get('variables_with_values', {})
        extracted_values = self._parse_extracted_values(variables_with_values)

        # Create the factual state using extracted values, falling back to 0.0
        factual_state = {}
        if graph_nodes:
            for var in graph_nodes:
                # Use the extracted value if available, otherwise 0.0.
                # Try an exact match first
                if var in extracted_values:
                    factual_state[var] = extracted_values[var]
                else:
                    # Try a partial match (e.g., "price" matches "product price")
                    matched = False
                    for extracted_var, value in extracted_values.items():
                        # Normalize both for comparison
                        var_normalized = var.lower().replace(' ', '')
                        extracted_var_normalized = extracted_var.lower().replace(' ', '')
                        if var_normalized in extracted_var_normalized or extracted_var_normalized in var_normalized:
                            factual_state[var] = value
                            matched = True
                            break
                    if not matched:
                        factual_state[var] = 0.0

        result['analysis'] = {
            'variables': sorted(list(graph_nodes)),  # Only valid variables
            'relationships': relationships,  # Only valid relationships
            'graph_structure': graph_structure,
            'insights': insights,
            'extraction_metadata': extraction.get('metadata', {}),
            'factual_state': factual_state
        }
        result['recommendations'] = recommendations
```
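At this point `result['analysis']` has a fixed shape that downstream consumers (the NLG layer, `_validate_result`) rely on. A sketch of inspecting it for a well-specified task, assuming the orchestrator call used in `HybridAgent.run` below:

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
result = agent.orchestrator.reason_hybrid(
    task="Ad spend increases sales.", data=None, context=None
)
print(sorted(result['analysis'].keys()))
# ['extraction_metadata', 'factual_state', 'graph_structure',
#  'insights', 'relationships', 'variables']
# (Step 4 below may also add 'graph_first_insight' when an answer is found.)
```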
```python
        # Step 4: Graph-first reasoning (answer from graph state ONLY)
        if self.graph_first_reasoner and graph_nodes:
            try:
                graph_state = {
                    'nodes': graph_nodes,
                    'edges': graph_edges,
                    'edge_data': {
                        (s, t): self.symbolic_reasoner.graph_manager.graph.get(s, {}).get(t, {})
                        for s, t in graph_edges
                    }
                }

                # Track reasoning step: graph-first reasoning
                if self.reasoning_tracker:
                    self.reasoning_tracker.add_step(
                        step_type=StepType.INFERENCE,
                        operation="graph_first_reasoning",
                        input_state={'graph_state': graph_state, 'query': corrected_task},
                        output_state={},
                        conclusion="Graph-first reasoning"
                    )

                # Reason from graph state only
                graph_answer = self.graph_first_reasoner.reason_from_graph_state(
                    state=graph_state,
                    query=corrected_task,
                    graph_manager=self.symbolic_reasoner.graph_manager
                )
                result['graph_first_answer'] = graph_answer

                # Track reasoning step: graph answer
                if self.reasoning_tracker and graph_answer.get('answer'):
                    self.reasoning_tracker.add_step(
                        step_type=StepType.INFERENCE,
                        operation="graph_answer",
                        input_state={},
                        output_state={'answer': graph_answer.get('answer')},
                        conclusion=graph_answer.get('answer', '')
                    )

                # If graph-first reasoning provides an answer, use it
                if graph_answer.get('answer'):
                    result['analysis']['graph_first_insight'] = graph_answer['answer']
            except Exception as e:
                logger.warning(f"Graph-first reasoning failed: {e}")
```
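The `graph_state` dictionary built in Step 4 is the entire evidence the graph-first reasoner sees: nodes, edges, and per-edge metadata. The same path is exposed directly through `reason_from_graph_state` on `HybridAgent` (defined near the end of this file); a sketch with a hand-built state (names and metadata values are hypothetical):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
state = {
    "nodes": ["ad_spend", "sales"],
    "edges": [("ad_spend", "sales")],
    "edge_data": {("ad_spend", "sales"): {"confidence": 0.9, "category": "causal"}},
}
answer = agent.reason_from_graph_state(state, "What drives sales?")
print(answer.get("answer"))
```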
```python
        # Step 5: Generate counterfactuals (if requested, or if we have a state)
        if intent['type'] == 'counterfactual' or (graph_nodes and intent['type'] != 'extraction'):
            # Use intervention variables from intent if available;
            # prefer state variables (variables with extracted values) over question targets
            target_vars = intent.get('intervention_variables', [])
            if not target_vars:
                # Use variables that have extracted values (state variables)
                state_vars = [v for v in graph_nodes if v in extracted_values or any(v in k for k in extracted_values.keys())]
                if state_vars:
                    target_vars = state_vars[:3]  # Use the first 3 state variables
                else:
                    target_vars = [v for v in graph_nodes if 'expected' not in v.lower()][:3]  # Exclude question targets

            # Filter target_vars to only valid variables
            target_vars = [v for v in target_vars if v in graph_nodes]

            if target_vars:
                try:
                    counterfactuals = self.statistical_engine.generate_probabilistic_counterfactuals(
                        factual_state=factual_state,
                        target_variables=target_vars,
                        n_scenarios=min(5, len(target_vars) + 2)
                    )
                    result['counterfactuals'] = counterfactuals
                except Exception as e:
                    logger.warning(f"Counterfactual generation failed: {e}")

        # Step 6: Apply the pragmatic layer for response generation
        if self.pragmatic_compiler and result.get('analysis'):
            # Determine confidence and complexity for pragmatic decisions
            avg_confidence = sum(r.get('confidence', 0.8) for r in relationships) / len(relationships) if relationships else 0.8
            complexity = len(graph_nodes) + len(graph_edges)

            register = self.pragmatic_compiler.select_register(avg_confidence, complexity)
            result['pragmatic'] = {
                'register': register,
                'hedging': self.pragmatic_compiler.generate_hedging(avg_confidence),
                'explicitness': self.pragmatic_compiler.adjust_explicitness(len(graph_nodes))
            }
        else:
            # Default pragmatic info if the compiler is not available
            result['pragmatic'] = {
                'register': 'neutral',
                'hedging': 'likely',
                'explicitness': 3
            }

        # Step 7: Self-verification and error detection
        if self.error_detector and self.reasoning_tracker and self.reasoning_tracker.current_chain:
            reasoning_chain = self.reasoning_tracker.current_chain
            graph_state = {
                'nodes': graph_nodes,
                'edges': graph_edges
            }
            errors = self.error_detector.detect_errors(reasoning_chain, graph_state)
            if errors:
                result['errors'] = errors
                logger.warning(f"Detected {len(errors)} errors")

                # Try to correct errors
                if self.self_corrector:
                    corrections = self.self_corrector.correct_errors(errors, graph_state)
                    if corrections:
                        result['corrections'] = corrections
                        logger.info(f"Applied {len(corrections)} corrections")

        # Step 8: Generate explanations
        if self.explanation_builder and self.reasoning_tracker and self.reasoning_tracker.current_chain:
            reasoning_chain = self.reasoning_tracker.current_chain
            graph_state = {
                'nodes': graph_nodes,
                'edges': graph_edges
            }
            explanation = self.explanation_builder.generate_explanation(reasoning_chain, graph_state)
            result['explanation'] = explanation

        # Step 9: Transparency layer
        if self.transparency_layer and self.reasoning_tracker and self.reasoning_tracker.current_chain:
            reasoning_chain = self.reasoning_tracker.current_chain
            trace = self.transparency_layer.show_reasoning_trace(reasoning_chain)
            confidence_viz = self.transparency_layer.visualize_confidence(reasoning_chain)
            graph_explanation = self.transparency_layer.explain_graph_structure({
                'nodes': graph_nodes,
                'edges': graph_edges
            })
            result['transparency'] = {
                'trace': trace,
                'confidence': confidence_viz,
                'graph_structure': graph_explanation
            }

        # Store the reasoning chain in the result
        if self.reasoning_tracker and self.reasoning_tracker.current_chain and result:
            result['reasoning_chain'] = self.reasoning_tracker.current_chain
            # Mark the chain as successful if we have results
            if result.get('analysis') or result.get('graph_first_answer'):
                self.reasoning_tracker.current_chain.success = True
                graph_answer = result.get('graph_first_answer', {})
                if graph_answer:
                    self.reasoning_tracker.current_chain.final_conclusion = graph_answer.get('answer')

        # Step 10: Validate result completeness
        self._validate_result(result)

        return result

    def _validate_result(self, result: Dict[str, Any]) -> None:
        """
        Validate result completeness and add warnings if needed.

        Args:
            result: Result dictionary to validate
        """
        warnings = []

        # Check whether we have any meaningful analysis
        analysis = result.get('analysis', {})
        if not analysis.get('variables') and not analysis.get('relationships'):
            warnings.append("No variables or relationships were extracted from the input.")

        # Check graph-first answer quality
        graph_answer = result.get('graph_first_answer', {})
        if not graph_answer or not graph_answer.get('answer'):
            warnings.append("Graph-first reasoning did not produce a clear answer.")

        # Check counterfactuals if requested
        intent = result.get('intent', {})
        if intent.get('type') == 'counterfactual' and not result.get('counterfactuals'):
            warnings.append("Counterfactual analysis was requested but none were generated.")

        if warnings:
            result['warnings'] = warnings
            logger.warning(f"Result validation warnings: {warnings}")
```
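`_validate_result` only annotates; it never raises. A sketch of the warnings it attaches to a result that misses every check (calling the private helper directly on the orchestrator, purely for illustration):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
result = {"analysis": {}, "intent": {"type": "counterfactual"}}
agent.orchestrator._validate_result(result)
print(result["warnings"])
# ['No variables or relationships were extracted from the input.',
#  'Graph-first reasoning did not produce a clear answer.',
#  'Counterfactual analysis was requested but none were generated.']
```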
```python
class HybridAgent:
    """
    Main hybrid agent class with a graph-first reasoning architecture.

    Supports both causal reasoning (CRCA) and general knowledge tasks.

    New Architecture:
        Text Input -> TextCorrector -> LanguageCompiler -> SymbolicReasoner
        -> GraphManager -> GraphFirstReasoner -> RuleBasedNLG -> Response

    No LLM dependency - pure symbolic-statistical reasoning with a graph-first architecture.

    Graph Types Supported:
    - "causal": Causal relationships (default for CRCA tasks)
    - "knowledge": General knowledge graphs (facts, definitions, taxonomic relationships)
    - "dependency": Dependency relationships
    - "mixed": Combination of relationship types
    """

    def __init__(
        self,
        graph_type: str = "causal",
        seed: int = 42,
        enable_graph_first: bool = True,
        enable_compression: bool = True,
        enable_language_compilation: bool = True,
        enable_error_correction: bool = True,
        enable_conversation: bool = True,
        enable_reasoning_tracking: bool = True,
        enable_few_shot_learning: bool = True,
        enable_task_decomposition: bool = True,
        enable_explanations: bool = True,
        enable_verification: bool = True,
        enable_consistency: bool = True
    ):
        """
        Initialize the hybrid agent with a graph-first architecture.

        Args:
            graph_type: Type of graph (causal, knowledge, dependency, etc.)
            seed: Random seed for reproducibility
            enable_graph_first: Enable graph-first reasoning (answers from the graph only)
            enable_compression: Enable graph compression and abstraction
            enable_language_compilation: Enable language compilation layers
            enable_error_correction: Enable non-destructive text correction
            enable_conversation: Enable conversation history and context tracking
            enable_reasoning_tracking: Enable reasoning chain tracking
            enable_few_shot_learning: Enable few-shot example storage and pattern learning
            enable_task_decomposition: Enable task analysis and plan generation
            enable_explanations: Enable explanation building and the transparency layer
            enable_verification: Enable consistency checking, error detection, and self-correction
            enable_consistency: Enable the consistency engine
        """
        # Initialize core components
        self.graph_manager = GraphManager(graph_type=graph_type)
        self.prediction_framework = PredictionFramework(
            graph_manager=self.graph_manager
        )

        # Initialize graph-first components
        self.graph_first_reasoner = GraphFirstReasoner(graph_manager=self.graph_manager) if enable_graph_first else None
        self.graph_compressor = GraphCompressor(self.graph_manager) if enable_compression else None

        # Initialize language compilation components
        if enable_language_compilation:
            # Enable dictionary integration by default (no API key required)
            self.lexical_compiler = LexicalCompiler(enable_dictionary=True, cache_enabled=True)
            self.grammatical_compiler = GrammaticalCompiler()
            self.pragmatic_compiler = PragmaticCompiler()
        else:
            self.lexical_compiler = None
            self.grammatical_compiler = None
            self.pragmatic_compiler = None

        # Initialize error correction (with dictionary integration)
        if enable_error_correction:
            # Pass the lexical compiler to the text corrector for dictionary-enhanced correction
            lexical_for_corrector = self.lexical_compiler if enable_language_compilation else None
            self.text_corrector = TextCorrector(lexical_compiler=lexical_for_corrector)
        else:
            self.text_corrector = None

        # Initialize few-shot learning components (needed before the symbolic reasoner)
        if enable_few_shot_learning:
            self.example_store = ExampleStore()
            self.pattern_learner = PatternLearner(self.example_store)
            self.adaptive_extractor = AdaptiveExtractor(self.pattern_learner, self.example_store)
        else:
            self.example_store = None
            self.pattern_learner = None
            self.adaptive_extractor = None

        # Initialize reasoning components.
        # Pass the lexical compiler to the symbolic reasoner for dictionary-enhanced validation
        lexical_for_reasoner = self.lexical_compiler if enable_language_compilation else None
        adaptive_extractor_for_reasoner = self.adaptive_extractor if enable_few_shot_learning else None
        self.symbolic_reasoner = SymbolicReasoner(
            self.graph_manager,
            lexical_compiler=lexical_for_reasoner,
            adaptive_extractor=adaptive_extractor_for_reasoner
        )
        self.statistical_engine = StatisticalEngine(
            graph_manager=self.graph_manager,
            prediction_framework=self.prediction_framework,
            seed=seed
        )
        self.nlg = RuleBasedNLG()

        # Initialize conversation and reasoning-tracking components
        if enable_conversation:
            self.conversation_history = ConversationHistory(decay_lambda=0.1)
            self.context_tracker = ContextTracker(self.conversation_history)
        else:
            self.conversation_history = None
            self.context_tracker = None

        if enable_reasoning_tracking:
            self.reasoning_tracker = ReasoningTracker()
        else:
            self.reasoning_tracker = None

        if enable_task_decomposition:
            self.task_analyzer = TaskAnalyzer()
            self.subtask_executor = SubTaskExecutor()
            self.plan_generator = PlanGenerator(self.task_analyzer)
        else:
            self.task_analyzer = None
            self.subtask_executor = None
            self.plan_generator = None

        if enable_explanations:
            self.explanation_builder = ExplanationBuilder()
            self.transparency_layer = TransparencyLayer()
        else:
            self.explanation_builder = None
            self.transparency_layer = None

        if enable_verification:
            self.consistency_checker = ConsistencyChecker()
            self.error_detector = ErrorDetector()
            self.self_corrector = SelfCorrector()
        else:
            self.consistency_checker = None
            self.error_detector = None
            self.self_corrector = None

        if enable_consistency:
            self.consistency_engine = ConsistencyEngine(seed=seed)
        else:
            self.consistency_engine = None

        self.orchestrator = HybridOrchestrator(
            symbolic_reasoner=self.symbolic_reasoner,
            statistical_engine=self.statistical_engine,
            nlg=self.nlg,
            graph_first_reasoner=self.graph_first_reasoner,
            text_corrector=self.text_corrector,
            lexical_compiler=self.lexical_compiler,
            grammatical_compiler=self.grammatical_compiler,
            pragmatic_compiler=self.pragmatic_compiler,
            reasoning_tracker=self.reasoning_tracker,
            explanation_builder=self.explanation_builder,
            transparency_layer=self.transparency_layer,
            consistency_checker=self.consistency_checker,
            error_detector=self.error_detector,
            self_corrector=self.self_corrector,
            consistency_engine=self.consistency_engine
        )
```
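The eleven `enable_*` flags all default to `True`; disabling the ones a deployment does not need yields a leaner agent. A hypothetical minimal configuration:

```python
from architecture.hybrid.hybrid_agent import HybridAgent

# Knowledge-graph agent without conversational state, tracking, or decomposition
agent = HybridAgent(
    graph_type="knowledge",
    seed=7,
    enable_conversation=False,
    enable_reasoning_tracking=False,
    enable_task_decomposition=False,
)
```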
```python
    def run(
        self,
        task: str,
        data: Optional[Any] = None,
        response_style: str = 'conversational',
        context: Optional[ConversationContext] = None,
        show_reasoning: bool = False
    ) -> str:
        """
        Run the hybrid agent on a task with conversation context support.

        Args:
            task: Task description
            data: Optional data for statistical inference
            response_style: Response style ('conversational', 'brief', 'full')
            context: Optional conversation context
            show_reasoning: Whether to show chain-of-thought reasoning

        Returns:
            Natural language response
        """
        try:
            # Handle conversation context
            if self.conversation_history and context is None:
                # Use the existing context if available
                context = self.conversation_history.context

            # Add the user message to the conversation
            if self.conversation_history:
                self.conversation_history.add_message(
                    role=MessageRole.USER,
                    content=task
                )

            # Resolve references in the task using context
            if self.context_tracker and context:
                resolved_task = self._resolve_task_references(task, context)
            else:
                resolved_task = task

            # Task decomposition (if enabled)
            if self.task_analyzer and self.plan_generator:
                plan = self.plan_generator.generate_plan(resolved_task)
                if plan['estimated_steps'] > 1:
                    # Complex task - use decomposition
                    logger.info(f"Decomposing task into {plan['estimated_steps']} subtasks")
                    # For now, proceed with the original task.
                    # Future: execute subtasks in parallel

            # Validate input
            if not resolved_task or not isinstance(resolved_task, str):
                return "I need a valid task description to analyze. Please provide a question or statement about causal relationships."

            if len(resolved_task.strip()) == 0:
                return "Please provide a non-empty task description."

            # Execute hybrid reasoning with the graph-first architecture
            result = self.orchestrator.reason_hybrid(task=resolved_task, data=data, context=context)

            # Validate the result
            if not result:
                return "I couldn't process your request. Please try rephrasing with clearer causal relationships."

            # Get pragmatic information for response generation
            pragmatic_info = result.get('pragmatic', {
                'register': 'neutral',
                'hedging': 'likely',
                'explicitness': 3
            })

            # Generate a natural language response from the graph state
            reasoning_chain = None
            if self.reasoning_tracker and self.reasoning_tracker.current_chain:
                reasoning_chain = self.reasoning_tracker.current_chain

            if show_reasoning and reasoning_chain:
                # Include chain-of-thought reasoning
                if self.explanation_builder:
                    explanation = self.explanation_builder.generate_explanation(reasoning_chain, result.get('analysis', {}))
                    result['explanation'] = explanation

                response = self.nlg.generate_response(
                    result,
                    response_type=response_style,
                    pragmatic_info=pragmatic_info,
                    show_reasoning=True,
                    reasoning_chain=reasoning_chain
                )
            else:
                response = self.nlg.generate_response(
                    result,
                    response_type=response_style,
                    pragmatic_info=pragmatic_info,
                    show_reasoning=False,
                    reasoning_chain=reasoning_chain
                )

            # Add the agent response to the conversation
            if self.conversation_history:
                self.conversation_history.add_message(
                    role=MessageRole.AGENT,
                    content=response,
                    metadata={'result': result}
                )

            return response

        except Exception as e:
            logger.error(f"Error in hybrid agent run: {e}", exc_info=True)
            return f"I encountered an error processing your request: {str(e)}. Please try rephrasing."
```
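`run` is the single public entry point: it resolves references, reasons through the orchestrator, and renders a response in the requested style. A sketch (the question is hypothetical):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
reply = agent.run(
    "If ad spend increases, what happens to sales?",
    response_style="brief",
    show_reasoning=True,
)
print(reply)
```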
```python
    def _resolve_task_references(
        self,
        task: str,
        context: ConversationContext
    ) -> str:
        """
        Resolve references in a task using the conversation context.

        Args:
            task: Original task
            context: Conversation context

        Returns:
            Task with resolved references
        """
        if not self.context_tracker:
            return task

        # Resolve common references
        resolved = task
        references = ['it', 'that', 'this', 'the price', 'the variable']

        for ref in references:
            if ref.lower() in task.lower():
                resolved_var = self.context_tracker.resolve_reference(ref, context.current_turn)
                if resolved_var:
                    resolved = resolved.replace(ref, resolved_var)

        return resolved
```
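Reference resolution is a plain textual substitution over a small closed list of referring expressions. Assuming a context whose tracker maps "it" to the most recently discussed variable (say "sales"), the effect looks like this (a sketch; the resolved value depends on `ContextTracker` internals):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
agent.run("Ad spend increases sales.")  # seeds the conversation context

context = agent.conversation_history.context  # requires enable_conversation=True (default)
resolved = agent._resolve_task_references("What happens to it?", context)
print(resolved)  # e.g. "What happens to sales?" if the tracker resolves "it"
```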
```python
    def update_context(
        self,
        context: Optional[ConversationContext],
        user_message: str,
        agent_response: str
    ) -> ConversationContext:
        """
        Update the conversation context after an interaction.

        Args:
            context: Current context
            user_message: User message
            agent_response: Agent response

        Returns:
            Updated context
        """
        if self.conversation_history:
            return self.conversation_history.context
        return context

    def learn_from_examples(
        self,
        examples: List[Tuple[str, Dict[str, Any]]]
    ) -> None:
        """
        Learn from examples for few-shot learning.

        Args:
            examples: List of (input_text, output_structure) tuples
        """
        if not self.example_store or not self.pattern_learner:
            logger.warning("Few-shot learning not enabled")
            return

        # Add examples to the store
        for input_text, output in examples:
            self.example_store.add_example(input_text, output)

        # Learn patterns
        self.pattern_learner.learn_from_examples(examples)
        logger.info(f"Learned from {len(examples)} examples")
```
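`learn_from_examples` feeds the pattern learner that backs the adaptive extractor wired into the symbolic reasoner at construction time. A sketch, where the output-structure keys (`variables`, `edges`) are an assumption based on the extraction dictionaries used elsewhere in this file:

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
agent.learn_from_examples([
    ("Rain makes the ground wet",
     {"variables": ["rain", "ground wetness"],       # assumed output-structure keys
      "edges": [("rain", "ground wetness")]}),
])
```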
```python
    def _generate_brief_response(
        self,
        result: Dict[str, Any],
        pragmatic_info: Dict[str, Any]
    ) -> str:
        """Generate a brief summary response."""
        hedging = pragmatic_info.get('hedging', 'likely')

        # Try the graph-first answer first
        graph_answer = result.get('graph_first_answer', {})
        if graph_answer and graph_answer.get('answer'):
            return f"{hedging.capitalize()}, {graph_answer['answer']}"

        # Fall back to an analysis summary
        analysis = result.get('analysis', {})
        variables = analysis.get('variables', [])
        relationships = analysis.get('relationships', [])

        if variables and relationships:
            return f"I've identified {len(variables)} variables with {len(relationships)} causal relationships. {hedging.capitalize()}, the strongest relationship is between '{relationships[0].get('source', '')}' and '{relationships[0].get('target', '')}'."

        return "I've analyzed your request, but couldn't extract clear causal relationships. Please provide more specific information about the variables and their relationships."

    def _generate_fallback_response(self, result: Dict[str, Any]) -> str:
        """Generate a fallback response when the main generation fails."""
        task = result.get('task', 'your request')
        return f"I've processed {task}, but couldn't generate a detailed response. The analysis may need more information or clearer causal relationships."

    def query_graph(self, question: str) -> Dict[str, Any]:
        """
        Query the graph state directly (graph-first reasoning).

        Args:
            question: Question to answer from the graph state

        Returns:
            Dictionary with the answer derived from the graph state
        """
        if self.graph_first_reasoner is None:
            raise ValueError("Graph-first reasoning is not enabled")

        return self.graph_first_reasoner.query_graph_state(question, self.graph_manager)
```
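`query_graph` answers strictly from whatever is currently in the graph manager, which makes it useful for probing state between runs. A sketch:

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
agent.run("Ad spend increases sales.")
print(agent.query_graph("What does ad spend influence?"))
```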
```python
    def reason_from_graph_state(self, state: Dict[str, Any], query: str) -> Dict[str, Any]:
        """
        Pure graph reasoning from an explicit graph state.

        Args:
            state: Graph state dictionary
            query: Query string

        Returns:
            Dictionary with reasoning results
        """
        if self.graph_first_reasoner is None:
            raise ValueError("Graph-first reasoning is not enabled")

        return self.graph_first_reasoner.reason_from_graph_state(state, query, self.graph_manager)

    def extract_causal_variables(self, task: str) -> Dict[str, Any]:
        """
        Extract causal variables from a task.

        Args:
            task: Natural language task description

        Returns:
            Dictionary with extracted variables and relationships
        """
        extraction = self.symbolic_reasoner.extract_variables_from_task(task)

        # Add edges to the graph
        for source, target in extraction.get('edges', []):
            self.graph_manager.add_relationship(
                source=source,
                target=target,
                strength=1.0,
                confidence=0.8
            )

        return extraction

    def generate_causal_analysis(
        self,
        variables: Dict[str, Any],
        data: Optional[Any] = None
    ) -> Dict[str, Any]:
        """
        Generate a causal analysis from variables.

        Args:
            variables: Dictionary with variables and relationships
            data: Optional pandas DataFrame for statistical inference

        Returns:
            Dictionary with causal analysis results
        """
        # Build the graph from variables
        var_list = variables.get('variables', [])
        edges = variables.get('edges', [])

        for source, target in edges:
            self.graph_manager.add_relationship(
                source=source,
                target=target,
                strength=1.0
            )

        # Fit the statistical model if data is available
        if data is not None and PANDAS_AVAILABLE:
            try:
                self.statistical_engine.fit_from_dataframe(
                    df=data,
                    variables=var_list
                )
            except Exception as e:
                logger.warning(f"Statistical fitting failed: {e}")

        # Build the analysis
        relationships = []
        for source, target in self.graph_manager.get_edges():
            strength = self.statistical_engine.assess_causal_strength(source, target)
            relationships.append({
                'source': source,
                'target': target,
                'strength': strength
            })

        return {
            'variables': var_list,
            'relationships': relationships,
            'graph_structure': f"{len(var_list)} variables, {len(relationships)} relationships"
        }

    def generate_counterfactuals(
        self,
        state: Dict[str, float],
        target_vars: List[str]
    ) -> List[Dict[str, Any]]:
        """Generate counterfactual scenarios.

        Args:
            state: Factual state dictionary
            target_vars: List of variables to intervene on

        Returns:
            List of counterfactual scenario dictionaries
        """
        return self.statistical_engine.generate_probabilistic_counterfactuals(
            factual_state=state,
            target_variables=target_vars,
            n_scenarios=5
        )
```
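Finally, `generate_counterfactuals` exposes the same probabilistic sampler used in Step 5 of the orchestrator, but against a caller-supplied factual state. A closing sketch (the state values are hypothetical):

```python
from architecture.hybrid.hybrid_agent import HybridAgent

agent = HybridAgent(graph_type="causal")
agent.run("Ad spend increases sales.")  # establish a graph first
scenarios = agent.generate_counterfactuals(
    state={"ad_spend": 2.0, "sales": 4.0},   # hypothetical factual state
    target_vars=["ad_spend"],
)
for scenario in scenarios:
    print(scenario)
```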