aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
import spacy
|
|
4
|
+
from spacy.language import Language
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from scipy.stats import pearsonr
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
from aiecs.tools.base_tool import BaseTool
|
|
11
|
+
from aiecs.tools import register_tool
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Exceptions
|
|
15
|
+
class ResearchToolError(Exception):
    """Base exception for ResearchTool errors.

    Raised directly when the configured spaCy model is not in the allowed
    list, and used as the parent class of ``FileOperationError``.
    """
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FileOperationError(ResearchToolError):
    """Raised when file operations fail.

    In this module every public operation (the ``mill_*`` methods,
    ``induction``, ``deduction`` and ``summarize``) wraps any failure in this
    exception, so it also signals general processing errors.
    """
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@register_tool("research")
|
|
24
|
+
class ResearchTool(BaseTool):
|
|
25
|
+
"""
|
|
26
|
+
Tool for causal inference using Mill's methods, advanced induction, deduction, and text summarization.
|
|
27
|
+
|
|
28
|
+
Operations:
|
|
29
|
+
- mill_agreement: Identify common factors in positive cases.
|
|
30
|
+
- mill_difference: Identify factors present in positive but absent in negative cases.
|
|
31
|
+
- mill_joint: Combine agreement and difference methods.
|
|
32
|
+
- mill_residues: Identify residual causes after accounting for known causes.
|
|
33
|
+
- mill_concomitant: Analyze correlation between factor and effect variations.
|
|
34
|
+
- induction: Generalize patterns using spaCy-based clustering.
|
|
35
|
+
- deduction: Validate conclusions using spaCy-based rule reasoning.
|
|
36
|
+
- summarize: Summarize text using spaCy sentence ranking.
|
|
37
|
+
|
|
38
|
+
Inherits from BaseTool.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
# Configuration schema
|
|
42
|
+
class Config(BaseModel):
    """Configuration for the research tool.

    Instantiated in ``ResearchTool.__init__`` from the optional ``config``
    mapping; any field not supplied there takes the default declared below.
    """

    # NOTE(review): ``env_prefix`` is normally a pydantic-settings option; on a
    # plain ``BaseModel`` it presumably has no effect, so RESEARCH_TOOL_*
    # environment variables are likely not read — confirm.
    model_config = ConfigDict(env_prefix="RESEARCH_TOOL_")

    # Default is twice the CPU count (4 CPUs assumed when the count is
    # unavailable), capped at 32. Evaluated once, when the class is defined.
    max_workers: int = Field(
        default=min(32, (os.cpu_count() or 4) * 2),
        description="Maximum number of worker threads",
    )
    # Name passed to ``spacy.load`` by ``_get_spacy``.
    spacy_model: str = Field(default="en_core_web_sm", description="Default spaCy model to use")
    # NOTE(review): not referenced by any method visible in this file — confirm
    # where (or whether) the limit is enforced.
    max_text_length: int = Field(default=10_000, description="Maximum text length for inputs")
    # ``_get_spacy`` refuses to load a model whose name is not in this list.
    allowed_spacy_models: List[str] = Field(
        default=["en_core_web_sm", "zh_core_web_sm"],
        description="Allowed spaCy models",
    )
|
|
57
|
+
|
|
58
|
+
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
    """
    Initialize ResearchTool with settings and resources.

    Validates the configuration, sets up the module logger and prepares an
    empty slot for the lazily loaded spaCy pipeline.

    Args:
        config (Dict, optional): Configuration overrides for ResearchTool.

    Raises:
        ValueError: If config contains invalid settings.
    """
    super().__init__(config)

    # Parse configuration: build the Config model from the supplied overrides
    # (an empty mapping when none are given).
    self.config = self.Config(**(config or {}))

    self.logger = logging.getLogger(__name__)
    # The logger is keyed by module name, so it is shared by every instance;
    # only attach a handler when none is present yet.
    if not self.logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        self.logger.addHandler(handler)
    # NOTE(review): nesting was reconstructed from a flattened listing —
    # confirm the level is meant to be set on every construction rather than
    # only when the handler is first attached.
    self.logger.setLevel(logging.INFO)
    # The spaCy pipeline is loaded on first use by _get_spacy().
    self._spacy_nlp: Optional[Language] = None
|
|
80
|
+
|
|
81
|
+
def __del__(self):
    """Release the cached spaCy pipeline reference when the tool is destroyed."""
    # getattr with a default covers instances whose __init__ never got as far
    # as creating the attribute.
    if getattr(self, "_spacy_nlp", None) is None:
        return
    self._spacy_nlp = None
|
|
85
|
+
|
|
86
|
+
def _get_spacy(self) -> Language:
    """
    Return the spaCy pipeline, loading and caching it on first use.

    The model named by ``config.spacy_model`` is loaded with the ``textcat``
    component disabled and kept on the instance for subsequent calls.

    Returns:
        Language: spaCy NLP object.

    Raises:
        ResearchToolError: If the configured model is not one of
            ``config.allowed_spacy_models``.
    """
    # Fast path: a pipeline loaded by an earlier call is reused as-is.
    if self._spacy_nlp is not None:
        return self._spacy_nlp

    model_is_allowed = self.config.spacy_model in self.config.allowed_spacy_models
    if not model_is_allowed:
        raise ResearchToolError(
            f"Invalid spaCy model '{self.config.spacy_model}', expected {self.config.allowed_spacy_models}"
        )

    pipeline = spacy.load(self.config.spacy_model, disable=["textcat"])
    self._spacy_nlp = pipeline
    return self._spacy_nlp
|
|
103
|
+
|
|
104
|
+
def mill_agreement(self, cases: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Identify attribute(s) common to all cases with a positive outcome using Mill's Method of Agreement.

    Only cases whose ``outcome`` value is truthy are considered, and an
    attribute counts as present in a case when its value in ``attrs`` is
    truthy.

    Args:
        cases (List[Dict[str, Any]]): List of cases with attributes and outcomes.

    Returns:
        Dict[str, Any]: Common factors {'common_factors': List[str]}, listed in
        the order they appear in the first positive case. The list is empty
        when no case has a positive outcome.

    Raises:
        FileOperationError: If processing fails (for example a positive case
            without an ``attrs`` entry).
    """
    try:
        positive_attrs = [case["attrs"] for case in cases if case.get("outcome")]
        if not positive_attrs:
            return {"common_factors": []}

        first, *rest = positive_attrs
        present_in_rest = [
            {name for name, value in attrs.items() if value} for attrs in rest
        ]
        # Walk the first case instead of materialising a set so the result
        # order is deterministic rather than dependent on string hashing.
        common = [
            name
            for name, value in first.items()
            if value and all(name in present for present in present_in_rest)
        ]
        return {"common_factors": common}
    except Exception as e:
        raise FileOperationError(f"Failed to process mill_agreement: {str(e)}") from e
|
|
127
|
+
|
|
128
|
+
def mill_difference(
    self, positive_case: Dict[str, Any], negative_case: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Find attribute(s) present in positive case but absent in negative case using Mill's Method of Difference.

    An attribute counts as present when its value in the case's ``attrs``
    mapping is truthy; a case without ``attrs`` is treated as having none.

    Args:
        positive_case (Dict[str, Any]): Positive case with attributes and outcome.
        negative_case (Dict[str, Any]): Negative case with attributes and outcome.

    Returns:
        Dict[str, Any]: Difference factors {'difference_factors': List[str]},
        in the order the attributes appear in the positive case.

    Raises:
        FileOperationError: If processing fails.
    """
    try:
        positive_attrs = positive_case.get("attrs", {})
        negative_present = {
            name for name, value in negative_case.get("attrs", {}).items() if value
        }
        # Filtering the positive case in place keeps the output order stable;
        # converting a set difference to a list would not.
        difference = [
            name
            for name, value in positive_attrs.items()
            if value and name not in negative_present
        ]
        return {"difference_factors": difference}
    except Exception as e:
        raise FileOperationError(f"Failed to process mill_difference: {str(e)}") from e
|
|
151
|
+
|
|
152
|
+
def mill_joint(
    self,
    positive_cases: List[Dict[str, Any]],
    negative_cases: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Combine Mill's Method of Agreement and Difference to identify causal factors.

    A factor is reported when it is present (truthy in ``attrs``) in every
    entry of ``positive_cases`` whose ``outcome`` is truthy and present in no
    entry of ``negative_cases`` whose ``outcome`` is falsy. Entries whose
    ``outcome`` does not match the list they were passed in are ignored.

    Args:
        positive_cases (List[Dict[str, Any]]): List of positive cases.
        negative_cases (List[Dict[str, Any]]): List of negative cases.

    Returns:
        Dict[str, Any]: Causal factors {'causal_factors': List[str]}, in the
        order they appear in the first positive case. The list is empty when
        there is no positive case.

    Raises:
        FileOperationError: If processing fails.
    """
    try:
        positive_attrs = [case["attrs"] for case in positive_cases if case.get("outcome")]
        if not positive_attrs:
            return {"causal_factors": []}

        first, *rest = positive_attrs
        present_in_rest = [
            {name for name, value in attrs.items() if value} for attrs in rest
        ]

        # Everything seen in a genuinely negative case is ruled out.
        excluded = set()
        for case in negative_cases:
            if not case.get("outcome"):
                excluded.update(name for name, value in case["attrs"].items() if value)

        # Ordered by the first positive case so results are reproducible.
        causal = [
            name
            for name, value in first.items()
            if value
            and name not in excluded
            and all(name in present for present in present_in_rest)
        ]
        return {"causal_factors": causal}
    except Exception as e:
        raise FileOperationError(f"Failed to process mill_joint: {str(e)}") from e
|
|
185
|
+
|
|
186
|
+
def mill_residues(
    self, cases: List[Dict[str, Any]], known_causes: Dict[str, List[str]]
) -> Dict[str, Any]:
    """
    Identify residual causes after accounting for known causes using Mill's Method of Residues.

    For every effect listed in a case, the residual causes are the case's
    present attributes (truthy values in ``attrs``) minus the causes already
    known for that effect. When the same effect occurs in several cases, the
    entry computed from the last such case is the one returned.

    Args:
        cases (List[Dict[str, Any]]): List of cases with attributes and effects.
        known_causes (Dict[str, List[str]]): Known causes for effects.

    Returns:
        Dict[str, Any]: Residual causes {'residual_causes': Dict[str, List[str]]};
        each list follows the attribute order of the originating case.

    Raises:
        FileOperationError: If processing fails.
    """
    try:
        residual = {}
        for case in cases:
            present = [name for name, value in case.get("attrs", {}).items() if value]
            for effect in case.get("effects", {}):
                known = set(known_causes[effect]) if effect in known_causes else set()
                # Filter the ordered list rather than subtracting sets so the
                # output order does not depend on hashing.
                residual[effect] = [name for name in present if name not in known]
        return {"residual_causes": residual}
    except Exception as e:
        raise FileOperationError(f"Failed to process mill_residues: {str(e)}") from e
|
|
216
|
+
|
|
217
|
+
def mill_concomitant(
    self, cases: List[Dict[str, Any]], factor: str, effect: str
) -> Dict[str, Any]:
    """
    Analyze correlation between factor and effect variations using Mill's Method of Concomitant Variations.

    Both ``factor`` and ``effect`` are looked up in each case's ``attrs``
    mapping (missing entries count as 0) and their Pearson correlation is
    computed across the cases.

    Args:
        cases (List[Dict[str, Any]]): List of cases with attributes.
        factor (str): Factor to analyze.
        effect (str): Effect to analyze.

    Returns:
        Dict[str, Any]: Correlation results {'correlation': float, 'pvalue': float}.
        ``{'correlation': 0.0, 'pvalue': 1.0}`` is returned when no correlation
        can be established: fewer than two cases, or a factor/effect series
        that does not vary at all.

    Raises:
        FileOperationError: If processing fails (for example non-numeric values).
    """
    try:
        factor_vals = [case["attrs"].get(factor, 0) for case in cases]
        effect_vals = [case["attrs"].get(effect, 0) for case in cases]
        if len(factor_vals) < 2:
            return {"correlation": 0.0, "pvalue": 1.0}

        # Convert to numpy arrays to avoid PyTorch compatibility issues
        import numpy as np

        factor_array = np.array(factor_vals, dtype=np.float64)
        effect_array = np.array(effect_vals, dtype=np.float64)

        # Pearson's r is undefined for a constant series (it evaluates to
        # NaN); report that the same way as the too-few-cases situation.
        factor_is_constant = bool((factor_array == factor_array[0]).all())
        effect_is_constant = bool((effect_array == effect_array[0]).all())
        if factor_is_constant or effect_is_constant:
            return {"correlation": 0.0, "pvalue": 1.0}

        # Calculate correlation using numpy if scipy fails
        try:
            corr, pval = pearsonr(factor_array, effect_array)
        except (AttributeError, ImportError) as e:
            # Fallback to numpy correlation calculation
            self.logger.warning("scipy pearsonr failed (%s), using numpy fallback", e)
            corr = np.corrcoef(factor_array, effect_array)[0, 1]
            # Simple p-value approximation (not statistically rigorous but
            # functional)
            n = len(factor_array)
            if n <= 2:
                pval = 1.0
            else:
                # Approximate p-value using t-distribution; the small epsilon
                # keeps the denominator non-zero when |corr| is 1.
                t_stat = corr * np.sqrt((n - 2) / (1 - corr**2 + 1e-10))
                from scipy.stats import t

                pval = 2 * (1 - t.cdf(abs(t_stat), n - 2))

        return {"correlation": float(corr), "pvalue": float(pval)}
    except Exception as e:
        raise FileOperationError(f"Failed to process mill_concomitant: {str(e)}") from e
|
|
268
|
+
|
|
269
|
+
def induction(self, examples: List[str], max_keywords: int = 10) -> Dict[str, Any]:
    """
    Generalize patterns from examples using spaCy-based noun phrase clustering.

    Lower-cased noun chunks and verb lemmas are collected from every example;
    those that occur more than once overall are returned, most frequent first.

    Args:
        examples (List[str]): List of example texts.
        max_keywords (int): Maximum number of keywords to extract.

    Returns:
        Dict[str, Any]: Generalized patterns {'patterns': List[str]}.

    Raises:
        FileOperationError: If induction fails.
    """
    try:
        # Nothing to analyse: answer without loading the spaCy model.
        if isinstance(examples, (list, tuple)) and not examples:
            return {"patterns": []}

        nlp = self._get_spacy()
        counts = Counter()
        # nlp.pipe processes the texts as a batch and yields the documents in
        # input order, so the counts are built exactly as with one call per text.
        for doc in nlp.pipe(examples):
            counts.update(chunk.text.lower() for chunk in doc.noun_chunks)
            counts.update(token.lemma_.lower() for token in doc if token.pos_ == "VERB")

        # A pattern seen only once is not a generalisation.
        repeated = [pattern for pattern, count in counts.most_common() if count > 1]
        return {"patterns": repeated[:max_keywords]}
    except Exception as e:
        raise FileOperationError(f"Failed to process induction: {str(e)}") from e
|
|
295
|
+
|
|
296
|
+
def deduction(self, premises: List[str], conclusion: Optional[str]) -> Dict[str, Any]:
    """
    Validate if conclusion logically follows premises using spaCy dependency parsing.

    The check is lexical: the conclusion is accepted when every named entity
    and every verb lemma found in it also occurs somewhere in the premises.

    Args:
        premises (List[str]): List of premise statements.
        conclusion (Optional[str]): Conclusion to validate.

    Returns:
        Dict[str, Any]: Validation result {'valid': bool, 'conclusion': str, 'reason': str}.
        When no conclusion is supplied the result is invalid, with
        ``'conclusion': None``.

    Raises:
        FileOperationError: If deduction fails.
    """
    try:
        no_conclusion = {
            "valid": False,
            "conclusion": None,
            "reason": "No conclusion provided",
        }
        # Decide this before touching spaCy: there is no point loading a model
        # or parsing the premises when there is nothing to validate.
        if not conclusion:
            return no_conclusion

        nlp = self._get_spacy()
        conclusion_doc = nlp(conclusion)
        if not conclusion_doc:
            # The pipeline produced an empty document.
            return no_conclusion

        premise_entities = set()
        premise_predicates = set()
        for premise in premises:
            doc = nlp(premise)
            premise_entities.update(ent.text.lower() for ent in doc.ents)
            premise_predicates.update(
                token.lemma_.lower() for token in doc if token.pos_ == "VERB"
            )

        conclusion_entities = {ent.text.lower() for ent in conclusion_doc.ents}
        conclusion_predicates = {
            token.lemma_.lower() for token in conclusion_doc if token.pos_ == "VERB"
        }
        entities_valid = conclusion_entities.issubset(premise_entities)
        predicates_valid = conclusion_predicates.issubset(premise_predicates)
        valid = entities_valid and predicates_valid

        if valid:
            reason = "Conclusion matches premise patterns."
        else:
            # Join only the parts that failed, so the sentence has no stray
            # spaces when just one of them is unmatched.
            unmatched = []
            if not entities_valid:
                unmatched.append("entities")
            if not predicates_valid:
                unmatched.append("predicates")
            reason = f"Conclusion contains unmatched {' and '.join(unmatched)}."
        return {"valid": valid, "conclusion": conclusion, "reason": reason}
    except Exception as e:
        raise FileOperationError(f"Failed to process deduction: {str(e)}") from e
|
|
344
|
+
|
|
345
|
+
def summarize(self, text: str, max_length: int = 150, language: Optional[str] = None) -> str:
    """
    Summarize text using spaCy-based sentence ranking.

    Each sentence is scored by the document-wide frequency of the nouns,
    verbs and adjectives it contains, normalised by how many such words the
    sentence has. The ``max(1, max_length // 50)`` best sentences are joined
    in ranking order, and the result is cut to ``max_length`` words (with a
    trailing ``...``) if it is longer.

    Args:
        text (str): Text to summarize.
        max_length (int): Maximum length of the summary.
        language (Optional[str]): Language of the text. Currently not used by
            the implementation; the configured spaCy model decides.

    Returns:
        str: Summarized text, or an empty string when there are no sentences.

    Raises:
        FileOperationError: If summarization fails.
    """
    try:
        # An empty string has no sentences; skip loading the spaCy model.
        if text == "":
            return ""

        nlp = self._get_spacy()
        doc = nlp(text)
        sentence_spans = list(doc.sents)
        if not sentence_spans:
            return ""

        content_pos = ("NOUN", "VERB", "ADJ")
        keyword_freq = Counter(
            token.lemma_.lower()
            for token in doc
            if token.pos_ in content_pos and not token.is_stop
        )

        scored = []
        for span in sentence_spans:
            # Read the tokens straight from the sentence span: they were
            # already analysed with the document, so running the pipeline on
            # the sentence text a second time is unnecessary.
            sent_keywords = [
                token.lemma_.lower() for token in span if token.pos_ in content_pos
            ]
            # "+ 1" keeps the division defined for sentences with no keywords.
            score = sum(keyword_freq.get(k, 0) for k in sent_keywords) / (
                len(sent_keywords) + 1
            )
            scored.append((span.text, score))

        scored.sort(key=lambda item: item[1], reverse=True)
        sentence_budget = max(1, max_length // 50)
        summary = " ".join(sentence for sentence, _ in scored[:sentence_budget])

        words = summary.split()
        if len(words) > max_length:
            summary = " ".join(words[:max_length]) + "..."
        return summary
    except Exception as e:
        raise FileOperationError(f"Failed to process summarize: {str(e)}") from e
|