aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Graph Builder
|
|
3
|
+
|
|
4
|
+
Builds knowledge graphs from documents (PDF, DOCX, TXT, etc.).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Dict, Any, Union
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
from aiecs.application.knowledge_graph.builder.graph_builder import (
|
|
13
|
+
GraphBuilder,
|
|
14
|
+
BuildResult,
|
|
15
|
+
)
|
|
16
|
+
from aiecs.application.knowledge_graph.builder.text_chunker import TextChunker
|
|
17
|
+
from aiecs.tools.docs.document_parser_tool import (
|
|
18
|
+
DocumentParserTool,
|
|
19
|
+
ParsingStrategy,
|
|
20
|
+
OutputFormat,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class DocumentBuildResult:
    """
    Outcome of a single document-to-graph build

    Carries document-level bookkeeping next to the per-chunk BuildResult
    objects. The entity/relation totals are not stored; they are recomputed
    from ``chunk_results`` every time the properties are read.
    """

    # Path of the source document, as passed to the builder.
    document_path: str
    # Document type label assigned by the builder (e.g. a file extension).
    document_type: str
    # Number of chunks the document text was split into.
    total_chunks: int = 0
    # Number of chunks for which a BuildResult was collected.
    chunks_processed: int = 0
    # One BuildResult per processed chunk.
    chunk_results: List[BuildResult] = field(default_factory=list)
    # Overall success flag for the document.
    success: bool = True
    # Human-readable error messages gathered while processing.
    errors: List[str] = field(default_factory=list)

    @property
    def total_entities_added(self) -> int:
        """Sum of ``entities_added`` over every chunk result"""
        entity_count = 0
        for chunk_result in self.chunk_results:
            entity_count += chunk_result.entities_added
        return entity_count

    @property
    def total_relations_added(self) -> int:
        """Sum of ``relations_added`` over every chunk result"""
        relation_count = 0
        for chunk_result in self.chunk_results:
            relation_count += chunk_result.relations_added
        return relation_count
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DocumentGraphBuilder:
    """
    Build knowledge graphs from documents

    A document is parsed to plain text with DocumentParserTool, optionally
    split into chunks with TextChunker, and every chunk is handed to the
    wrapped GraphBuilder via ``build_from_text``.

    Supported formats are whatever DocumentParserTool can parse (the original
    documentation lists PDF, DOCX and TXT). If the parser raises, the file is
    read as UTF-8 plain text as a fallback.

    For large documents, automatically chunks text into manageable pieces.

    Example:
        ```python
        builder = DocumentGraphBuilder(
            graph_builder=graph_builder,
            chunk_size=1000
        )

        result = await builder.build_from_document("research_paper.pdf")

        print(f"Processed {result.total_chunks} chunks")
        print(f"Added {result.total_entities_added} entities")
        print(f"Added {result.total_relations_added} relations")
        ```
    """

    def __init__(
        self,
        graph_builder: GraphBuilder,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        enable_chunking: bool = True,
        parallel_chunks: bool = True,
        max_parallel_chunks: int = 3,
    ):
        """
        Initialize document graph builder

        Args:
            graph_builder: GraphBuilder instance for text processing
            chunk_size: Size of text chunks (in characters); also the text
                length above which a document is chunked at all
            chunk_overlap: Overlap between chunks (forwarded to TextChunker)
            enable_chunking: Whether to chunk large documents
            parallel_chunks: Process chunks in parallel
            max_parallel_chunks: Maximum parallel chunk processing; values
                below 1 are treated as 1 when chunks are processed
        """
        self.graph_builder = graph_builder
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enable_chunking = enable_chunking
        self.parallel_chunks = parallel_chunks
        self.max_parallel_chunks = max_parallel_chunks

        # Initialize document parser (will read config from environment
        # variables)
        self.document_parser = DocumentParserTool()

        # Initialize text chunker
        self.text_chunker = TextChunker(
            chunk_size=chunk_size,
            overlap=chunk_overlap,
            respect_sentences=True,
        )

    async def build_from_document(
        self,
        document_path: Union[str, Path],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> DocumentBuildResult:
        """
        Build knowledge graph from a document

        Never raises for ordinary errors: any ``Exception`` is recorded in
        ``result.errors`` and reported through ``result.success``.

        Args:
            document_path: Path to document file
            metadata: Optional metadata to attach to extracted entities/relations

        Returns:
            DocumentBuildResult with statistics. ``success`` is True when at
            least one chunk was processed successfully.
        """
        document_path = str(document_path)
        # Derive the type from the path up front so failed builds still report
        # it; paths without an extension keep the "unknown" label.
        result = DocumentBuildResult(
            document_path=document_path,
            document_type=self._document_type_from_path(document_path),
        )

        try:
            # Step 1: Parse document to text
            text = await self._parse_document(document_path)

            if not text or not text.strip():
                result.success = False
                result.errors.append("Document parsing returned empty text")
                return result

            # Step 2: Chunk text if needed
            if self.enable_chunking and len(text) > self.chunk_size:
                chunks = self.text_chunker.chunk_text(text, metadata={"document": document_path})
            else:
                # Single chunk (small document)
                from aiecs.application.knowledge_graph.builder.text_chunker import (
                    TextChunk,
                )

                chunks = [
                    TextChunk(
                        text=text,
                        start_char=0,
                        end_char=len(text),
                        chunk_index=0,
                        metadata={"document": document_path},
                    )
                ]
            result.total_chunks = len(chunks)

            if not chunks:
                # Without this the build would be flagged as failed with no
                # explanation at all.
                result.success = False
                result.errors.append("Text chunking produced no chunks")
                return result

            # Step 3: Process each chunk
            if self.parallel_chunks and len(chunks) > 1:
                chunk_results = await self._process_chunks_parallel(chunks, document_path, metadata)
            else:
                chunk_results = await self._process_chunks_sequential(
                    chunks, document_path, metadata
                )

            result.chunk_results = chunk_results
            result.chunks_processed = len(chunk_results)

            # Check if all chunks succeeded
            failed_chunks = [r for r in chunk_results if not r.success]
            if failed_chunks:
                result.errors.append(f"{len(failed_chunks)} chunks failed processing")
                # Surface the per-chunk messages so callers do not have to dig
                # through chunk_results to learn why chunks failed.
                for failed in failed_chunks:
                    result.errors.extend(failed.errors)

            result.success = len(failed_chunks) < len(chunks)  # At least some chunks succeeded

        except Exception as e:
            result.success = False
            result.errors.append(f"Document processing failed: {str(e)}")

        return result

    async def build_from_documents(
        self,
        document_paths: List[Union[str, Path]],
        parallel: bool = True,
        max_parallel: int = 3,
    ) -> List[DocumentBuildResult]:
        """
        Build knowledge graph from multiple documents

        Args:
            document_paths: List of document paths
            parallel: Process documents in parallel
            max_parallel: Maximum parallel documents; values below 1 are
                treated as 1

        Returns:
            List of DocumentBuildResult objects, in the order of
            ``document_paths``
        """
        if not parallel:
            # Sequential processing
            return [await self.build_from_document(doc_path) for doc_path in document_paths]

        # Semaphore(0) would block every task forever and a negative value
        # raises, so clamp the limit to at least one concurrent document.
        semaphore = asyncio.Semaphore(max(1, max_parallel))

        async def process_one(doc_path):
            async with semaphore:
                return await self.build_from_document(doc_path)

        tasks = [process_one(doc_path) for doc_path in document_paths]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions - convert all to DocumentBuildResult
        results: List[DocumentBuildResult] = []
        for doc_path, outcome in zip(document_paths, gather_results):
            if isinstance(outcome, DocumentBuildResult):
                results.append(outcome)
                continue

            if isinstance(outcome, Exception):
                message = str(outcome)
            else:
                # Fallback for unexpected types
                message = f"Unexpected result type: {type(outcome)}"

            error_result = DocumentBuildResult(
                document_path=str(doc_path),
                document_type="unknown",
                success=False,
            )
            error_result.errors.append(message)
            results.append(error_result)

        return results

    @staticmethod
    def _document_type_from_path(document_path: str) -> str:
        """Return the lower-cased file extension without the dot, or "unknown" if there is none"""
        return Path(document_path).suffix[1:].lower() or "unknown"

    async def _parse_document(self, document_path: str) -> str:
        """
        Parse document to text using AIECS document parser

        If the parser raises, the file is read as UTF-8 plain text instead.

        Args:
            document_path: Path to document

        Returns:
            Extracted text content; an empty string when the parser returns
            something that is neither a dict nor a str

        Raises:
            RuntimeError: If both the parser and the plain-text fallback fail
        """
        try:
            # Use document parser tool
            parse_result = self.document_parser.parse_document(
                source=document_path,
                strategy=ParsingStrategy.TEXT_ONLY,
                output_format=OutputFormat.TEXT,
            )
        except Exception as parse_error:
            # Fallback: try reading as plain text
            try:
                with open(document_path, "r", encoding="utf-8") as f:
                    return f.read()
            except Exception as fallback_error:
                # Report both failures: the fallback error alone (often a
                # UnicodeDecodeError) hides why the real parser gave up.
                raise RuntimeError(
                    f"Failed to parse document: {str(fallback_error)} "
                    f"(document parser error: {str(parse_error)})"
                ) from fallback_error

        if isinstance(parse_result, dict):
            return parse_result.get("content", "")
        if isinstance(parse_result, str):
            return parse_result
        return ""

    async def _build_chunk(
        self,
        chunk,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> BuildResult:
        """Run the graph builder on one chunk, tagging it with its position in the document"""
        chunk_metadata = {
            "document": document_path,
            "chunk_index": chunk.chunk_index,
            "chunk_start": chunk.start_char,
            "chunk_end": chunk.end_char,
        }
        if metadata:
            # Caller-supplied keys win over the generated ones.
            chunk_metadata.update(metadata)

        source = f"{document_path}#chunk{chunk.chunk_index}"
        return await self.graph_builder.build_from_text(
            text=chunk.text, source=source, metadata=chunk_metadata
        )

    @staticmethod
    def _failed_chunk_result(index: int, error: BaseException) -> BuildResult:
        """Create a failed BuildResult describing the exception raised for chunk ``index``"""
        error_result = BuildResult(success=False)
        error_result.errors.append(f"Chunk {index} failed: {str(error)}")
        return error_result

    async def _process_chunks_parallel(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks in parallel

        At most ``max_parallel_chunks`` chunks (minimum 1) are processed at
        the same time. A chunk that raises is reported as a failed
        BuildResult instead of aborting the others.

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects, one per chunk, in chunk order
        """
        # Semaphore(0) would block every task forever and a negative value
        # raises, so clamp the limit to at least one concurrent chunk.
        semaphore = asyncio.Semaphore(max(1, self.max_parallel_chunks))

        async def process_chunk(chunk):
            async with semaphore:
                return await self._build_chunk(chunk, document_path, metadata)

        tasks = [process_chunk(chunk) for chunk in chunks]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions - convert all to BuildResult
        results: List[BuildResult] = []
        for i, outcome in enumerate(gather_results):
            if isinstance(outcome, Exception):
                results.append(self._failed_chunk_result(i, outcome))
            elif isinstance(outcome, BuildResult):
                results.append(outcome)
            else:
                # Fallback for unexpected types
                error_result = BuildResult(success=False)
                error_result.errors.append(f"Unexpected result type: {type(outcome)}")
                results.append(error_result)

        return results

    async def _process_chunks_sequential(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks sequentially

        A chunk that raises is reported as a failed BuildResult, matching the
        parallel path, so the results of the other chunks are kept.

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects, one per chunk, in chunk order
        """
        results: List[BuildResult] = []

        for i, chunk in enumerate(chunks):
            try:
                results.append(await self._build_chunk(chunk, document_path, metadata))
            except Exception as chunk_error:
                results.append(self._failed_chunk_result(i, chunk_error))

        return results
|