aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,375 @@
1
+ """
2
+ Document Graph Builder
3
+
4
+ Builds knowledge graphs from documents (PDF, DOCX, TXT, etc.).
5
+ """
6
+
7
+ import asyncio
8
+ from pathlib import Path
9
+ from typing import List, Optional, Dict, Any, Union
10
+ from dataclasses import dataclass, field
11
+
12
+ from aiecs.application.knowledge_graph.builder.graph_builder import (
13
+ GraphBuilder,
14
+ BuildResult,
15
+ )
16
+ from aiecs.application.knowledge_graph.builder.text_chunker import TextChunker
17
+ from aiecs.tools.docs.document_parser_tool import (
18
+ DocumentParserTool,
19
+ ParsingStrategy,
20
+ OutputFormat,
21
+ )
22
+
23
+
24
@dataclass
class DocumentBuildResult:
    """
    Outcome of turning a single document into graph content

    Holds document-level bookkeeping (path, type, chunk counts, success flag,
    error messages) together with the per-chunk BuildResult objects, and
    exposes entity/relation totals aggregated over those chunk results.
    """

    # Path of the source document, as a string
    document_path: str
    # Document type label supplied by whoever creates the result
    document_type: str
    # Number of chunks the document text was split into
    total_chunks: int = 0
    # Number of chunks for which a BuildResult was recorded
    chunks_processed: int = 0
    # One BuildResult per processed chunk
    chunk_results: List[BuildResult] = field(default_factory=list)
    # Overall success flag for the document
    success: bool = True
    # Human-readable error messages collected while processing
    errors: List[str] = field(default_factory=list)

    @property
    def total_entities_added(self) -> int:
        """Sum of `entities_added` over every chunk result"""
        entity_total = 0
        for chunk_result in self.chunk_results:
            entity_total += chunk_result.entities_added
        return entity_total

    @property
    def total_relations_added(self) -> int:
        """Sum of `relations_added` over every chunk result"""
        relation_total = 0
        for chunk_result in self.chunk_results:
            relation_total += chunk_result.relations_added
        return relation_total
49
+
50
+
51
class DocumentGraphBuilder:
    """
    Build knowledge graphs from documents

    Supports multiple document formats:
    - PDF
    - DOCX (Microsoft Word)
    - TXT (Plain text)
    - And more via AIECS DocumentParserTool

    For large documents, automatically chunks text into manageable pieces.
    Each chunk is handed to the wrapped GraphBuilder; a chunk that fails is
    recorded as a failed BuildResult and does not abort the remaining chunks,
    regardless of whether chunks are processed in parallel or sequentially.

    Example:
    ```python
    builder = DocumentGraphBuilder(
        graph_builder=graph_builder,
        chunk_size=1000
    )

    result = await builder.build_from_document("research_paper.pdf")

    print(f"Processed {result.total_chunks} chunks")
    print(f"Added {result.total_entities_added} entities")
    print(f"Added {result.total_relations_added} relations")
    ```
    """

    def __init__(
        self,
        graph_builder: GraphBuilder,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        enable_chunking: bool = True,
        parallel_chunks: bool = True,
        max_parallel_chunks: int = 3,
    ):
        """
        Initialize document graph builder

        Args:
            graph_builder: GraphBuilder instance for text processing
            chunk_size: Size of text chunks (in characters)
            chunk_overlap: Overlap between chunks
            enable_chunking: Whether to chunk large documents
            parallel_chunks: Process chunks in parallel
            max_parallel_chunks: Maximum parallel chunk processing
        """
        self.graph_builder = graph_builder
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enable_chunking = enable_chunking
        self.parallel_chunks = parallel_chunks
        self.max_parallel_chunks = max_parallel_chunks

        # Initialize document parser (will read config from environment
        # variables)
        self.document_parser = DocumentParserTool()

        # Initialize text chunker
        self.text_chunker = TextChunker(
            chunk_size=chunk_size,
            overlap=chunk_overlap,
            respect_sentences=True,
        )

    async def build_from_document(
        self,
        document_path: Union[str, Path],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> DocumentBuildResult:
        """
        Build knowledge graph from a document

        Never raises for processing problems: any exception is captured in
        the returned result (`success=False` plus an entry in `errors`).

        Args:
            document_path: Path to document file
            metadata: Optional metadata to attach to extracted entities/relations

        Returns:
            DocumentBuildResult with statistics
        """
        document_path = str(document_path)
        # Resolve the type up front so that failed results still report it.
        result = DocumentBuildResult(
            document_path=document_path,
            document_type=self._detect_document_type(document_path),
        )

        try:
            # Step 1: Parse document to text
            text = await self._parse_document(document_path)

            if not text or not text.strip():
                result.success = False
                result.errors.append("Document parsing returned empty text")
                return result

            # Step 2: Chunk text if needed
            if self.enable_chunking and len(text) > self.chunk_size:
                chunks = self.text_chunker.chunk_text(text, metadata={"document": document_path})
            else:
                # Single chunk (small document)
                from aiecs.application.knowledge_graph.builder.text_chunker import (
                    TextChunk,
                )

                chunks = [
                    TextChunk(
                        text=text,
                        start_char=0,
                        end_char=len(text),
                        chunk_index=0,
                        metadata={"document": document_path},
                    )
                ]
            result.total_chunks = len(chunks)

            if not chunks:
                # Without this message the result would be a failure with
                # an empty error list, which is impossible to diagnose.
                result.success = False
                result.errors.append("Text chunking produced no chunks")
                return result

            # Step 3: Process each chunk
            if self.parallel_chunks and len(chunks) > 1:
                chunk_results = await self._process_chunks_parallel(chunks, document_path, metadata)
            else:
                chunk_results = await self._process_chunks_sequential(
                    chunks, document_path, metadata
                )

            result.chunk_results = chunk_results
            result.chunks_processed = len(chunk_results)

            # Check if all chunks succeeded
            failed_chunks = [r for r in chunk_results if not r.success]
            if failed_chunks:
                result.errors.append(f"{len(failed_chunks)} chunks failed processing")
                # Surface the per-chunk details at document level too.
                for failed in failed_chunks:
                    result.errors.extend(failed.errors)

            # At least some chunks succeeded
            result.success = len(failed_chunks) < len(chunks)

        except Exception as e:
            result.success = False
            result.errors.append(f"Document processing failed: {str(e)}")

        return result

    async def build_from_documents(
        self,
        document_paths: List[Union[str, Path]],
        parallel: bool = True,
        max_parallel: int = 3,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[DocumentBuildResult]:
        """
        Build knowledge graph from multiple documents

        Args:
            document_paths: List of document paths
            parallel: Process documents in parallel
            max_parallel: Maximum parallel documents (values below 1 are
                treated as 1, since a zero-sized semaphore would never
                let any document start)
            metadata: Optional metadata forwarded to every document build

        Returns:
            List of DocumentBuildResult objects, in the order of
            `document_paths`
        """
        if not parallel:
            # Sequential processing
            sequential_results: List[DocumentBuildResult] = []
            for doc_path in document_paths:
                sequential_results.append(await self.build_from_document(doc_path, metadata))
            return sequential_results

        semaphore = asyncio.Semaphore(max(1, max_parallel))

        async def process_one(doc_path):
            async with semaphore:
                return await self.build_from_document(doc_path, metadata)

        tasks = [process_one(doc_path) for doc_path in document_paths]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions - convert all to DocumentBuildResult
        results: List[DocumentBuildResult] = []
        for doc_path, outcome in zip(document_paths, gather_results):
            if isinstance(outcome, Exception):
                results.append(self._failed_document_result(doc_path, str(outcome)))
            elif isinstance(outcome, DocumentBuildResult):
                results.append(outcome)
            else:
                # Fallback for unexpected types
                results.append(
                    self._failed_document_result(
                        doc_path, f"Unexpected result type: {type(outcome)}"
                    )
                )

        return results

    @staticmethod
    def _detect_document_type(document_path: str) -> str:
        """Return the lower-cased file extension without its dot, or "unknown" if there is none"""
        return Path(document_path).suffix[1:].lower() or "unknown"

    @staticmethod
    def _failed_document_result(document_path: Union[str, Path], message: str) -> DocumentBuildResult:
        """Create a failed DocumentBuildResult carrying a single error message"""
        failed = DocumentBuildResult(
            document_path=str(document_path),
            document_type="unknown",
            success=False,
        )
        failed.errors.append(message)
        return failed

    async def _parse_document(self, document_path: str) -> str:
        """
        Parse document to text using AIECS document parser

        If the parser raises, the file is read as UTF-8 plain text instead.

        Args:
            document_path: Path to document

        Returns:
            Extracted text content (empty string when the parser returns
            neither a dict nor a string)

        Raises:
            RuntimeError: If both the parser and the plain-text fallback
                fail; the message contains both errors and the fallback
                error is attached as the cause
        """
        try:
            # Use document parser tool
            parse_result = self.document_parser.parse_document(
                source=document_path,
                strategy=ParsingStrategy.TEXT_ONLY,
                output_format=OutputFormat.TEXT,
            )
        except Exception as parse_error:
            # Fallback: try reading as plain text
            try:
                with open(document_path, "r", encoding="utf-8") as f:
                    return f.read()
            except Exception as fallback_error:
                # Report the parser error as well: for non-text formats the
                # fallback error alone (e.g. a decode error) hides the
                # real reason parsing failed.
                raise RuntimeError(
                    f"Failed to parse document: {str(fallback_error)} "
                    f"(parser error: {str(parse_error)})"
                ) from fallback_error

        if isinstance(parse_result, dict):
            # A present-but-None "content" is normalised to empty text.
            return parse_result.get("content") or ""
        if isinstance(parse_result, str):
            return parse_result
        return ""

    async def _build_chunk(
        self,
        chunk,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ):
        """
        Run the graph builder on one chunk

        Shared by the parallel and sequential paths so that both attach
        identical metadata and source identifiers.

        Args:
            chunk: Object exposing text, chunk_index, start_char, end_char
            document_path: Source document path
            metadata: Optional caller metadata; its keys override the
                chunk-derived keys on collision

        Returns:
            Whatever `graph_builder.build_from_text` returns
        """
        chunk_metadata = {
            "document": document_path,
            "chunk_index": chunk.chunk_index,
            "chunk_start": chunk.start_char,
            "chunk_end": chunk.end_char,
        }
        if metadata:
            chunk_metadata.update(metadata)

        source = f"{document_path}#chunk{chunk.chunk_index}"
        return await self.graph_builder.build_from_text(
            text=chunk.text, source=source, metadata=chunk_metadata
        )

    async def _process_chunks_parallel(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks in parallel

        Concurrency is bounded by `max_parallel_chunks` (values below 1 are
        treated as 1). A chunk that raises is converted into a failed
        BuildResult instead of propagating.

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects, one per chunk, in chunk order
        """
        semaphore = asyncio.Semaphore(max(1, self.max_parallel_chunks))

        async def process_chunk(chunk):
            async with semaphore:
                return await self._build_chunk(chunk, document_path, metadata)

        tasks = [process_chunk(chunk) for chunk in chunks]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions - convert all to BuildResult
        results: List[BuildResult] = []
        for i, outcome in enumerate(gather_results):
            if isinstance(outcome, Exception):
                error_result = BuildResult(success=False)
                error_result.errors.append(f"Chunk {i} failed: {str(outcome)}")
                results.append(error_result)
            elif isinstance(outcome, BuildResult):
                results.append(outcome)
            else:
                # Fallback for unexpected types
                error_result = BuildResult(success=False)
                error_result.errors.append(f"Unexpected result type: {type(outcome)}")
                results.append(error_result)

        return results

    async def _process_chunks_sequential(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks sequentially

        A chunk that raises is converted into a failed BuildResult (same
        message format as the parallel path) so that one bad chunk does not
        discard the results of the chunks around it.

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects, one per chunk, in chunk order
        """
        results: List[BuildResult] = []

        for i, chunk in enumerate(chunks):
            try:
                chunk_result = await self._build_chunk(chunk, document_path, metadata)
            except Exception as chunk_error:
                chunk_result = BuildResult(success=False)
                chunk_result.errors.append(f"Chunk {i} failed: {str(chunk_error)}")
            results.append(chunk_result)

        return results