aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of aiecs might be problematic.

Files changed (340)
  1. aiecs/__init__.py +13 -16
  2. aiecs/__main__.py +7 -7
  3. aiecs/aiecs_client.py +269 -75
  4. aiecs/application/executors/operation_executor.py +79 -54
  5. aiecs/application/knowledge_graph/__init__.py +7 -0
  6. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  7. aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
  8. aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
  11. aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
  12. aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
  13. aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
  14. aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
  15. aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
  16. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  17. aiecs/application/knowledge_graph/extractors/base.py +98 -0
  18. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
  19. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
  20. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
  21. aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
  22. aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
  23. aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
  24. aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
  25. aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
  26. aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
  27. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
  28. aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
  29. aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
  30. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
  31. aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
  32. aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
  33. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
  34. aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
  35. aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
  36. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  37. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
  38. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
  39. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  40. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
  41. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  42. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  43. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
  44. aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
  45. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
  46. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  47. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  48. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
  49. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
  50. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
  51. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
  52. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
  53. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
  54. aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
  55. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
  56. aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
  57. aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
  58. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
  59. aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
  60. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  61. aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
  62. aiecs/application/knowledge_graph/search/reranker.py +293 -0
  63. aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
  64. aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
  65. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  66. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
  67. aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
  68. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  69. aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
  70. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  71. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
  72. aiecs/common/__init__.py +9 -0
  73. aiecs/common/knowledge_graph/__init__.py +17 -0
  74. aiecs/common/knowledge_graph/runnable.py +471 -0
  75. aiecs/config/__init__.py +20 -5
  76. aiecs/config/config.py +762 -31
  77. aiecs/config/graph_config.py +131 -0
  78. aiecs/config/tool_config.py +399 -0
  79. aiecs/core/__init__.py +29 -13
  80. aiecs/core/interface/__init__.py +2 -2
  81. aiecs/core/interface/execution_interface.py +22 -22
  82. aiecs/core/interface/storage_interface.py +37 -88
  83. aiecs/core/registry/__init__.py +31 -0
  84. aiecs/core/registry/service_registry.py +92 -0
  85. aiecs/domain/__init__.py +270 -1
  86. aiecs/domain/agent/__init__.py +191 -0
  87. aiecs/domain/agent/base_agent.py +3870 -0
  88. aiecs/domain/agent/exceptions.py +99 -0
  89. aiecs/domain/agent/graph_aware_mixin.py +569 -0
  90. aiecs/domain/agent/hybrid_agent.py +1435 -0
  91. aiecs/domain/agent/integration/__init__.py +29 -0
  92. aiecs/domain/agent/integration/context_compressor.py +216 -0
  93. aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
  94. aiecs/domain/agent/integration/protocols.py +281 -0
  95. aiecs/domain/agent/integration/retry_policy.py +218 -0
  96. aiecs/domain/agent/integration/role_config.py +213 -0
  97. aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
  98. aiecs/domain/agent/lifecycle.py +291 -0
  99. aiecs/domain/agent/llm_agent.py +692 -0
  100. aiecs/domain/agent/memory/__init__.py +12 -0
  101. aiecs/domain/agent/memory/conversation.py +1124 -0
  102. aiecs/domain/agent/migration/__init__.py +14 -0
  103. aiecs/domain/agent/migration/conversion.py +163 -0
  104. aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
  105. aiecs/domain/agent/models.py +884 -0
  106. aiecs/domain/agent/observability.py +479 -0
  107. aiecs/domain/agent/persistence.py +449 -0
  108. aiecs/domain/agent/prompts/__init__.py +29 -0
  109. aiecs/domain/agent/prompts/builder.py +159 -0
  110. aiecs/domain/agent/prompts/formatters.py +187 -0
  111. aiecs/domain/agent/prompts/template.py +255 -0
  112. aiecs/domain/agent/registry.py +253 -0
  113. aiecs/domain/agent/tool_agent.py +444 -0
  114. aiecs/domain/agent/tools/__init__.py +15 -0
  115. aiecs/domain/agent/tools/schema_generator.py +364 -0
  116. aiecs/domain/community/__init__.py +155 -0
  117. aiecs/domain/community/agent_adapter.py +469 -0
  118. aiecs/domain/community/analytics.py +432 -0
  119. aiecs/domain/community/collaborative_workflow.py +648 -0
  120. aiecs/domain/community/communication_hub.py +634 -0
  121. aiecs/domain/community/community_builder.py +320 -0
  122. aiecs/domain/community/community_integration.py +796 -0
  123. aiecs/domain/community/community_manager.py +803 -0
  124. aiecs/domain/community/decision_engine.py +849 -0
  125. aiecs/domain/community/exceptions.py +231 -0
  126. aiecs/domain/community/models/__init__.py +33 -0
  127. aiecs/domain/community/models/community_models.py +234 -0
  128. aiecs/domain/community/resource_manager.py +461 -0
  129. aiecs/domain/community/shared_context_manager.py +589 -0
  130. aiecs/domain/context/__init__.py +40 -10
  131. aiecs/domain/context/context_engine.py +1910 -0
  132. aiecs/domain/context/conversation_models.py +87 -53
  133. aiecs/domain/context/graph_memory.py +582 -0
  134. aiecs/domain/execution/model.py +12 -4
  135. aiecs/domain/knowledge_graph/__init__.py +19 -0
  136. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  137. aiecs/domain/knowledge_graph/models/entity.py +148 -0
  138. aiecs/domain/knowledge_graph/models/evidence.py +178 -0
  139. aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
  140. aiecs/domain/knowledge_graph/models/path.py +171 -0
  141. aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
  142. aiecs/domain/knowledge_graph/models/query.py +261 -0
  143. aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
  144. aiecs/domain/knowledge_graph/models/relation.py +202 -0
  145. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  146. aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
  147. aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
  148. aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
  149. aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
  150. aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
  151. aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
  152. aiecs/domain/task/dsl_processor.py +172 -56
  153. aiecs/domain/task/model.py +20 -8
  154. aiecs/domain/task/task_context.py +27 -24
  155. aiecs/infrastructure/__init__.py +0 -2
  156. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  157. aiecs/infrastructure/graph_storage/base.py +837 -0
  158. aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
  159. aiecs/infrastructure/graph_storage/cache.py +424 -0
  160. aiecs/infrastructure/graph_storage/distributed.py +223 -0
  161. aiecs/infrastructure/graph_storage/error_handling.py +380 -0
  162. aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
  163. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  164. aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
  165. aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
  166. aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
  167. aiecs/infrastructure/graph_storage/metrics.py +344 -0
  168. aiecs/infrastructure/graph_storage/migration.py +400 -0
  169. aiecs/infrastructure/graph_storage/pagination.py +483 -0
  170. aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
  171. aiecs/infrastructure/graph_storage/postgres.py +1563 -0
  172. aiecs/infrastructure/graph_storage/property_storage.py +353 -0
  173. aiecs/infrastructure/graph_storage/protocols.py +76 -0
  174. aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
  175. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  176. aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
  177. aiecs/infrastructure/graph_storage/streaming.py +487 -0
  178. aiecs/infrastructure/graph_storage/tenant.py +412 -0
  179. aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
  180. aiecs/infrastructure/messaging/websocket_manager.py +51 -35
  181. aiecs/infrastructure/monitoring/__init__.py +22 -0
  182. aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
  183. aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
  184. aiecs/infrastructure/monitoring/structured_logger.py +3 -7
  185. aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
  186. aiecs/infrastructure/persistence/__init__.py +14 -1
  187. aiecs/infrastructure/persistence/context_engine_client.py +184 -0
  188. aiecs/infrastructure/persistence/database_manager.py +67 -43
  189. aiecs/infrastructure/persistence/file_storage.py +180 -103
  190. aiecs/infrastructure/persistence/redis_client.py +74 -21
  191. aiecs/llm/__init__.py +73 -25
  192. aiecs/llm/callbacks/__init__.py +11 -0
  193. aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
  194. aiecs/llm/client_factory.py +224 -36
  195. aiecs/llm/client_resolver.py +155 -0
  196. aiecs/llm/clients/__init__.py +38 -0
  197. aiecs/llm/clients/base_client.py +324 -0
  198. aiecs/llm/clients/google_function_calling_mixin.py +457 -0
  199. aiecs/llm/clients/googleai_client.py +241 -0
  200. aiecs/llm/clients/openai_client.py +158 -0
  201. aiecs/llm/clients/openai_compatible_mixin.py +367 -0
  202. aiecs/llm/clients/vertex_client.py +897 -0
  203. aiecs/llm/clients/xai_client.py +201 -0
  204. aiecs/llm/config/__init__.py +51 -0
  205. aiecs/llm/config/config_loader.py +272 -0
  206. aiecs/llm/config/config_validator.py +206 -0
  207. aiecs/llm/config/model_config.py +143 -0
  208. aiecs/llm/protocols.py +149 -0
  209. aiecs/llm/utils/__init__.py +10 -0
  210. aiecs/llm/utils/validate_config.py +89 -0
  211. aiecs/main.py +140 -121
  212. aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
  213. aiecs/scripts/aid/__init__.py +19 -0
  214. aiecs/scripts/aid/module_checker.py +499 -0
  215. aiecs/scripts/aid/version_manager.py +235 -0
  216. aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
  217. aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
  218. aiecs/scripts/dependance_check/__init__.py +15 -0
  219. aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
  220. aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
  221. aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
  222. aiecs/scripts/dependance_patch/__init__.py +7 -0
  223. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  224. aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
  225. aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
  226. aiecs/scripts/knowledge_graph/__init__.py +3 -0
  227. aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
  228. aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
  229. aiecs/scripts/tools_develop/README.md +671 -0
  230. aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
  231. aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
  232. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  233. aiecs/scripts/tools_develop/__init__.py +21 -0
  234. aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
  235. aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
  236. aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
  237. aiecs/scripts/tools_develop/schema_coverage.py +511 -0
  238. aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
  239. aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
  240. aiecs/scripts/tools_develop/verify_tools.py +352 -0
  241. aiecs/tasks/__init__.py +0 -1
  242. aiecs/tasks/worker.py +115 -47
  243. aiecs/tools/__init__.py +194 -72
  244. aiecs/tools/apisource/__init__.py +99 -0
  245. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  246. aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
  247. aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
  248. aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
  249. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  250. aiecs/tools/apisource/monitoring/metrics.py +330 -0
  251. aiecs/tools/apisource/providers/__init__.py +112 -0
  252. aiecs/tools/apisource/providers/base.py +671 -0
  253. aiecs/tools/apisource/providers/census.py +397 -0
  254. aiecs/tools/apisource/providers/fred.py +535 -0
  255. aiecs/tools/apisource/providers/newsapi.py +409 -0
  256. aiecs/tools/apisource/providers/worldbank.py +352 -0
  257. aiecs/tools/apisource/reliability/__init__.py +12 -0
  258. aiecs/tools/apisource/reliability/error_handler.py +363 -0
  259. aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
  260. aiecs/tools/apisource/tool.py +832 -0
  261. aiecs/tools/apisource/utils/__init__.py +9 -0
  262. aiecs/tools/apisource/utils/validators.py +334 -0
  263. aiecs/tools/base_tool.py +415 -21
  264. aiecs/tools/docs/__init__.py +121 -0
  265. aiecs/tools/docs/ai_document_orchestrator.py +607 -0
  266. aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
  267. aiecs/tools/docs/content_insertion_tool.py +1320 -0
  268. aiecs/tools/docs/document_creator_tool.py +1323 -0
  269. aiecs/tools/docs/document_layout_tool.py +1160 -0
  270. aiecs/tools/docs/document_parser_tool.py +1011 -0
  271. aiecs/tools/docs/document_writer_tool.py +1829 -0
  272. aiecs/tools/knowledge_graph/__init__.py +17 -0
  273. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
  274. aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
  275. aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
  276. aiecs/tools/langchain_adapter.py +300 -138
  277. aiecs/tools/schema_generator.py +455 -0
  278. aiecs/tools/search_tool/__init__.py +100 -0
  279. aiecs/tools/search_tool/analyzers.py +581 -0
  280. aiecs/tools/search_tool/cache.py +264 -0
  281. aiecs/tools/search_tool/constants.py +128 -0
  282. aiecs/tools/search_tool/context.py +224 -0
  283. aiecs/tools/search_tool/core.py +778 -0
  284. aiecs/tools/search_tool/deduplicator.py +119 -0
  285. aiecs/tools/search_tool/error_handler.py +242 -0
  286. aiecs/tools/search_tool/metrics.py +343 -0
  287. aiecs/tools/search_tool/rate_limiter.py +172 -0
  288. aiecs/tools/search_tool/schemas.py +275 -0
  289. aiecs/tools/statistics/__init__.py +80 -0
  290. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
  291. aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
  292. aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
  293. aiecs/tools/statistics/data_loader_tool.py +555 -0
  294. aiecs/tools/statistics/data_profiler_tool.py +638 -0
  295. aiecs/tools/statistics/data_transformer_tool.py +580 -0
  296. aiecs/tools/statistics/data_visualizer_tool.py +498 -0
  297. aiecs/tools/statistics/model_trainer_tool.py +507 -0
  298. aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
  299. aiecs/tools/task_tools/__init__.py +49 -36
  300. aiecs/tools/task_tools/chart_tool.py +200 -184
  301. aiecs/tools/task_tools/classfire_tool.py +268 -267
  302. aiecs/tools/task_tools/image_tool.py +175 -131
  303. aiecs/tools/task_tools/office_tool.py +226 -146
  304. aiecs/tools/task_tools/pandas_tool.py +477 -121
  305. aiecs/tools/task_tools/report_tool.py +390 -142
  306. aiecs/tools/task_tools/research_tool.py +149 -79
  307. aiecs/tools/task_tools/scraper_tool.py +339 -145
  308. aiecs/tools/task_tools/stats_tool.py +448 -209
  309. aiecs/tools/temp_file_manager.py +26 -24
  310. aiecs/tools/tool_executor/__init__.py +18 -16
  311. aiecs/tools/tool_executor/tool_executor.py +364 -52
  312. aiecs/utils/LLM_output_structor.py +74 -48
  313. aiecs/utils/__init__.py +14 -3
  314. aiecs/utils/base_callback.py +0 -3
  315. aiecs/utils/cache_provider.py +696 -0
  316. aiecs/utils/execution_utils.py +50 -31
  317. aiecs/utils/prompt_loader.py +1 -0
  318. aiecs/utils/token_usage_repository.py +37 -11
  319. aiecs/ws/socket_server.py +14 -4
  320. {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
  321. aiecs-1.7.6.dist-info/RECORD +337 -0
  322. aiecs-1.7.6.dist-info/entry_points.txt +13 -0
  323. aiecs/config/registry.py +0 -19
  324. aiecs/domain/context/content_engine.py +0 -982
  325. aiecs/llm/base_client.py +0 -99
  326. aiecs/llm/openai_client.py +0 -125
  327. aiecs/llm/vertex_client.py +0 -186
  328. aiecs/llm/xai_client.py +0 -184
  329. aiecs/scripts/dependency_checker.py +0 -857
  330. aiecs/scripts/quick_dependency_check.py +0 -269
  331. aiecs/tools/task_tools/search_api.py +0 -7
  332. aiecs-1.0.1.dist-info/RECORD +0 -90
  333. aiecs-1.0.1.dist-info/entry_points.txt +0 -7
  334. /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
  335. /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
  336. /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
  337. /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
  338. {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
  339. {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
  340. {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
aiecs/application/knowledge_graph/builder/structured_pipeline.py
@@ -0,0 +1,1384 @@
+ """
+ Structured Data Pipeline
+
+ Import structured data (CSV, JSON, SPSS, Excel) into knowledge graphs using schema mappings.
+ """
+
+ import json
+ import logging
+ from pathlib import Path
+ from typing import List, Optional, Dict, Any, Callable, Union
+ from dataclasses import dataclass, field
+ from datetime import datetime
+
+ try:
+     import pandas as pd  # type: ignore[import-untyped]
+
+     PANDAS_AVAILABLE = True
+ except ImportError:
+     PANDAS_AVAILABLE = False
+
+ from aiecs.infrastructure.graph_storage.base import GraphStore
+ from aiecs.domain.knowledge_graph.models.entity import Entity
+ from aiecs.domain.knowledge_graph.models.relation import Relation
+ from aiecs.application.knowledge_graph.builder.schema_mapping import (
+     SchemaMapping,
+ )
+ from aiecs.application.knowledge_graph.builder.data_quality import (
+     DataQualityValidator,
+     ValidationConfig,
+     QualityReport,
+     RangeRule,
+ )
+ from aiecs.application.knowledge_graph.builder.import_optimizer import (
+     PerformanceMetrics,
+     BatchSizeOptimizer,
+     ParallelBatchProcessor,
+     MemoryTracker,
+     StreamingCSVReader,
+ )
+
+ # Import InferredSchema for type hints (avoid circular import)
+ from typing import TYPE_CHECKING
+ if TYPE_CHECKING:
+     from aiecs.application.knowledge_graph.builder.schema_inference import InferredSchema
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ImportResult:
+     """
+     Result of structured data import operation
+
+     Attributes:
+         success: Whether import completed successfully
+         entities_added: Number of entities added to graph
+         relations_added: Number of relations added to graph
+         rows_processed: Number of rows processed
+         rows_failed: Number of rows that failed to process
+         errors: List of errors encountered
+         warnings: List of warnings
+         quality_report: Data quality validation report (if validation enabled)
+         start_time: When import started
+         end_time: When import ended
+         duration_seconds: Total duration in seconds
+         performance_metrics: Detailed performance metrics (if tracking enabled)
+     """
+
+     success: bool = True
+     entities_added: int = 0
+     relations_added: int = 0
+     rows_processed: int = 0
+     rows_failed: int = 0
+     errors: List[str] = field(default_factory=list)
+     warnings: List[str] = field(default_factory=list)
+     quality_report: Optional[QualityReport] = None
+     start_time: Optional[datetime] = None
+     end_time: Optional[datetime] = None
+     duration_seconds: float = 0.0
+     performance_metrics: Optional[PerformanceMetrics] = None
+
+
+ class AggregationAccumulator:
+     """
+     Accumulator for incremental statistical aggregation
+
+     Computes statistics incrementally as data is processed in batches.
+     """
+
+     def __init__(self):
+         self.count = 0
+         self.sum = 0.0
+         self.sum_sq = 0.0  # Sum of squares for variance/std
+         self.min_val = float('inf')
+         self.max_val = float('-inf')
+         self.values = []  # For median (if needed)
+
+     def add(self, value: Any):
+         """Add a value to the accumulator"""
+         if value is None:
+             return
+
+         try:
+             num_val = float(value)
+         except (ValueError, TypeError):
+             return
+
+         self.count += 1
+         self.sum += num_val
+         self.sum_sq += num_val * num_val
+         self.min_val = min(self.min_val, num_val)
+         self.max_val = max(self.max_val, num_val)
+         self.values.append(num_val)
+
+     def get_mean(self) -> Optional[float]:
+         """Get mean value"""
+         if self.count == 0:
+             return None
+         return self.sum / self.count
+
+     def get_std(self) -> Optional[float]:
+         """Get standard deviation (sample std with Bessel's correction)"""
+         if self.count < 2:
+             return None
+         mean = self.get_mean()
+         if mean is None:
+             return None
+         # Use sample variance formula: sum((x - mean)^2) / (n - 1)
+         # Which equals: (sum(x^2) - n*mean^2) / (n - 1)
+         variance = (self.sum_sq - self.count * mean * mean) / (self.count - 1)
+         return variance ** 0.5 if variance >= 0 else 0.0
+
+     def get_variance(self) -> Optional[float]:
+         """Get variance (sample variance with Bessel's correction)"""
+         if self.count < 2:
+             return None
+         mean = self.get_mean()
+         if mean is None:
+             return None
+         # Use sample variance formula: (sum(x^2) - n*mean^2) / (n - 1)
+         return (self.sum_sq - self.count * mean * mean) / (self.count - 1)
+
+     def get_min(self) -> Optional[float]:
+         """Get minimum value"""
+         if self.count == 0:
+             return None
+         return self.min_val
+
+     def get_max(self) -> Optional[float]:
+         """Get maximum value"""
+         if self.count == 0:
+             return None
+         return self.max_val
+
+     def get_sum(self) -> Optional[float]:
+         """Get sum"""
+         if self.count == 0:
+             return None
+         return self.sum
+
+     def get_count(self) -> int:
+         """Get count"""
+         return self.count
+
+     def get_median(self) -> Optional[float]:
+         """Get median value"""
+         if self.count == 0:
+             return None
+         sorted_vals = sorted(self.values)
+         mid = self.count // 2
+         if self.count % 2 == 0:
+             return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2
+         return sorted_vals[mid]
+
+
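The accumulator relies on the identity sum((x - mean)^2) = sum(x^2) - n*mean^2 noted in the comments, so mean, variance, min, and max need only constant running state; the `values` list exists solely for the median. A quick sanity check against Python's `statistics` module, assuming only the class above (the input values are illustrative):

```python
import statistics

acc = AggregationAccumulator()
for v in [3, 1, 4, 1, 5, 9, 2, 6, None, "n/a"]:
    acc.add(v)  # None and non-numeric values are silently skipped by add()

vals = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]
assert acc.get_count() == 8
assert abs(acc.get_mean() - statistics.mean(vals)) < 1e-9
assert abs(acc.get_std() - statistics.stdev(vals)) < 1e-9  # Bessel-corrected, matching get_std()
assert (acc.get_min(), acc.get_max()) == (1.0, 9.0)
assert acc.get_median() == statistics.median(vals)  # 3.5
```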
+ class StructuredDataPipeline:
+     """
+     Pipeline for importing structured data (CSV, JSON, SPSS, Excel) into knowledge graphs
+
+     Uses SchemaMapping to map source data columns to entity and relation types.
+     Supports batch processing, progress tracking, and error handling.
+
+     Example:
+         ```python
+         # Define schema mapping
+         mapping = SchemaMapping(
+             entity_mappings=[
+                 EntityMapping(
+                     source_columns=["id", "name", "age"],
+                     entity_type="Person",
+                     property_mapping={"id": "id", "name": "name", "age": "age"}
+                 )
+             ],
+             relation_mappings=[
+                 RelationMapping(
+                     source_columns=["person_id", "company_id"],
+                     relation_type="WORKS_FOR",
+                     source_entity_column="person_id",
+                     target_entity_column="company_id"
+                 )
+             ]
+         )
+
+         # Create pipeline
+         pipeline = StructuredDataPipeline(
+             mapping=mapping,
+             graph_store=store
+         )
+
+         # Import CSV
+         result = await pipeline.import_from_csv("employees.csv")
+         print(f"Added {result.entities_added} entities, {result.relations_added} relations")
+         ```
+     """
+
+     def __init__(
+         self,
+         mapping: SchemaMapping,
+         graph_store: GraphStore,
+         batch_size: int = 100,
+         progress_callback: Optional[Callable[[str, float], None]] = None,
+         skip_errors: bool = True,
+         enable_parallel: bool = False,
+         max_workers: Optional[int] = None,
+         auto_tune_batch_size: bool = False,
+         enable_streaming: bool = False,
+         use_bulk_writes: bool = True,
+         track_performance: bool = True,
+     ):
+         """
+         Initialize structured data pipeline
+
+         Args:
+             mapping: Schema mapping configuration
+             graph_store: Graph storage to save entities/relations
+             batch_size: Number of rows to process in each batch (ignored if auto_tune_batch_size=True)
+             progress_callback: Optional callback for progress updates (message, progress_pct)
+             skip_errors: Whether to skip rows with errors and continue processing
+             enable_parallel: Enable parallel batch processing for faster imports
+             max_workers: Maximum number of parallel workers (default: CPU count - 1)
+             auto_tune_batch_size: Automatically tune batch size based on system resources
+             enable_streaming: Enable streaming mode for large files (memory-efficient)
+             use_bulk_writes: Use bulk write operations for better performance
+             track_performance: Track detailed performance metrics
+         """
+         # Validate mapping
+         validation_errors = mapping.validate_mapping()
+         if validation_errors:
+             raise ValueError(f"Invalid schema mapping: {validation_errors}")
+
+         self.mapping = mapping
+         self.graph_store = graph_store
+         self.batch_size = batch_size
+         self.progress_callback = progress_callback
+         self.skip_errors = skip_errors
+
+         # Performance optimization settings
+         self.enable_parallel = enable_parallel
+         self.max_workers = max_workers
+         self.auto_tune_batch_size = auto_tune_batch_size
+         self.enable_streaming = enable_streaming
+         self.use_bulk_writes = use_bulk_writes
+         self.track_performance = track_performance
+
+         # Initialize optimizers
+         self._batch_optimizer = BatchSizeOptimizer() if auto_tune_batch_size else None
+         self._memory_tracker = MemoryTracker() if track_performance else None
+
+         # Initialize aggregation tracking
+         self._aggregation_accumulators: Dict[str, Dict[str, Any]] = {}  # entity_type -> {property -> accumulator}
+
+         # Initialize data quality validator if validation config is provided
+         self.validator: Optional[DataQualityValidator] = None
+         if mapping.validation_config:
+             self.validator = self._create_validator_from_config(mapping.validation_config)
+
+         if not PANDAS_AVAILABLE:
+             logger.warning("pandas not available. CSV import will use basic CSV reader. " "Install pandas for better performance: pip install pandas")
+
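The `progress_callback` receives a human-readable message and a completion percentage, as the batch loop in `_process_rows` below shows. A minimal sketch of wiring one up; the callback body is illustrative, and `mapping`/`store` are assumed to be built as in the class docstring example:

```python
def log_progress(message: str, progress_pct: float) -> None:
    # e.g. prints "[ 42.0%] Processing rows 101-200 of 476"
    print(f"[{progress_pct:5.1f}%] {message}")

pipeline = StructuredDataPipeline(
    mapping=mapping,        # a SchemaMapping, e.g. as in the class docstring
    graph_store=store,      # any GraphStore implementation
    batch_size=200,
    progress_callback=log_progress,
    skip_errors=True,
)
```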
+     @staticmethod
+     def infer_schema_from_csv(
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         sample_size: int = 1000,
+     ) -> 'InferredSchema':
+         """
+         Infer schema mapping from CSV file
+
+         Analyzes CSV structure and content to automatically generate schema mappings.
+
+         Args:
+             file_path: Path to CSV file
+             encoding: File encoding (default: utf-8)
+             sample_size: Number of rows to sample for inference (default: 1000)
+
+         Returns:
+             InferredSchema with entity and relation mappings
+
+         Example:
+             ```python
+             # Infer schema from CSV
+             inferred = StructuredDataPipeline.infer_schema_from_csv("data.csv")
+
+             # Review and modify if needed
+             print(f"Inferred entity types: {[em.entity_type for em in inferred.entity_mappings]}")
+             print(f"Warnings: {inferred.warnings}")
+
+             # Use inferred schema
+             mapping = inferred.to_schema_mapping()
+             pipeline = StructuredDataPipeline(mapping, graph_store)
+             ```
+         """
+         from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference
+
+         inference = SchemaInference(sample_size=sample_size)
+         return inference.infer_from_csv(file_path, encoding=encoding)
+
+     @staticmethod
+     def infer_schema_from_spss(
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         sample_size: int = 1000,
+     ) -> 'InferredSchema':
+         """
+         Infer schema mapping from SPSS file
+
+         Uses SPSS variable labels and value labels to generate schema mappings.
+
+         Args:
+             file_path: Path to SPSS file
+             encoding: File encoding (default: utf-8)
+             sample_size: Number of rows to sample for inference (default: 1000)
+
+         Returns:
+             InferredSchema with entity and relation mappings
+         """
+         from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference
+
+         inference = SchemaInference(sample_size=sample_size)
+         return inference.infer_from_spss(file_path, encoding=encoding)
+
+     @staticmethod
+     def infer_schema_from_dataframe(
+         df: 'pd.DataFrame',
+         entity_type_hint: Optional[str] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         sample_size: int = 1000,
+     ) -> 'InferredSchema':
+         """
+         Infer schema mapping from pandas DataFrame
+
+         Args:
+             df: DataFrame to analyze
+             entity_type_hint: Optional hint for entity type name
+             metadata: Optional metadata (e.g., SPSS variable labels)
+             sample_size: Number of rows to sample for inference (default: 1000)
+
+         Returns:
+             InferredSchema with entity and relation mappings
+         """
+         from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference
+
+         inference = SchemaInference(sample_size=sample_size)
+         return inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint, metadata=metadata)
+
+     @staticmethod
+     def create_with_auto_reshape(
+         file_path: Union[str, Path],
+         graph_store: GraphStore,
+         entity_type_hint: Optional[str] = None,
+         reshape_threshold: int = 50,
+         **kwargs,
+     ) -> 'StructuredDataPipeline':
+         """
+         Create pipeline with automatic reshaping for wide format data
+
+         Detects wide format data and automatically reshapes to normalized structure
+         before creating the pipeline.
+
+         Args:
+             file_path: Path to data file (CSV, SPSS, Excel)
+             graph_store: Graph storage to save entities/relations
+             entity_type_hint: Optional hint for entity type name
+             reshape_threshold: Minimum columns to trigger reshaping (default: 50)
+             **kwargs: Additional arguments for StructuredDataPipeline
+
+         Returns:
+             StructuredDataPipeline configured for the data
+
+         Example:
+             ```python
+             # Automatically detect and reshape wide format data
+             pipeline = StructuredDataPipeline.create_with_auto_reshape(
+                 "wide_data.csv",
+                 graph_store,
+                 entity_type_hint="Sample"
+             )
+
+             # Import reshaped data
+             result = await pipeline.import_from_csv("wide_data.csv")
+             ```
+         """
+         from aiecs.application.knowledge_graph.builder.data_reshaping import DataReshaping
+         from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference
+
+         if not PANDAS_AVAILABLE:
+             raise ImportError("pandas is required for automatic reshaping")
+
+         # Load data to analyze
+         file_path_str = str(file_path)
+         if file_path_str.endswith('.csv'):
+             df = pd.read_csv(file_path, nrows=1000)  # Sample for analysis
+         elif file_path_str.endswith(('.sav', '.por')):
+             import pyreadstat
+             df, _ = pyreadstat.read_sav(file_path_str, row_limit=1000)
+         elif file_path_str.endswith(('.xlsx', '.xls')):
+             df = pd.read_excel(file_path, nrows=1000)
+         else:
+             raise ValueError(f"Unsupported file format: {file_path}")
+
+         # Check if data is in wide format
+         is_wide = DataReshaping.detect_wide_format(df, threshold_columns=reshape_threshold)
+
+         if is_wide:
+             logger.info(f"Detected wide format data ({df.shape[1]} columns). Suggesting normalized structure.")
+
+             # Suggest melt configuration
+             melt_config = DataReshaping.suggest_melt_config(df)
+             logger.info(f"Suggested melt config: id_vars={melt_config['id_vars']}, "
+                         f"{len(melt_config['value_vars'])} value columns")
+
+             # For wide format, we'll need to reshape during import
+             # For now, infer schema from original data
+             inference = SchemaInference()
+             inferred = inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint)
+
+             # Add warning about wide format
+             inferred.warnings.append(
+                 f"Wide format detected ({df.shape[1]} columns). "
+                 f"Consider using reshape_and_import() for normalized structure."
+             )
+
+             mapping = inferred.to_schema_mapping()
+         else:
+             # Normal format - infer schema directly
+             inference = SchemaInference()
+             inferred = inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint)
+             mapping = inferred.to_schema_mapping()
+
+         return StructuredDataPipeline(mapping=mapping, graph_store=graph_store, **kwargs)
+
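For readers unfamiliar with "melt": the wide-to-long reshape that `DataReshaping` applies corresponds to the standard pandas operation, sketched here on a toy frame (column names invented):

```python
import pandas as pd

wide = pd.DataFrame({
    "sample_id": ["s1", "s2"],
    "option1": [10, 20],
    "option2": [30, 40],
})

# One column per option  ->  one row per (sample, option) pair
long = wide.melt(
    id_vars=["sample_id"],
    value_vars=["option1", "option2"],
    var_name="variable",
    value_name="value",
)
# long: 4 rows with columns sample_id, variable, value
```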
+     async def import_from_csv(
+         self,
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         delimiter: str = ",",
+         header: bool = True,
+     ) -> ImportResult:
+         """
+         Import data from CSV file
+
+         Args:
+             file_path: Path to CSV file
+             encoding: File encoding (default: utf-8)
+             delimiter: CSV delimiter (default: comma)
+             header: Whether file has header row (default: True)
+
+         Returns:
+             ImportResult with statistics
+         """
+         result = ImportResult(start_time=datetime.now())
+
+         try:
+             # Read CSV file
+             if PANDAS_AVAILABLE:
+                 df = pd.read_csv(
+                     file_path,
+                     encoding=encoding,
+                     sep=delimiter,
+                     header=0 if header else None,
+                 )
+
+                 # Run data quality validation if validator is configured
+                 if self.validator:
+                     # Determine ID column for validation
+                     id_column = None
+                     for entity_mapping in self.mapping.entity_mappings:
+                         if entity_mapping.id_column:
+                             id_column = entity_mapping.id_column
+                             break
+
+                     quality_report = self.validator.validate_dataframe(df, id_column=id_column)
+                     result.quality_report = quality_report
+
+                     # Log quality issues
+                     if quality_report.violations:
+                         logger.warning(f"Data quality validation found {len(quality_report.violations)} violations")
+                         for violation in quality_report.violations[:5]:  # Log first 5
+                             logger.warning(f"  {violation.message}")
+                         if len(quality_report.violations) > 5:
+                             logger.warning(f"  ... and {len(quality_report.violations) - 5} more violations")
+
+                     # Fail import if configured and validation failed
+                     if not quality_report.passed:
+                         result.success = False
+                         result.errors.append(f"Data quality validation failed: {len(quality_report.violations)} violations")
+                         return result
+
+                 rows = df.to_dict("records")
+             else:
+                 # Fallback to basic CSV reader
+                 import csv
+
+                 rows = []
+                 with open(file_path, "r", encoding=encoding) as f:
+                     # Honor the caller's delimiter in the fallback path too
+                     reader = csv.DictReader(f, delimiter=delimiter) if header else csv.reader(f, delimiter=delimiter)
+                     if header:
+                         for row in reader:
+                             rows.append(row)
+                     else:
+                         # No header - use column indices
+                         for row in reader:
+                             rows.append({str(i): val for i, val in enumerate(row)})
+
+             # Process rows
+             result = await self._process_rows(rows, result)
+
+         except Exception as e:
+             error_msg = f"Failed to import CSV file {file_path}: {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
+     async def import_from_json(
+         self,
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         array_key: Optional[str] = None,
+     ) -> ImportResult:
+         """
+         Import data from JSON file
+
+         Supports:
+         - Array of objects: [{"id": 1, "name": "Alice"}, ...]
+         - Object with array: {"items": [{"id": 1, ...}, ...]}
+         - Single object: {"id": 1, "name": "Alice"}
+
+         Args:
+             file_path: Path to JSON file
+             encoding: File encoding (default: utf-8)
+             array_key: If JSON is object with array, key containing the array
+
+         Returns:
+             ImportResult with statistics
+         """
+         result = ImportResult(start_time=datetime.now())
+
+         try:
+             # Read JSON file
+             with open(file_path, "r", encoding=encoding) as f:
+                 data = json.load(f)
+
+             # Extract rows
+             if isinstance(data, list):
+                 rows = data
+             elif isinstance(data, dict):
+                 if array_key:
+                     rows = data.get(array_key, [])
+                     if not isinstance(rows, list):
+                         raise ValueError(f"Key '{array_key}' does not contain an array")
+                 else:
+                     # Single object - wrap in list
+                     rows = [data]
+             else:
+                 raise ValueError(f"JSON file must contain array or object, got {type(data)}")
+
+             # Process rows
+             result = await self._process_rows(rows, result)
+
+         except Exception as e:
+             error_msg = f"Failed to import JSON file {file_path}: {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
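How the three supported JSON shapes line up with `array_key`, assuming a pipeline built as in the class docstring (file names illustrative, calls made inside an async function):

```python
# Top-level array: [{"id": 1, "name": "Alice"}, ...]
result = await pipeline.import_from_json("people.json")

# Object wrapping an array: {"items": [{"id": 1, ...}, ...]}
result = await pipeline.import_from_json("wrapped.json", array_key="items")

# Single object: {"id": 1, "name": "Alice"} is imported as one row
result = await pipeline.import_from_json("person.json")
```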
+     async def import_from_csv_streaming(
+         self,
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         delimiter: str = ",",
+         chunk_size: int = 10000,
+     ) -> ImportResult:
+         """
+         Import data from CSV file using streaming mode.
+
+         Memory-efficient import for large files (>1GB). Reads file in chunks
+         without loading entire file into memory.
+
+         Args:
+             file_path: Path to CSV file
+             encoding: File encoding (default: utf-8)
+             delimiter: CSV delimiter (default: comma)
+             chunk_size: Number of rows per chunk (default: 10000)
+
+         Returns:
+             ImportResult with statistics and performance metrics
+         """
+         import time
+
+         result = ImportResult(start_time=datetime.now())
+
+         # Initialize performance metrics
+         metrics = PerformanceMetrics() if self.track_performance else None
+         if metrics:
+             metrics.start_time = time.time()
+             if self._memory_tracker:
+                 self._memory_tracker.start_tracking()
+                 metrics.initial_memory_mb = self._memory_tracker.initial_memory_mb
+
+         try:
+             if not PANDAS_AVAILABLE:
+                 raise ImportError("pandas is required for streaming CSV import")
+
+             # Count total rows for progress tracking
+             streaming_reader = StreamingCSVReader(
+                 str(file_path),
+                 chunk_size=chunk_size,
+                 encoding=encoding,
+                 delimiter=delimiter,
+             )
+             total_rows = streaming_reader.count_rows()
+             if metrics:
+                 metrics.total_rows = total_rows
+
+             processed_rows = 0
+             batch_count = 0
+
+             # Process file in chunks
+             async for chunk_df in streaming_reader.read_chunks():
+                 read_start = time.time()
+                 rows = chunk_df.to_dict("records")
+                 if metrics:
+                     metrics.read_time_seconds += time.time() - read_start
+
+                 # Update progress
+                 if self.progress_callback:
+                     progress_pct = (processed_rows / total_rows) * 100 if total_rows > 0 else 0
+                     self.progress_callback(
+                         f"Streaming chunk {batch_count + 1}: {processed_rows}/{total_rows} rows",
+                         progress_pct,
+                     )
+
+                 # Process chunk
+                 transform_start = time.time()
+                 for row in rows:
+                     try:
+                         row_entities = await self._row_to_entities(row)
+                         row_relations = await self._row_to_relations(row)
+
+                         # Add entities and relations
+                         if self.use_bulk_writes and hasattr(self.graph_store, 'add_entities_bulk'):
+                             added = await self.graph_store.add_entities_bulk(row_entities)
+                             result.entities_added += added
+                         else:
+                             for entity in row_entities:
+                                 try:
+                                     await self.graph_store.add_entity(entity)
+                                     result.entities_added += 1
+                                 except ValueError:
+                                     pass
+
+                         if self.use_bulk_writes and hasattr(self.graph_store, 'add_relations_bulk'):
+                             added = await self.graph_store.add_relations_bulk(row_relations)
+                             result.relations_added += added
+                         else:
+                             for relation in row_relations:
+                                 try:
+                                     await self.graph_store.add_relation(relation)
+                                     result.relations_added += 1
+                                 except ValueError:
+                                     pass
+
+                         result.rows_processed += 1
+                     except Exception as e:
+                         result.rows_failed += 1
+                         if not self.skip_errors:
+                             raise
+                         result.warnings.append(f"Row error: {e}")
+
+                 if metrics:
+                     metrics.transform_time_seconds += time.time() - transform_start
+
+                 processed_rows += len(rows)
+                 batch_count += 1
+
+                 # Update memory tracking
+                 if self._memory_tracker:
+                     self._memory_tracker.update()
+
+             # Finalize metrics
+             if metrics:
+                 metrics.end_time = time.time()
+                 metrics.batch_count = batch_count
+                 if self._memory_tracker:
+                     metrics.peak_memory_mb = self._memory_tracker.peak_memory_mb
+                 metrics.calculate_throughput()
+                 result.performance_metrics = metrics
+
+         except Exception as e:
+             error_msg = f"Failed to import CSV file (streaming): {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
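A sketch of a streaming import that reads back the collected metrics; it touches only fields this file actually sets (total_rows, batch_count, timing and memory figures), and the file name is illustrative:

```python
# Inside an async function, with `pipeline` built as in the class docstring
result = await pipeline.import_from_csv_streaming("big_export.csv", chunk_size=50_000)

if result.performance_metrics:
    m = result.performance_metrics
    print(f"rows={result.rows_processed}  failed={result.rows_failed}  batches={m.batch_count}")
    print(f"read={m.read_time_seconds:.1f}s  transform={m.transform_time_seconds:.1f}s")
    print(f"peak memory={m.peak_memory_mb:.0f} MB")
```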
+     async def import_from_spss(
+         self,
+         file_path: Union[str, Path],
+         encoding: str = "utf-8",
+         preserve_metadata: bool = True,
+     ) -> ImportResult:
+         """
+         Import data from SPSS file (.sav, .por)
+
+         Uses pyreadstat library to read SPSS files and extract metadata.
+         SPSS variable labels and value labels are preserved as entity properties.
+
+         Args:
+             file_path: Path to SPSS file (.sav or .por)
+             encoding: File encoding (default: utf-8)
+             preserve_metadata: Whether to preserve SPSS metadata (variable labels, value labels)
+
+         Returns:
+             ImportResult with statistics
+         """
+         result = ImportResult(start_time=datetime.now())
+
+         try:
+             # Import pyreadstat
+             try:
+                 import pyreadstat  # type: ignore[import-untyped]
+             except ImportError:
+                 raise ImportError(
+                     "pyreadstat is required for SPSS import. "
+                     "Install with: pip install pyreadstat"
+                 )
+
+             if not PANDAS_AVAILABLE:
+                 raise ImportError("pandas is required for SPSS import. Install with: pip install pandas")
+
+             # Read SPSS file
+             df, meta = pyreadstat.read_sav(str(file_path), encoding=encoding)
+
+             # Convert DataFrame to list of dictionaries
+             rows = df.to_dict("records")
+
+             # If preserve_metadata is True, add SPSS metadata to each row
+             if preserve_metadata and meta:
+                 # Extract metadata
+                 spss_metadata = {
+                     "column_names": meta.column_names if hasattr(meta, 'column_names') else [],
+                     "column_labels": meta.column_labels if hasattr(meta, 'column_labels') else [],
+                     "variable_value_labels": meta.variable_value_labels if hasattr(meta, 'variable_value_labels') else {},
+                 }
+
+                 # Store metadata in result for reference
+                 if spss_metadata.get('column_labels'):
+                     result.warnings.append(f"SPSS metadata preserved: {len(spss_metadata['column_labels'])} variable labels")
+
+                 # Add metadata to each row's properties
+                 for row in rows:
+                     row["_spss_metadata"] = spss_metadata
+
+             # Process rows
+             result = await self._process_rows(rows, result)
+
+         except Exception as e:
+             error_msg = f"Failed to import SPSS file {file_path}: {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
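For context, the dictionary attached under `_spss_metadata` mirrors pyreadstat's meta object, whose attribute names appear in the code above. A standalone sketch (survey.sav is a placeholder; the final call belongs inside an async function):

```python
import pyreadstat

df, meta = pyreadstat.read_sav("survey.sav")
print(meta.column_names)           # e.g. ["Q1", "Q2"]
print(meta.column_labels)          # human-readable variable labels
print(meta.variable_value_labels)  # e.g. {"Q1": {1.0: "Yes", 2.0: "No"}}

result = await pipeline.import_from_spss("survey.sav", preserve_metadata=True)
# every imported row now carries these labels under the "_spss_metadata" key
```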
+     async def import_from_excel(
+         self,
+         file_path: Union[str, Path],
+         sheet_name: Union[str, int, None] = 0,
+         encoding: str = "utf-8",
+         header: bool = True,
+     ) -> ImportResult:
+         """
+         Import data from Excel file (.xlsx, .xls)
+
+         Supports importing from specific sheets or all sheets.
+
+         Args:
+             file_path: Path to Excel file
+             sheet_name: Sheet name (str), sheet index (int), or None for all sheets (default: 0 = first sheet)
+             encoding: File encoding (default: utf-8)
+             header: Whether file has header row (default: True)
+
+         Returns:
+             ImportResult with statistics
+         """
+         result = ImportResult(start_time=datetime.now())
+
+         try:
+             if not PANDAS_AVAILABLE:
+                 raise ImportError("pandas is required for Excel import. Install with: pip install pandas openpyxl")
+
+             # Read Excel file
+             if sheet_name is None:
+                 # Read all sheets
+                 excel_data = pd.read_excel(
+                     file_path,
+                     sheet_name=None,  # Returns dict of sheet_name -> DataFrame
+                     header=0 if header else None,
+                 )
+
+                 # Process each sheet
+                 all_rows = []
+                 for sheet_name_key, df in excel_data.items():
+                     sheet_rows = df.to_dict("records")
+                     # Add sheet name to each row for reference
+                     for row in sheet_rows:
+                         row["_excel_sheet"] = sheet_name_key
+                     all_rows.extend(sheet_rows)
+
+                 rows = all_rows
+                 result.warnings.append(f"Imported {len(excel_data)} sheets from Excel file")
+
+             else:
+                 # Read specific sheet
+                 df = pd.read_excel(
+                     file_path,
+                     sheet_name=sheet_name,
+                     header=0 if header else None,
+                 )
+                 rows = df.to_dict("records")
+
+             # Process rows
+             result = await self._process_rows(rows, result)
+
+         except Exception as e:
+             error_msg = f"Failed to import Excel file {file_path}: {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
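The three `sheet_name` modes in practice, assuming the pipeline from earlier (workbook name invented, calls inside an async function):

```python
# First sheet only (the default, sheet_name=0)
result = await pipeline.import_from_excel("orgs.xlsx")

# A specific sheet by name
result = await pipeline.import_from_excel("orgs.xlsx", sheet_name="Employees")

# All sheets; each row is tagged with its sheet via the "_excel_sheet" key
result = await pipeline.import_from_excel("orgs.xlsx", sheet_name=None)
```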
+     async def reshape_and_import_csv(
+         self,
+         file_path: Union[str, Path],
+         id_vars: Optional[List[str]] = None,
+         value_vars: Optional[List[str]] = None,
+         var_name: str = 'variable',
+         value_name: str = 'value',
+         entity_type_hint: Optional[str] = None,
+         encoding: str = "utf-8",
+     ) -> ImportResult:
+         """
+         Reshape wide format CSV to normalized structure and import
+
+         Automatically converts wide format data (many columns) to long format
+         (normalized structure) before importing into the graph.
+
+         Args:
+             file_path: Path to CSV file
+             id_vars: Columns to use as identifiers (auto-detected if None)
+             value_vars: Columns to unpivot (auto-detected if None)
+             var_name: Name for variable column (default: 'variable')
+             value_name: Name for value column (default: 'value')
+             entity_type_hint: Optional hint for entity type name
+             encoding: File encoding (default: utf-8)
+
+         Returns:
+             ImportResult with statistics
+
+         Example:
+             ```python
+             # Wide format: sample_id, option1, option2, ..., option200
+             # Will be reshaped to: sample_id, variable, value
+
+             result = await pipeline.reshape_and_import_csv(
+                 "wide_data.csv",
+                 id_vars=['sample_id'],
+                 var_name='option_name',
+                 value_name='option_value'
+             )
+             ```
+         """
+         from aiecs.application.knowledge_graph.builder.data_reshaping import DataReshaping
+
+         result = ImportResult(start_time=datetime.now())
+
+         try:
+             if not PANDAS_AVAILABLE:
+                 raise ImportError("pandas is required for reshaping")
+
+             # Read CSV
+             df = pd.read_csv(file_path, encoding=encoding)
+
+             # Auto-detect melt configuration if not provided
+             if id_vars is None:
+                 melt_config = DataReshaping.suggest_melt_config(df)
+                 id_vars = melt_config['id_vars']
+                 if value_vars is None:
+                     value_vars = melt_config['value_vars']
+                 result.warnings.append(f"Auto-detected id_vars: {id_vars}")
+
+             # Reshape data
+             reshape_result = DataReshaping.melt(
+                 df,
+                 id_vars=id_vars,
+                 value_vars=value_vars,
+                 var_name=var_name,
+                 value_name=value_name,
+                 dropna=True,
+             )
+
+             result.warnings.extend(reshape_result.warnings)
+             result.warnings.append(
+                 f"Reshaped from {reshape_result.original_shape} to {reshape_result.new_shape}"
+             )
+
+             # Convert reshaped data to rows
+             rows = reshape_result.data.to_dict("records")
+
+             # Process rows
+             result = await self._process_rows(rows, result)
+
+         except Exception as e:
+             error_msg = f"Failed to reshape and import CSV {file_path}: {e}"
+             logger.error(error_msg, exc_info=True)
+             result.success = False
+             result.errors.append(error_msg)
+
+         finally:
+             result.end_time = datetime.now()
+             if result.start_time:
+                 result.duration_seconds = (result.end_time - result.start_time).total_seconds()
+
+         return result
+
978
+ async def _process_rows(self, rows: List[Dict[str, Any]], result: ImportResult) -> ImportResult:
979
+ """
980
+ Process rows and convert to entities/relations
981
+
982
+ Args:
983
+ rows: List of row dictionaries
984
+ result: ImportResult to update
985
+
986
+ Returns:
987
+ Updated ImportResult
988
+ """
989
+ import time
990
+
991
+ total_rows = len(rows)
992
+
993
+ if total_rows == 0:
994
+ result.warnings.append("No rows to process")
995
+ return result
996
+
997
+ # Initialize performance metrics if tracking enabled
998
+ metrics = None
999
+ if self.track_performance:
1000
+ metrics = PerformanceMetrics()
1001
+ metrics.start_time = time.time()
1002
+ metrics.total_rows = total_rows
1003
+ if self._memory_tracker:
1004
+ self._memory_tracker.start_tracking()
1005
+ metrics.initial_memory_mb = self._memory_tracker.initial_memory_mb
1006
+
1007
+ # Determine batch size (auto-tune if enabled)
1008
+ batch_size = self.batch_size
1009
+ if self._batch_optimizer is not None:
1010
+ # Estimate column count from first row
1011
+ column_count = len(rows[0]) if rows else 10
1012
+ batch_size = self._batch_optimizer.estimate_batch_size(column_count)
1013
+ logger.debug(f"Auto-tuned batch size: {batch_size}")
1014
+
1015
+        # Process in batches
+        batch_count = 0
+        for batch_start in range(0, total_rows, batch_size):
+            # Time each batch; the measurement drives the adaptive batch
+            # optimizer below, independent of whether metrics are tracked.
+            batch_time_start = time.time()
+
+            batch_end = min(batch_start + batch_size, total_rows)
+            batch_rows = rows[batch_start:batch_end]
+
+            # Update progress
+            if self.progress_callback:
+                progress_pct = (batch_end / total_rows) * 100
+                self.progress_callback(
+                    f"Processing rows {batch_start+1}-{batch_end} of {total_rows}",
+                    progress_pct,
+                )
+
+            # Process batch
+            batch_result = await self._process_batch(batch_rows)
+            batch_count += 1
+
+            # Update result
+            result.entities_added += batch_result.entities_added
+            result.relations_added += batch_result.relations_added
+            result.rows_processed += batch_result.rows_processed
+            result.rows_failed += batch_result.rows_failed
+            result.errors.extend(batch_result.errors)
+            result.warnings.extend(batch_result.warnings)
+
+            # Record batch time for adaptive tuning
+            if self._batch_optimizer is not None:
+                batch_time = time.time() - batch_time_start
+                self._batch_optimizer.record_batch_time(batch_time, len(batch_rows))
+                # Adjust batch size for next iteration
+                batch_size = self._batch_optimizer.adjust_batch_size()
+
+            # Update memory tracking
+            if self._memory_tracker:
+                self._memory_tracker.update()
+
+        # Finalize performance metrics
+        if metrics:
+            metrics.end_time = time.time()
+            metrics.batch_count = batch_count
+            if self._memory_tracker:
+                metrics.peak_memory_mb = self._memory_tracker.peak_memory_mb
+            metrics.calculate_throughput()
+            result.performance_metrics = metrics
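+
+        # Aggregates are materialized as standalone "<EntityType>Summary"
+        # entities rather than written back onto source entities, so a
+        # re-import can upsert the summary instead of duplicating it.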
+        # Apply aggregations after all batches processed
+        if self.mapping.aggregations:
+            aggregation_results = await self._apply_aggregations()
+
+            # Store aggregated values as summary entities
+            for entity_type, properties in aggregation_results.items():
+                try:
+                    # Create a summary entity with aggregated statistics
+                    summary_entity = Entity(
+                        id=f"{entity_type}_summary",
+                        entity_type=f"{entity_type}Summary",
+                        properties=properties,
+                    )
+
+                    # Try to add the summary entity (it may already exist from a previous import)
+                    try:
+                        await self.graph_store.add_entity(summary_entity)
+                        result.entities_added += 1
+                    except ValueError:
+                        # Entity already exists; update it if the store supports updates,
+                        # otherwise keep the existing summary
+                        if hasattr(self.graph_store, 'update_entity'):
+                            await self.graph_store.update_entity(summary_entity)
+
+                    result.warnings.append(
+                        f"Applied aggregations to {entity_type}: {list(properties.keys())}"
+                    )
+                except Exception as e:
+                    result.warnings.append(f"Failed to apply aggregations for {entity_type}: {e}")
+
+        return result
+
+    async def _process_batch(self, rows: List[Dict[str, Any]]) -> ImportResult:
+        """
+        Process a batch of rows
+
+        Args:
+            rows: List of row dictionaries
+
+        Returns:
+            ImportResult for this batch
+        """
+        batch_result = ImportResult()
+        batch_result.rows_processed = len(rows)
+
+        # Collect entities and relations
+        entities_to_add: List[Entity] = []
+        relations_to_add: List[Relation] = []
+
+        for i, row in enumerate(rows):
+            try:
+                # Convert row to entities
+                row_entities = await self._row_to_entities(row)
+                entities_to_add.extend(row_entities)
+
+                # Convert row to relations
+                row_relations = await self._row_to_relations(row)
+                relations_to_add.extend(row_relations)
+
+            except Exception as e:
+                error_msg = f"Failed to process row {i+1}: {e}"
+                logger.warning(error_msg, exc_info=True)
+                batch_result.rows_failed += 1
+
+                if self.skip_errors:
+                    batch_result.warnings.append(error_msg)
+                else:
+                    batch_result.errors.append(error_msg)
+                    raise
+
+        # Update aggregation accumulators
+        if self.mapping.aggregations:
+            self._update_aggregations(rows)
+
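+        # Bulk writes trade error granularity for throughput: one failed bulk
+        # call affects the whole batch, while the per-item fallback records
+        # each failure individually. A store opts in simply by exposing
+        # add_entities_bulk / add_relations_bulk.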
+        # Add entities to graph store (use bulk writes if enabled)
+        if self.use_bulk_writes and hasattr(self.graph_store, 'add_entities_bulk'):
+            try:
+                added = await self.graph_store.add_entities_bulk(entities_to_add)
+                batch_result.entities_added = added
+            except Exception as e:
+                error_msg = f"Bulk entity add failed: {e}"
+                logger.warning(error_msg)
+                batch_result.warnings.append(error_msg)
+                if not self.skip_errors:
+                    raise
+        else:
+            for entity in entities_to_add:
+                try:
+                    await self.graph_store.add_entity(entity)
+                    batch_result.entities_added += 1
+                except Exception as e:
+                    error_msg = f"Failed to add entity {entity.id}: {e}"
+                    logger.warning(error_msg)
+                    batch_result.warnings.append(error_msg)
+                    if not self.skip_errors:
+                        raise
+
+        # Add relations to graph store (use bulk writes if enabled)
+        if self.use_bulk_writes and hasattr(self.graph_store, 'add_relations_bulk'):
+            try:
+                added = await self.graph_store.add_relations_bulk(relations_to_add)
+                batch_result.relations_added = added
+            except Exception as e:
+                error_msg = f"Bulk relation add failed: {e}"
+                logger.warning(error_msg)
+                batch_result.warnings.append(error_msg)
+                if not self.skip_errors:
+                    raise
+        else:
+            for relation in relations_to_add:
+                try:
+                    await self.graph_store.add_relation(relation)
+                    batch_result.relations_added += 1
+                except Exception as e:
+                    error_msg = f"Failed to add relation {relation.id}: {e}"
+                    logger.warning(error_msg)
+                    batch_result.warnings.append(error_msg)
+                    if not self.skip_errors:
+                        raise
+
+        return batch_result
+
+    async def _row_to_entities(self, row: Dict[str, Any]) -> List[Entity]:
+        """
+        Convert a row to entities based on entity mappings
+
+        Args:
+            row: Dictionary of column name -> value
+
+        Returns:
+            List of Entity objects
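+
+        Example (illustrative; the column names and resulting entity are
+        hypothetical and depend on the configured mapping):
+
+            row = {"company_name": "Acme Corp", "founded": 1999}
+            entities = await importer._row_to_entities(row)
+            # -> [Entity(id="acme-corp", entity_type="Company", ...)]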
+        """
+        entities = []
+
+        for entity_mapping in self.mapping.entity_mappings:
+            try:
+                # Map row to entity using mapping
+                entity_data = entity_mapping.map_row_to_entity(row)
+
+                # Create Entity object
+                # Merge metadata into properties since Entity doesn't have a metadata field
+                properties = entity_data["properties"].copy()
+                properties["_metadata"] = {
+                    "source": "structured_data_import",
+                    "imported_at": datetime.now().isoformat(),
+                }
+                entity = Entity(
+                    id=entity_data["id"],
+                    entity_type=entity_data["type"],
+                    properties=properties,
+                )
+
+                entities.append(entity)
+
+            except Exception as e:
+                error_msg = f"Failed to map row to entity type '{entity_mapping.entity_type}': {e}"
+                logger.warning(error_msg)
+                if not self.skip_errors:
+                    raise ValueError(error_msg) from e
+
+        return entities
+
+    async def _row_to_relations(self, row: Dict[str, Any]) -> List[Relation]:
+        """
+        Convert a row to relations based on relation mappings
+
+        Args:
+            row: Dictionary of column name -> value
+
+        Returns:
+            List of Relation objects
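+
+        Example (illustrative; the column names and relation type are
+        hypothetical, the id format follows source_type_target as built below):
+
+            row = {"employee_id": "E1", "company_id": "C1"}
+            relations = await importer._row_to_relations(row)
+            # -> [Relation(id="E1_WORKS_AT_C1", relation_type="WORKS_AT", ...)]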
+        """
+        relations = []
+
+        for relation_mapping in self.mapping.relation_mappings:
+            try:
+                # Map row to relation using mapping
+                relation_data = relation_mapping.map_row_to_relation(row)
+
+                # Create Relation object
+                # Merge metadata into properties since Relation doesn't have a metadata field
+                rel_properties = relation_data["properties"].copy()
+                rel_properties["_metadata"] = {
+                    "source": "structured_data_import",
+                    "imported_at": datetime.now().isoformat(),
+                }
+                relation = Relation(
+                    id=f"{relation_data['source_id']}_{relation_data['type']}_{relation_data['target_id']}",
+                    relation_type=relation_data["type"],
+                    source_id=relation_data["source_id"],
+                    target_id=relation_data["target_id"],
+                    properties=rel_properties,
+                )
+
+                relations.append(relation)
+
+            except Exception as e:
+                error_msg = f"Failed to map row to relation type '{relation_mapping.relation_type}': {e}"
+                logger.warning(error_msg)
+                if not self.skip_errors:
+                    raise ValueError(error_msg) from e
+
+        return relations
+
+    def _update_aggregations(self, rows: List[Dict[str, Any]]):
+        """
+        Update aggregation accumulators with batch data
+
+        Args:
+            rows: List of row dictionaries
+        """
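+        # Accumulators are keyed by (entity_type, target_property) and updated
+        # incrementally per batch, so aggregates are computed in a single pass
+        # without keeping the full dataset in memory.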
+        for entity_agg in self.mapping.aggregations:
+            entity_type = entity_agg.entity_type
+
+            # Initialize accumulator for this entity type if needed
+            if entity_type not in self._aggregation_accumulators:
+                self._aggregation_accumulators[entity_type] = {}
+
+            for agg_config in entity_agg.aggregations:
+                target_prop = agg_config.target_property
+
+                # Initialize accumulator for this property if needed
+                if target_prop not in self._aggregation_accumulators[entity_type]:
+                    self._aggregation_accumulators[entity_type][target_prop] = AggregationAccumulator()
+
+                accumulator = self._aggregation_accumulators[entity_type][target_prop]
+
+                # Add values from rows
+                for row in rows:
+                    value = row.get(agg_config.source_property)
+                    if value is not None:
+                        accumulator.add(value)
+
+    async def _apply_aggregations(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Apply aggregations and return computed statistics
+
+        Returns:
+            Dictionary of entity_type -> {property -> value}
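+
+        Example of the returned shape (illustrative; the property names are
+        hypothetical and come from the configured target_property values):
+
+            {"Company": {"revenue_mean": 1.2e6, "revenue_max": 9.8e6}}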
+        """
+        from aiecs.application.knowledge_graph.builder.schema_mapping import AggregationFunction
+
+        results = {}
+
+        for entity_agg in self.mapping.aggregations:
+            entity_type = entity_agg.entity_type
+
+            if entity_type not in self._aggregation_accumulators:
+                continue
+
+            if entity_type not in results:
+                results[entity_type] = {}
+
+            for agg_config in entity_agg.aggregations:
+                target_prop = agg_config.target_property
+
+                if target_prop not in self._aggregation_accumulators[entity_type]:
+                    continue
+
+                accumulator = self._aggregation_accumulators[entity_type][target_prop]
+
+                # Dispatch table: aggregation function -> accumulator getter
+                getters = {
+                    AggregationFunction.MEAN: accumulator.get_mean,
+                    AggregationFunction.STD: accumulator.get_std,
+                    AggregationFunction.MIN: accumulator.get_min,
+                    AggregationFunction.MAX: accumulator.get_max,
+                    AggregationFunction.SUM: accumulator.get_sum,
+                    AggregationFunction.COUNT: accumulator.get_count,
+                    AggregationFunction.MEDIAN: accumulator.get_median,
+                    AggregationFunction.VARIANCE: accumulator.get_variance,
+                }
+                getter = getters.get(agg_config.function)
+                value = getter() if getter is not None else None
+
+                if value is not None:
+                    results[entity_type][target_prop] = value
+
+        return results
+
+    def _create_validator_from_config(self, config: Dict[str, Any]) -> DataQualityValidator:
+        """
+        Create DataQualityValidator from configuration dictionary
+
+        Args:
+            config: Validation configuration dictionary
+
+        Returns:
+            Configured DataQualityValidator
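+
+        Example config (illustrative; the property names are hypothetical,
+        the keys match what this method parses):
+
+            {
+                "range_rules": {"age": {"min": 0, "max": 120}},
+                "required_properties": ["name"],
+                "detect_outliers": True,
+                "fail_on_violations": False,
+                "max_violation_rate": 0.05,
+            }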
+        """
+        # Parse range rules
+        range_rules = {}
+        if "range_rules" in config:
+            for prop, rule_dict in config["range_rules"].items():
+                range_rules[prop] = RangeRule(
+                    min_value=rule_dict.get("min"),
+                    max_value=rule_dict.get("max")
+                )
+
+        # Parse required properties
+        required_properties = set(config.get("required_properties", []))
+
+        # Create validation config
+        validation_config = ValidationConfig(
+            range_rules=range_rules,
+            required_properties=required_properties,
+            detect_outliers=config.get("detect_outliers", False),
+            fail_on_violations=config.get("fail_on_violations", False),
+            max_violation_rate=config.get("max_violation_rate", 0.1)
+        )
+
+        return DataQualityValidator(validation_config)