aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
aiecs/application/knowledge_graph/builder/structured_pipeline.py
@@ -0,0 +1,1384 @@
"""
Structured Data Pipeline

Import structured data (CSV, JSON, SPSS, Excel) into knowledge graphs using schema mappings.
"""

import json
import logging
from pathlib import Path
from typing import List, Optional, Dict, Any, Callable, Union
from dataclasses import dataclass, field
from datetime import datetime

try:
    import pandas as pd  # type: ignore[import-untyped]

    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

from aiecs.infrastructure.graph_storage.base import GraphStore
from aiecs.domain.knowledge_graph.models.entity import Entity
from aiecs.domain.knowledge_graph.models.relation import Relation
from aiecs.application.knowledge_graph.builder.schema_mapping import (
    SchemaMapping,
)
from aiecs.application.knowledge_graph.builder.data_quality import (
    DataQualityValidator,
    ValidationConfig,
    QualityReport,
    RangeRule,
)
from aiecs.application.knowledge_graph.builder.import_optimizer import (
    PerformanceMetrics,
    BatchSizeOptimizer,
    ParallelBatchProcessor,
    MemoryTracker,
    StreamingCSVReader,
)

# Import InferredSchema for type hints (avoid circular import)
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from aiecs.application.knowledge_graph.builder.schema_inference import InferredSchema


logger = logging.getLogger(__name__)


@dataclass
class ImportResult:
    """
    Result of structured data import operation

    Attributes:
        success: Whether import completed successfully
        entities_added: Number of entities added to graph
        relations_added: Number of relations added to graph
        rows_processed: Number of rows processed
        rows_failed: Number of rows that failed to process
        errors: List of errors encountered
        warnings: List of warnings
        quality_report: Data quality validation report (if validation enabled)
        start_time: When import started
        end_time: When import ended
        duration_seconds: Total duration in seconds
        performance_metrics: Detailed performance metrics (if tracking enabled)
    """

    success: bool = True
    entities_added: int = 0
    relations_added: int = 0
    rows_processed: int = 0
    rows_failed: int = 0
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    quality_report: Optional[QualityReport] = None
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    duration_seconds: float = 0.0
    performance_metrics: Optional[PerformanceMetrics] = None


class AggregationAccumulator:
    """
    Accumulator for incremental statistical aggregation

    Computes statistics incrementally as data is processed in batches.
    """

    def __init__(self):
        self.count = 0
        self.sum = 0.0
        self.sum_sq = 0.0  # Sum of squares for variance/std
        self.min_val = float('inf')
        self.max_val = float('-inf')
        self.values = []  # For median (if needed)

    def add(self, value: Any):
        """Add a value to the accumulator"""
        if value is None:
            return

        try:
            num_val = float(value)
        except (ValueError, TypeError):
            return

        self.count += 1
        self.sum += num_val
        self.sum_sq += num_val * num_val
        self.min_val = min(self.min_val, num_val)
        self.max_val = max(self.max_val, num_val)
        self.values.append(num_val)

    def get_mean(self) -> Optional[float]:
        """Get mean value"""
        if self.count == 0:
            return None
        return self.sum / self.count

    def get_std(self) -> Optional[float]:
        """Get standard deviation (sample std with Bessel's correction)"""
        if self.count < 2:
            return None
        mean = self.get_mean()
        if mean is None:
            return None
        # Use sample variance formula: sum((x - mean)^2) / (n - 1)
        # Which equals: (sum(x^2) - n*mean^2) / (n - 1)
        variance = (self.sum_sq - self.count * mean * mean) / (self.count - 1)
        return variance ** 0.5 if variance >= 0 else 0.0

    def get_variance(self) -> Optional[float]:
        """Get variance (sample variance with Bessel's correction)"""
        if self.count < 2:
            return None
        mean = self.get_mean()
        if mean is None:
            return None
        # Use sample variance formula: (sum(x^2) - n*mean^2) / (n - 1)
        return (self.sum_sq - self.count * mean * mean) / (self.count - 1)

    def get_min(self) -> Optional[float]:
        """Get minimum value"""
        if self.count == 0:
            return None
        return self.min_val

    def get_max(self) -> Optional[float]:
        """Get maximum value"""
        if self.count == 0:
            return None
        return self.max_val

    def get_sum(self) -> Optional[float]:
        """Get sum"""
        if self.count == 0:
            return None
        return self.sum

    def get_count(self) -> int:
        """Get count"""
        return self.count

    def get_median(self) -> Optional[float]:
        """Get median value"""
        if self.count == 0:
            return None
        sorted_vals = sorted(self.values)
        mid = self.count // 2
        if self.count % 2 == 0:
            return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2
        return sorted_vals[mid]

class StructuredDataPipeline:
    """
    Pipeline for importing structured data (CSV, JSON, SPSS, Excel) into knowledge graphs

    Uses SchemaMapping to map source data columns to entity and relation types.
    Supports batch processing, progress tracking, and error handling.

    Example:
        ```python
        # Define schema mapping
        mapping = SchemaMapping(
            entity_mappings=[
                EntityMapping(
                    source_columns=["id", "name", "age"],
                    entity_type="Person",
                    property_mapping={"id": "id", "name": "name", "age": "age"}
                )
            ],
            relation_mappings=[
                RelationMapping(
                    source_columns=["person_id", "company_id"],
                    relation_type="WORKS_FOR",
                    source_entity_column="person_id",
                    target_entity_column="company_id"
                )
            ]
        )

        # Create pipeline
        pipeline = StructuredDataPipeline(
            mapping=mapping,
            graph_store=store
        )

        # Import CSV
        result = await pipeline.import_from_csv("employees.csv")
        print(f"Added {result.entities_added} entities, {result.relations_added} relations")
        ```
    """

    def __init__(
        self,
        mapping: SchemaMapping,
        graph_store: GraphStore,
        batch_size: int = 100,
        progress_callback: Optional[Callable[[str, float], None]] = None,
        skip_errors: bool = True,
        enable_parallel: bool = False,
        max_workers: Optional[int] = None,
        auto_tune_batch_size: bool = False,
        enable_streaming: bool = False,
        use_bulk_writes: bool = True,
        track_performance: bool = True,
    ):
        """
        Initialize structured data pipeline

        Args:
            mapping: Schema mapping configuration
            graph_store: Graph storage to save entities/relations
            batch_size: Number of rows to process in each batch (ignored if auto_tune_batch_size=True)
            progress_callback: Optional callback for progress updates (message, progress_pct)
            skip_errors: Whether to skip rows with errors and continue processing
            enable_parallel: Enable parallel batch processing for faster imports
            max_workers: Maximum number of parallel workers (default: CPU count - 1)
            auto_tune_batch_size: Automatically tune batch size based on system resources
            enable_streaming: Enable streaming mode for large files (memory-efficient)
            use_bulk_writes: Use bulk write operations for better performance
            track_performance: Track detailed performance metrics
        """
        # Validate mapping
        validation_errors = mapping.validate_mapping()
        if validation_errors:
            raise ValueError(f"Invalid schema mapping: {validation_errors}")

        self.mapping = mapping
        self.graph_store = graph_store
        self.batch_size = batch_size
        self.progress_callback = progress_callback
        self.skip_errors = skip_errors

        # Performance optimization settings
        self.enable_parallel = enable_parallel
        self.max_workers = max_workers
        self.auto_tune_batch_size = auto_tune_batch_size
        self.enable_streaming = enable_streaming
        self.use_bulk_writes = use_bulk_writes
        self.track_performance = track_performance

        # Initialize optimizers
        self._batch_optimizer = BatchSizeOptimizer() if auto_tune_batch_size else None
        self._memory_tracker = MemoryTracker() if track_performance else None

        # Initialize aggregation tracking
        self._aggregation_accumulators: Dict[str, Dict[str, Any]] = {}  # entity_type -> {property -> accumulator}

        # Initialize data quality validator if validation config is provided
        self.validator: Optional[DataQualityValidator] = None
        if mapping.validation_config:
            self.validator = self._create_validator_from_config(mapping.validation_config)

        if not PANDAS_AVAILABLE:
            logger.warning("pandas not available. CSV import will use basic CSV reader. " "Install pandas for better performance: pip install pandas")

    @staticmethod
    def infer_schema_from_csv(
        file_path: Union[str, Path],
        encoding: str = "utf-8",
        sample_size: int = 1000,
    ) -> 'InferredSchema':
        """
        Infer schema mapping from CSV file

        Analyzes CSV structure and content to automatically generate schema mappings.

        Args:
            file_path: Path to CSV file
            encoding: File encoding (default: utf-8)
            sample_size: Number of rows to sample for inference (default: 1000)

        Returns:
            InferredSchema with entity and relation mappings

        Example:
            ```python
            # Infer schema from CSV
            inferred = StructuredDataPipeline.infer_schema_from_csv("data.csv")

            # Review and modify if needed
            print(f"Inferred entity types: {[em.entity_type for em in inferred.entity_mappings]}")
            print(f"Warnings: {inferred.warnings}")

            # Use inferred schema
            mapping = inferred.to_schema_mapping()
            pipeline = StructuredDataPipeline(mapping, graph_store)
            ```
        """
        from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference

        inference = SchemaInference(sample_size=sample_size)
        return inference.infer_from_csv(file_path, encoding=encoding)

    @staticmethod
    def infer_schema_from_spss(
        file_path: Union[str, Path],
        encoding: str = "utf-8",
        sample_size: int = 1000,
    ) -> 'InferredSchema':
        """
        Infer schema mapping from SPSS file

        Uses SPSS variable labels and value labels to generate schema mappings.

        Args:
            file_path: Path to SPSS file
            encoding: File encoding (default: utf-8)
            sample_size: Number of rows to sample for inference (default: 1000)

        Returns:
            InferredSchema with entity and relation mappings
        """
        from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference

        inference = SchemaInference(sample_size=sample_size)
        return inference.infer_from_spss(file_path, encoding=encoding)

    @staticmethod
    def infer_schema_from_dataframe(
        df: 'pd.DataFrame',
        entity_type_hint: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        sample_size: int = 1000,
    ) -> 'InferredSchema':
        """
        Infer schema mapping from pandas DataFrame

        Args:
            df: DataFrame to analyze
            entity_type_hint: Optional hint for entity type name
            metadata: Optional metadata (e.g., SPSS variable labels)
            sample_size: Number of rows to sample for inference (default: 1000)

        Returns:
            InferredSchema with entity and relation mappings
        """
        from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference

        inference = SchemaInference(sample_size=sample_size)
        return inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint, metadata=metadata)

    @staticmethod
    def create_with_auto_reshape(
        file_path: Union[str, Path],
        graph_store: GraphStore,
        entity_type_hint: Optional[str] = None,
        reshape_threshold: int = 50,
        **kwargs,
    ) -> 'StructuredDataPipeline':
        """
        Create pipeline with automatic reshaping for wide format data

        Detects wide format data and automatically reshapes to normalized structure
        before creating the pipeline.

        Args:
            file_path: Path to data file (CSV, SPSS, Excel)
            graph_store: Graph storage to save entities/relations
            entity_type_hint: Optional hint for entity type name
            reshape_threshold: Minimum columns to trigger reshaping (default: 50)
            **kwargs: Additional arguments for StructuredDataPipeline

        Returns:
            StructuredDataPipeline configured for the data

        Example:
            ```python
            # Automatically detect and reshape wide format data
            pipeline = StructuredDataPipeline.create_with_auto_reshape(
                "wide_data.csv",
                graph_store,
                entity_type_hint="Sample"
            )

            # Import reshaped data
            result = await pipeline.import_from_csv("wide_data.csv")
            ```
        """
        from aiecs.application.knowledge_graph.builder.data_reshaping import DataReshaping
        from aiecs.application.knowledge_graph.builder.schema_inference import SchemaInference

        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for automatic reshaping")

        # Load data to analyze
        file_path_str = str(file_path)
        if file_path_str.endswith('.csv'):
            df = pd.read_csv(file_path, nrows=1000)  # Sample for analysis
        elif file_path_str.endswith(('.sav', '.por')):
            import pyreadstat
            df, _ = pyreadstat.read_sav(file_path_str, row_limit=1000)
        elif file_path_str.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path, nrows=1000)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Check if data is in wide format
        is_wide = DataReshaping.detect_wide_format(df, threshold_columns=reshape_threshold)

        if is_wide:
            logger.info(f"Detected wide format data ({df.shape[1]} columns). Suggesting normalized structure.")

            # Suggest melt configuration
            melt_config = DataReshaping.suggest_melt_config(df)
            logger.info(f"Suggested melt config: id_vars={melt_config['id_vars']}, "
                        f"{len(melt_config['value_vars'])} value columns")

            # For wide format, we'll need to reshape during import
            # For now, infer schema from original data
            inference = SchemaInference()
            inferred = inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint)

            # Add warning about wide format
            inferred.warnings.append(
                f"Wide format detected ({df.shape[1]} columns). "
                f"Consider using reshape_and_import() for normalized structure."
            )

            mapping = inferred.to_schema_mapping()
        else:
            # Normal format - infer schema directly
            inference = SchemaInference()
            inferred = inference.infer_from_dataframe(df, entity_type_hint=entity_type_hint)
            mapping = inferred.to_schema_mapping()

        return StructuredDataPipeline(mapping=mapping, graph_store=graph_store, **kwargs)

    async def import_from_csv(
        self,
        file_path: Union[str, Path],
        encoding: str = "utf-8",
        delimiter: str = ",",
        header: bool = True,
    ) -> ImportResult:
        """
        Import data from CSV file

        Args:
            file_path: Path to CSV file
            encoding: File encoding (default: utf-8)
            delimiter: CSV delimiter (default: comma)
            header: Whether file has header row (default: True)

        Returns:
            ImportResult with statistics
        """
        result = ImportResult(start_time=datetime.now())

        try:
            # Read CSV file
            if PANDAS_AVAILABLE:
                df = pd.read_csv(
                    file_path,
                    encoding=encoding,
                    sep=delimiter,
                    header=0 if header else None,
                )

                # Run data quality validation if validator is configured
                if self.validator:
                    # Determine ID column for validation
                    id_column = None
                    for entity_mapping in self.mapping.entity_mappings:
                        if entity_mapping.id_column:
                            id_column = entity_mapping.id_column
                            break

                    quality_report = self.validator.validate_dataframe(df, id_column=id_column)
                    result.quality_report = quality_report

                    # Log quality issues
                    if quality_report.violations:
                        logger.warning(f"Data quality validation found {len(quality_report.violations)} violations")
                        for violation in quality_report.violations[:5]:  # Log first 5
                            logger.warning(f" {violation.message}")
                        if len(quality_report.violations) > 5:
                            logger.warning(f" ... and {len(quality_report.violations) - 5} more violations")

                    # Fail import if configured and validation failed
                    if not quality_report.passed:
                        result.success = False
                        result.errors.append(f"Data quality validation failed: {len(quality_report.violations)} violations")
                        return result

                rows = df.to_dict("records")
            else:
                # Fallback to basic CSV reader
                import csv

                rows = []
                with open(file_path, "r", encoding=encoding) as f:
                    reader = csv.DictReader(f) if header else csv.reader(f)
                    if header:
                        for row in reader:
                            rows.append(row)
                    else:
                        # No header - use column indices
                        for row in reader:
                            rows.append({str(i): val for i, val in enumerate(row)})

            # Process rows
            result = await self._process_rows(rows, result)

        except Exception as e:
            error_msg = f"Failed to import CSV file {file_path}: {e}"
            logger.error(error_msg, exc_info=True)
            result.success = False
            result.errors.append(error_msg)

        finally:
            result.end_time = datetime.now()
            if result.start_time:
                result.duration_seconds = (result.end_time - result.start_time).total_seconds()

        return result

    async def import_from_json(
        self,
        file_path: Union[str, Path],
        encoding: str = "utf-8",
        array_key: Optional[str] = None,
    ) -> ImportResult:
        """
        Import data from JSON file

        Supports:
        - Array of objects: [{"id": 1, "name": "Alice"}, ...]
        - Object with array: {"items": [{"id": 1, ...}, ...]}
        - Single object: {"id": 1, "name": "Alice"}

        Args:
            file_path: Path to JSON file
            encoding: File encoding (default: utf-8)
            array_key: If JSON is object with array, key containing the array

        Returns:
            ImportResult with statistics
        """
        result = ImportResult(start_time=datetime.now())

        try:
            # Read JSON file
            with open(file_path, "r", encoding=encoding) as f:
                data = json.load(f)

            # Extract rows
            if isinstance(data, list):
                rows = data
            elif isinstance(data, dict):
                if array_key:
                    rows = data.get(array_key, [])
                    if not isinstance(rows, list):
                        raise ValueError(f"Key '{array_key}' does not contain an array")
                else:
                    # Single object - wrap in list
                    rows = [data]
            else:
                raise ValueError(f"JSON file must contain array or object, got {type(data)}")

            # Process rows
            result = await self._process_rows(rows, result)

        except Exception as e:
            error_msg = f"Failed to import JSON file {file_path}: {e}"
            logger.error(error_msg, exc_info=True)
            result.success = False
            result.errors.append(error_msg)

        finally:
            result.end_time = datetime.now()
            if result.start_time:
                result.duration_seconds = (result.end_time - result.start_time).total_seconds()

        return result

    async def import_from_csv_streaming(
        self,
        file_path: Union[str, Path],
        encoding: str = "utf-8",
        delimiter: str = ",",
        chunk_size: int = 10000,
    ) -> ImportResult:
        """
        Import data from CSV file using streaming mode.

        Memory-efficient import for large files (>1GB). Reads file in chunks
        without loading entire file into memory.

        Args:
            file_path: Path to CSV file
            encoding: File encoding (default: utf-8)
            delimiter: CSV delimiter (default: comma)
            chunk_size: Number of rows per chunk (default: 10000)

        Returns:
            ImportResult with statistics and performance metrics
        """
        import time

        result = ImportResult(start_time=datetime.now())

        # Initialize performance metrics
        metrics = PerformanceMetrics() if self.track_performance else None
        if metrics:
            metrics.start_time = time.time()
            if self._memory_tracker:
                self._memory_tracker.start_tracking()
                metrics.initial_memory_mb = self._memory_tracker.initial_memory_mb

        try:
            if not PANDAS_AVAILABLE:
                raise ImportError("pandas is required for streaming CSV import")

            # Count total rows for progress tracking
            streaming_reader = StreamingCSVReader(
                str(file_path),
                chunk_size=chunk_size,
                encoding=encoding,
                delimiter=delimiter,
            )
            total_rows = streaming_reader.count_rows()
            if metrics:
                metrics.total_rows = total_rows

            processed_rows = 0
            batch_count = 0

            # Process file in chunks
            async for chunk_df in streaming_reader.read_chunks():
                read_start = time.time()
                rows = chunk_df.to_dict("records")
                if metrics:
                    metrics.read_time_seconds += time.time() - read_start

                # Update progress
                if self.progress_callback:
                    progress_pct = (processed_rows / total_rows) * 100 if total_rows > 0 else 0
                    self.progress_callback(
                        f"Streaming chunk {batch_count + 1}: {processed_rows}/{total_rows} rows",
                        progress_pct,
                    )

                # Process chunk
                transform_start = time.time()
                for row in rows:
                    try:
                        row_entities = await self._row_to_entities(row)
                        row_relations = await self._row_to_relations(row)

                        # Add entities and relations
                        if self.use_bulk_writes and hasattr(self.graph_store, 'add_entities_bulk'):
                            added = await self.graph_store.add_entities_bulk(row_entities)
                            result.entities_added += added
                        else:
                            for entity in row_entities:
                                try:
                                    await self.graph_store.add_entity(entity)
                                    result.entities_added += 1
                                except ValueError:
                                    pass

                        if self.use_bulk_writes and hasattr(self.graph_store, 'add_relations_bulk'):
                            added = await self.graph_store.add_relations_bulk(row_relations)
                            result.relations_added += added
                        else:
                            for relation in row_relations:
                                try:
                                    await self.graph_store.add_relation(relation)
                                    result.relations_added += 1
                                except ValueError:
                                    pass

                        result.rows_processed += 1
                    except Exception as e:
                        result.rows_failed += 1
                        if not self.skip_errors:
                            raise
                        result.warnings.append(f"Row error: {e}")

                if metrics:
                    metrics.transform_time_seconds += time.time() - transform_start

                processed_rows += len(rows)
                batch_count += 1

                # Update memory tracking
                if self._memory_tracker:
                    self._memory_tracker.update()

            # Finalize metrics
            if metrics:
                metrics.end_time = time.time()
                metrics.batch_count = batch_count
                if self._memory_tracker:
                    metrics.peak_memory_mb = self._memory_tracker.peak_memory_mb
                metrics.calculate_throughput()
                result.performance_metrics = metrics

        except Exception as e:
            error_msg = f"Failed to import CSV file (streaming): {e}"
            logger.error(error_msg, exc_info=True)
            result.success = False
            result.errors.append(error_msg)

        finally:
            result.end_time = datetime.now()
            if result.start_time:
                result.duration_seconds = (result.end_time - result.start_time).total_seconds()

        return result

|
|
737
|
+
async def import_from_spss(
|
|
738
|
+
self,
|
|
739
|
+
file_path: Union[str, Path],
|
|
740
|
+
encoding: str = "utf-8",
|
|
741
|
+
preserve_metadata: bool = True,
|
|
742
|
+
) -> ImportResult:
|
|
743
|
+
"""
|
|
744
|
+
Import data from SPSS file (.sav, .por)
|
|
745
|
+
|
|
746
|
+
Uses pyreadstat library to read SPSS files and extract metadata.
|
|
747
|
+
SPSS variable labels and value labels are preserved as entity properties.
|
|
748
|
+
|
|
749
|
+
Args:
|
|
750
|
+
file_path: Path to SPSS file (.sav or .por)
|
|
751
|
+
encoding: File encoding (default: utf-8)
|
|
752
|
+
preserve_metadata: Whether to preserve SPSS metadata (variable labels, value labels)
|
|
753
|
+
|
|
754
|
+
Returns:
|
|
755
|
+
ImportResult with statistics
|
|
756
|
+
"""
|
|
757
|
+
result = ImportResult(start_time=datetime.now())
|
|
758
|
+
|
|
759
|
+
try:
|
|
760
|
+
# Import pyreadstat
|
|
761
|
+
try:
|
|
762
|
+
import pyreadstat # type: ignore[import-untyped]
|
|
763
|
+
except ImportError:
|
|
764
|
+
raise ImportError(
|
|
765
|
+
"pyreadstat is required for SPSS import. "
|
|
766
|
+
"Install with: pip install pyreadstat"
|
|
767
|
+
)
|
|
768
|
+
|
|
769
|
+
if not PANDAS_AVAILABLE:
|
|
770
|
+
raise ImportError("pandas is required for SPSS import. Install with: pip install pandas")
|
|
771
|
+
|
|
772
|
+
# Read SPSS file
|
|
773
|
+
df, meta = pyreadstat.read_sav(str(file_path), encoding=encoding)
|
|
774
|
+
|
|
775
|
+
# Convert DataFrame to list of dictionaries
|
|
776
|
+
rows = df.to_dict("records")
|
|
777
|
+
|
|
778
|
+
# If preserve_metadata is True, add SPSS metadata to each row
|
|
779
|
+
if preserve_metadata and meta:
|
|
780
|
+
# Extract metadata
|
|
781
|
+
spss_metadata = {
|
|
782
|
+
"column_names": meta.column_names if hasattr(meta, 'column_names') else [],
|
|
783
|
+
"column_labels": meta.column_labels if hasattr(meta, 'column_labels') else [],
|
|
784
|
+
"variable_value_labels": meta.variable_value_labels if hasattr(meta, 'variable_value_labels') else {},
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
# Store metadata in result for reference
|
|
788
|
+
if spss_metadata.get('column_labels'):
|
|
789
|
+
result.warnings.append(f"SPSS metadata preserved: {len(spss_metadata['column_labels'])} variable labels")
|
|
790
|
+
|
|
791
|
+
# Add metadata to each row's properties
|
|
792
|
+
for row in rows:
|
|
793
|
+
row["_spss_metadata"] = spss_metadata
|
|
794
|
+
|
|
795
|
+
# Process rows
|
|
796
|
+
result = await self._process_rows(rows, result)
|
|
797
|
+
|
|
798
|
+
except Exception as e:
|
|
799
|
+
error_msg = f"Failed to import SPSS file {file_path}: {e}"
|
|
800
|
+
logger.error(error_msg, exc_info=True)
|
|
801
|
+
result.success = False
|
|
802
|
+
result.errors.append(error_msg)
|
|
803
|
+
|
|
804
|
+
finally:
|
|
805
|
+
result.end_time = datetime.now()
|
|
806
|
+
if result.start_time:
|
|
807
|
+
result.duration_seconds = (result.end_time - result.start_time).total_seconds()
|
|
808
|
+
|
|
809
|
+
return result
|
|
810
|
+
|
|
811
|
+
async def import_from_excel(
|
|
812
|
+
self,
|
|
813
|
+
file_path: Union[str, Path],
|
|
814
|
+
sheet_name: Union[str, int, None] = 0,
|
|
815
|
+
encoding: str = "utf-8",
|
|
816
|
+
header: bool = True,
|
|
817
|
+
) -> ImportResult:
|
|
818
|
+
"""
|
|
819
|
+
Import data from Excel file (.xlsx, .xls)
|
|
820
|
+
|
|
821
|
+
Supports importing from specific sheets or all sheets.
|
|
822
|
+
|
|
823
|
+
Args:
|
|
824
|
+
file_path: Path to Excel file
|
|
825
|
+
sheet_name: Sheet name (str), sheet index (int), or None for all sheets (default: 0 = first sheet)
|
|
826
|
+
encoding: File encoding (default: utf-8)
|
|
827
|
+
header: Whether file has header row (default: True)
|
|
828
|
+
|
|
829
|
+
Returns:
|
|
830
|
+
ImportResult with statistics
|
|
831
|
+
"""
|
|
832
|
+
result = ImportResult(start_time=datetime.now())
|
|
833
|
+
|
|
834
|
+
try:
|
|
835
|
+
if not PANDAS_AVAILABLE:
|
|
836
|
+
raise ImportError("pandas is required for Excel import. Install with: pip install pandas openpyxl")
|
|
837
|
+
|
|
838
|
+
# Read Excel file
|
|
839
|
+
if sheet_name is None:
|
|
840
|
+
# Read all sheets
|
|
841
|
+
excel_data = pd.read_excel(
|
|
842
|
+
file_path,
|
|
843
|
+
sheet_name=None, # Returns dict of sheet_name -> DataFrame
|
|
844
|
+
header=0 if header else None,
|
|
845
|
+
)
|
|
846
|
+
|
|
847
|
+
# Process each sheet
|
|
848
|
+
all_rows = []
|
|
849
|
+
for sheet_name_key, df in excel_data.items():
|
|
850
|
+
sheet_rows = df.to_dict("records")
|
|
851
|
+
# Add sheet name to each row for reference
|
|
852
|
+
for row in sheet_rows:
|
|
853
|
+
row["_excel_sheet"] = sheet_name_key
|
|
854
|
+
all_rows.extend(sheet_rows)
|
|
855
|
+
|
|
856
|
+
rows = all_rows
|
|
857
|
+
result.warnings.append(f"Imported {len(excel_data)} sheets from Excel file")
|
|
858
|
+
|
|
859
|
+
else:
|
|
860
|
+
# Read specific sheet
|
|
861
|
+
df = pd.read_excel(
|
|
862
|
+
file_path,
|
|
863
|
+
sheet_name=sheet_name,
|
|
864
|
+
header=0 if header else None,
|
|
865
|
+
)
|
|
866
|
+
rows = df.to_dict("records")
|
|
867
|
+
|
|
868
|
+
# Process rows
|
|
869
|
+
result = await self._process_rows(rows, result)
|
|
870
|
+
|
|
871
|
+
except Exception as e:
|
|
872
|
+
error_msg = f"Failed to import Excel file {file_path}: {e}"
|
|
873
|
+
logger.error(error_msg, exc_info=True)
|
|
874
|
+
result.success = False
|
|
875
|
+
result.errors.append(error_msg)
|
|
876
|
+
|
|
877
|
+
finally:
|
|
878
|
+
result.end_time = datetime.now()
|
|
879
|
+
if result.start_time:
|
|
880
|
+
result.duration_seconds = (result.end_time - result.start_time).total_seconds()
|
|
881
|
+
|
|
882
|
+
return result
|
|
883
|
+
|
|
884
|
+
    async def reshape_and_import_csv(
        self,
        file_path: Union[str, Path],
        id_vars: Optional[List[str]] = None,
        value_vars: Optional[List[str]] = None,
        var_name: str = 'variable',
        value_name: str = 'value',
        entity_type_hint: Optional[str] = None,
        encoding: str = "utf-8",
    ) -> ImportResult:
        """
        Reshape wide format CSV to normalized structure and import

        Automatically converts wide format data (many columns) to long format
        (normalized structure) before importing into the graph.

        Args:
            file_path: Path to CSV file
            id_vars: Columns to use as identifiers (auto-detected if None)
            value_vars: Columns to unpivot (auto-detected if None)
            var_name: Name for variable column (default: 'variable')
            value_name: Name for value column (default: 'value')
            entity_type_hint: Optional hint for entity type name
            encoding: File encoding (default: utf-8)

        Returns:
            ImportResult with statistics

        Example:
            ```python
            # Wide format: sample_id, option1, option2, ..., option200
            # Will be reshaped to: sample_id, variable, value

            result = await pipeline.reshape_and_import_csv(
                "wide_data.csv",
                id_vars=['sample_id'],
                var_name='option_name',
                value_name='option_value'
            )
            ```
        """
        from aiecs.application.knowledge_graph.builder.data_reshaping import DataReshaping

        result = ImportResult(start_time=datetime.now())

        try:
            if not PANDAS_AVAILABLE:
                raise ImportError("pandas is required for reshaping")

            # Read CSV
            df = pd.read_csv(file_path, encoding=encoding)

            # Auto-detect melt configuration if not provided
            if id_vars is None:
                melt_config = DataReshaping.suggest_melt_config(df)
                id_vars = melt_config['id_vars']
                if value_vars is None:
                    value_vars = melt_config['value_vars']
                result.warnings.append(f"Auto-detected id_vars: {id_vars}")

            # Reshape data
            reshape_result = DataReshaping.melt(
                df,
                id_vars=id_vars,
                value_vars=value_vars,
                var_name=var_name,
                value_name=value_name,
                dropna=True,
            )

            result.warnings.extend(reshape_result.warnings)
            result.warnings.append(
                f"Reshaped from {reshape_result.original_shape} to {reshape_result.new_shape}"
            )

            # Convert reshaped data to rows
            rows = reshape_result.data.to_dict("records")

            # Process rows
            result = await self._process_rows(rows, result)

        except Exception as e:
            error_msg = f"Failed to reshape and import CSV {file_path}: {e}"
            logger.error(error_msg, exc_info=True)
            result.success = False
            result.errors.append(error_msg)

        finally:
            result.end_time = datetime.now()
            if result.start_time:
                result.duration_seconds = (result.end_time - result.start_time).total_seconds()

        return result

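The reshape step above delegates to `DataReshaping.melt`, which appears to follow pandas-style melt semantics. A small self-contained sketch of the underlying wide-to-long transformation using plain pandas (data values are made up):

```python
import pandas as pd

# Wide format: one row per sample, one column per option
wide = pd.DataFrame({
    "sample_id": ["s1", "s2"],
    "option1": [10, 20],
    "option2": [30, None],
})

# Long format: one row per (sample, option) pair, missing values dropped
long = wide.melt(
    id_vars=["sample_id"],
    var_name="option_name",
    value_name="option_value",
).dropna(subset=["option_value"])

print(long)  # 3 rows x 3 columns: sample_id, option_name, option_value
```
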
    async def _process_rows(self, rows: List[Dict[str, Any]], result: ImportResult) -> ImportResult:
        """
        Process rows and convert to entities/relations

        Args:
            rows: List of row dictionaries
            result: ImportResult to update

        Returns:
            Updated ImportResult
        """
        import time

        total_rows = len(rows)

        if total_rows == 0:
            result.warnings.append("No rows to process")
            return result

        # Initialize performance metrics if tracking enabled
        metrics = None
        if self.track_performance:
            metrics = PerformanceMetrics()
            metrics.start_time = time.time()
            metrics.total_rows = total_rows
            if self._memory_tracker:
                self._memory_tracker.start_tracking()
                metrics.initial_memory_mb = self._memory_tracker.initial_memory_mb

        # Determine batch size (auto-tune if enabled)
        batch_size = self.batch_size
        if self._batch_optimizer is not None:
            # Estimate column count from first row
            column_count = len(rows[0]) if rows else 10
            batch_size = self._batch_optimizer.estimate_batch_size(column_count)
            logger.debug(f"Auto-tuned batch size: {batch_size}")

        # Process in batches
        batch_count = 0
        for batch_start in range(0, total_rows, batch_size):
            # Start timing this batch (used for performance metrics and adaptive batch sizing)
            batch_time_start = time.time()

            batch_end = min(batch_start + batch_size, total_rows)
            batch_rows = rows[batch_start:batch_end]

            # Update progress
            if self.progress_callback:
                progress_pct = (batch_end / total_rows) * 100
                self.progress_callback(
                    f"Processing rows {batch_start+1}-{batch_end} of {total_rows}",
                    progress_pct,
                )

            # Process batch
            batch_result = await self._process_batch(batch_rows)
            batch_count += 1

            # Update result
            result.entities_added += batch_result.entities_added
            result.relations_added += batch_result.relations_added
            result.rows_processed += batch_result.rows_processed
            result.rows_failed += batch_result.rows_failed
            result.errors.extend(batch_result.errors)
            result.warnings.extend(batch_result.warnings)

            # Record batch time for adaptive tuning
            if self._batch_optimizer is not None:
                batch_time = time.time() - batch_time_start
                self._batch_optimizer.record_batch_time(batch_time, len(batch_rows))
                # Adjust batch size for next iteration
                batch_size = self._batch_optimizer.adjust_batch_size()

            # Update memory tracking
            if self._memory_tracker:
                self._memory_tracker.update()

        # Finalize performance metrics
        if metrics:
            metrics.end_time = time.time()
            metrics.batch_count = batch_count
            if self._memory_tracker:
                metrics.peak_memory_mb = self._memory_tracker.peak_memory_mb
            metrics.calculate_throughput()
            result.performance_metrics = metrics

        # Apply aggregations after all batches processed
        if self.mapping.aggregations:
            aggregation_results = await self._apply_aggregations()

            # Store aggregated values as summary entities
            for entity_type, properties in aggregation_results.items():
                try:
                    # Create a summary entity with aggregated statistics
                    summary_entity = Entity(
                        id=f"{entity_type}_summary",
                        entity_type=f"{entity_type}Summary",
                        properties=properties,
                    )

                    # Try to add the summary entity (may already exist from previous import)
                    try:
                        await self.graph_store.add_entity(summary_entity)
                        result.entities_added += 1
                    except ValueError:
                        # Entity already exists, try to update if method exists
                        if hasattr(self.graph_store, 'update_entity'):
                            await self.graph_store.update_entity(summary_entity)
                        else:
                            # For stores without update_entity, just skip
                            pass

                    result.warnings.append(
                        f"Applied aggregations to {entity_type}: {list(properties.keys())}"
                    )
                except Exception as e:
                    result.warnings.append(f"Failed to apply aggregations for {entity_type}: {e}")

        return result

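`_process_rows` invokes `self.progress_callback(message, percent)` once per batch. A minimal callback compatible with that call shape might look like the sketch below; how the callback is wired into the pipeline constructor is not shown in this hunk, so the final comment is only indicative.

```python
import logging

logger = logging.getLogger("import_progress")

def log_progress(message: str, percent: float) -> None:
    """Progress callback matching the (message, percent) call made per batch."""
    logger.info("[%5.1f%%] %s", percent, message)

# Pass as progress_callback=log_progress when constructing the pipeline
# (constructor signature not shown in this part of the diff).
```
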
    async def _process_batch(self, rows: List[Dict[str, Any]]) -> ImportResult:
        """
        Process a batch of rows

        Args:
            rows: List of row dictionaries

        Returns:
            ImportResult for this batch
        """
        batch_result = ImportResult()
        batch_result.rows_processed = len(rows)

        # Collect entities and relations
        entities_to_add: List[Entity] = []
        relations_to_add: List[Relation] = []

        for i, row in enumerate(rows):
            try:
                # Convert row to entities
                row_entities = await self._row_to_entities(row)
                entities_to_add.extend(row_entities)

                # Convert row to relations
                row_relations = await self._row_to_relations(row)
                relations_to_add.extend(row_relations)

            except Exception as e:
                error_msg = f"Failed to process row {i+1}: {e}"
                logger.warning(error_msg, exc_info=True)
                batch_result.rows_failed += 1

                if self.skip_errors:
                    batch_result.warnings.append(error_msg)
                else:
                    batch_result.errors.append(error_msg)
                    raise

        # Update aggregation accumulators
        if self.mapping.aggregations:
            self._update_aggregations(rows)

        # Add entities to graph store (use bulk writes if enabled)
        if self.use_bulk_writes and hasattr(self.graph_store, 'add_entities_bulk'):
            try:
                added = await self.graph_store.add_entities_bulk(entities_to_add)
                batch_result.entities_added = added
            except Exception as e:
                error_msg = f"Bulk entity add failed: {e}"
                logger.warning(error_msg)
                batch_result.warnings.append(error_msg)
                if not self.skip_errors:
                    raise
        else:
            for entity in entities_to_add:
                try:
                    await self.graph_store.add_entity(entity)
                    batch_result.entities_added += 1
                except Exception as e:
                    error_msg = f"Failed to add entity {entity.id}: {e}"
                    logger.warning(error_msg)
                    batch_result.warnings.append(error_msg)
                    if not self.skip_errors:
                        raise

        # Add relations to graph store (use bulk writes if enabled)
        if self.use_bulk_writes and hasattr(self.graph_store, 'add_relations_bulk'):
            try:
                added = await self.graph_store.add_relations_bulk(relations_to_add)
                batch_result.relations_added = added
            except Exception as e:
                error_msg = f"Bulk relation add failed: {e}"
                logger.warning(error_msg)
                batch_result.warnings.append(error_msg)
                if not self.skip_errors:
                    raise
        else:
            for relation in relations_to_add:
                try:
                    await self.graph_store.add_relation(relation)
                    batch_result.relations_added += 1
                except Exception as e:
                    error_msg = f"Failed to add relation {relation.id}: {e}"
                    logger.warning(error_msg)
                    batch_result.warnings.append(error_msg)
                    if not self.skip_errors:
                        raise

        return batch_result

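The write path above prefers an optional bulk API and falls back to per-item calls via a `hasattr` capability check. A minimal sketch of that pattern in isolation, using hypothetical store classes that are not aiecs types:

```python
from typing import Any, List

class PerItemStore:
    """Hypothetical store exposing only single-item writes."""
    async def add_entity(self, entity: Any) -> None: ...

class BulkStore(PerItemStore):
    """Hypothetical store that also exposes a bulk write returning a count."""
    async def add_entities_bulk(self, entities: List[Any]) -> int:
        return len(entities)

async def add_all(store: PerItemStore, entities: List[Any]) -> int:
    # Same capability check as _process_batch: prefer bulk writes when available,
    # otherwise fall back to per-item calls.
    if hasattr(store, "add_entities_bulk"):
        return await store.add_entities_bulk(entities)
    added = 0
    for entity in entities:
        await store.add_entity(entity)
        added += 1
    return added
```
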
    async def _row_to_entities(self, row: Dict[str, Any]) -> List[Entity]:
        """
        Convert a row to entities based on entity mappings

        Args:
            row: Dictionary of column name -> value

        Returns:
            List of Entity objects
        """
        entities = []

        for entity_mapping in self.mapping.entity_mappings:
            try:
                # Map row to entity using mapping
                entity_data = entity_mapping.map_row_to_entity(row)

                # Create Entity object
                # Merge metadata into properties since Entity doesn't have a metadata field
                properties = entity_data["properties"].copy()
                properties["_metadata"] = {
                    "source": "structured_data_import",
                    "imported_at": datetime.now().isoformat(),
                }
                entity = Entity(
                    id=entity_data["id"],
                    entity_type=entity_data["type"],
                    properties=properties,
                )

                entities.append(entity)

            except Exception as e:
                error_msg = f"Failed to map row to entity type '{entity_mapping.entity_type}': {e}"
                logger.warning(error_msg)
                if not self.skip_errors:
                    raise ValueError(error_msg)

        return entities

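The `_metadata` merge above yields entity properties of roughly the following shape. Only the `_metadata` keys come from the code; the business fields and timestamp are made up for illustration.

```python
properties = {
    "name": "Acme Corp",    # illustrative mapped column
    "country": "DE",        # illustrative mapped column
    "_metadata": {
        "source": "structured_data_import",
        "imported_at": "2025-01-01T12:00:00",
    },
}
```
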
    async def _row_to_relations(self, row: Dict[str, Any]) -> List[Relation]:
        """
        Convert a row to relations based on relation mappings

        Args:
            row: Dictionary of column name -> value

        Returns:
            List of Relation objects
        """
        relations = []

        for relation_mapping in self.mapping.relation_mappings:
            try:
                # Map row to relation using mapping
                relation_data = relation_mapping.map_row_to_relation(row)

                # Create Relation object
                # Merge metadata into properties since Relation doesn't have a metadata field
                rel_properties = relation_data["properties"].copy()
                rel_properties["_metadata"] = {
                    "source": "structured_data_import",
                    "imported_at": datetime.now().isoformat(),
                }
                relation = Relation(
                    id=f"{relation_data['source_id']}_{relation_data['type']}_{relation_data['target_id']}",
                    relation_type=relation_data["type"],
                    source_id=relation_data["source_id"],
                    target_id=relation_data["target_id"],
                    properties=rel_properties,
                )

                relations.append(relation)

            except Exception as e:
                error_msg = f"Failed to map row to relation type '{relation_mapping.relation_type}': {e}"
                logger.warning(error_msg)
                if not self.skip_errors:
                    raise ValueError(error_msg)

        return relations

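Relation ids are derived deterministically from source id, relation type, and target id. A quick illustration of the f-string used above, with hypothetical values:

```python
relation_data = {
    "source_id": "person_42",
    "type": "WORKS_AT",
    "target_id": "org_7",
}
relation_id = f"{relation_data['source_id']}_{relation_data['type']}_{relation_data['target_id']}"
print(relation_id)  # person_42_WORKS_AT_org_7
```

Because the id is deterministic, re-importing the same row produces the same relation id; whether that leads to a duplicate, a rejection, or an upsert depends on the graph store backend.
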
    def _update_aggregations(self, rows: List[Dict[str, Any]]):
        """
        Update aggregation accumulators with batch data

        Args:
            rows: List of row dictionaries
        """
        from aiecs.application.knowledge_graph.builder.schema_mapping import AggregationFunction

        for entity_agg in self.mapping.aggregations:
            entity_type = entity_agg.entity_type

            # Initialize accumulator for this entity type if needed
            if entity_type not in self._aggregation_accumulators:
                self._aggregation_accumulators[entity_type] = {}

            for agg_config in entity_agg.aggregations:
                target_prop = agg_config.target_property

                # Initialize accumulator for this property if needed
                if target_prop not in self._aggregation_accumulators[entity_type]:
                    self._aggregation_accumulators[entity_type][target_prop] = AggregationAccumulator()

                accumulator = self._aggregation_accumulators[entity_type][target_prop]

                # Add values from rows
                for row in rows:
                    value = row.get(agg_config.source_property)
                    if value is not None:
                        accumulator.add(value)

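A minimal sketch of the streaming accumulation above, where values arrive batch by batch exactly as `_update_aggregations` feeds them in. `AggregationAccumulator` is defined elsewhere in the package; only methods referenced in this file are used, and the printed results assume conventional count/mean semantics.

```python
acc = AggregationAccumulator()
for batch in ([1.0, 2.0], [3.0, 4.0]):
    for value in batch:
        acc.add(value)

print(acc.get_count())  # 4, assuming a conventional count
print(acc.get_mean())   # 2.5, assuming a conventional running mean
```
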
    async def _apply_aggregations(self) -> Dict[str, Dict[str, Any]]:
        """
        Apply aggregations and return computed statistics

        Returns:
            Dictionary of entity_type -> {property -> value}
        """
        from aiecs.application.knowledge_graph.builder.schema_mapping import AggregationFunction

        results = {}

        for entity_agg in self.mapping.aggregations:
            entity_type = entity_agg.entity_type

            if entity_type not in self._aggregation_accumulators:
                continue

            if entity_type not in results:
                results[entity_type] = {}

            for agg_config in entity_agg.aggregations:
                target_prop = agg_config.target_property

                if target_prop not in self._aggregation_accumulators[entity_type]:
                    continue

                accumulator = self._aggregation_accumulators[entity_type][target_prop]

                # Compute aggregated value based on function
                if agg_config.function == AggregationFunction.MEAN:
                    value = accumulator.get_mean()
                elif agg_config.function == AggregationFunction.STD:
                    value = accumulator.get_std()
                elif agg_config.function == AggregationFunction.MIN:
                    value = accumulator.get_min()
                elif agg_config.function == AggregationFunction.MAX:
                    value = accumulator.get_max()
                elif agg_config.function == AggregationFunction.SUM:
                    value = accumulator.get_sum()
                elif agg_config.function == AggregationFunction.COUNT:
                    value = accumulator.get_count()
                elif agg_config.function == AggregationFunction.MEDIAN:
                    value = accumulator.get_median()
                elif agg_config.function == AggregationFunction.VARIANCE:
                    value = accumulator.get_variance()
                else:
                    value = None

                if value is not None:
                    results[entity_type][target_prop] = value

        return results

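The elif chain above could equivalently be expressed as a dispatch table; a sketch of that alternative, meant to slot into `_apply_aggregations` in place of the chain with no behavior change intended:

```python
# Maps each AggregationFunction member to the accumulator getter that computes it.
AGGREGATION_DISPATCH = {
    AggregationFunction.MEAN: lambda acc: acc.get_mean(),
    AggregationFunction.STD: lambda acc: acc.get_std(),
    AggregationFunction.MIN: lambda acc: acc.get_min(),
    AggregationFunction.MAX: lambda acc: acc.get_max(),
    AggregationFunction.SUM: lambda acc: acc.get_sum(),
    AggregationFunction.COUNT: lambda acc: acc.get_count(),
    AggregationFunction.MEDIAN: lambda acc: acc.get_median(),
    AggregationFunction.VARIANCE: lambda acc: acc.get_variance(),
}

getter = AGGREGATION_DISPATCH.get(agg_config.function)
value = getter(accumulator) if getter else None
```
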
    def _create_validator_from_config(self, config: Dict[str, Any]) -> DataQualityValidator:
        """
        Create DataQualityValidator from configuration dictionary

        Args:
            config: Validation configuration dictionary

        Returns:
            Configured DataQualityValidator
        """
        # Parse range rules
        range_rules = {}
        if "range_rules" in config:
            for prop, rule_dict in config["range_rules"].items():
                range_rules[prop] = RangeRule(
                    min_value=rule_dict.get("min"),
                    max_value=rule_dict.get("max")
                )

        # Parse required properties
        required_properties = set(config.get("required_properties", []))

        # Create validation config
        validation_config = ValidationConfig(
            range_rules=range_rules,
            required_properties=required_properties,
            detect_outliers=config.get("detect_outliers", False),
            fail_on_violations=config.get("fail_on_violations", False),
            max_violation_rate=config.get("max_violation_rate", 0.1)
        )

        return DataQualityValidator(validation_config)
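For reference, a configuration dictionary accepted by this parser might look like the sketch below. The key names ("range_rules" with "min"/"max", "required_properties", "detect_outliers", "fail_on_violations", "max_violation_rate") come directly from the code above; the property names and limits are illustrative.

```python
validation_config_dict = {
    "range_rules": {
        "age": {"min": 0, "max": 120},
        "score": {"min": 0.0},   # open-ended maximum
    },
    "required_properties": ["id", "name"],
    "detect_outliers": True,
    "fail_on_violations": False,
    "max_violation_rate": 0.05,
}

# validator = pipeline._create_validator_from_config(validation_config_dict)
```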