aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Reshaping for Knowledge Graph Import
|
|
3
|
+
|
|
4
|
+
Provides utilities to convert wide format data to normalized graph structures
|
|
5
|
+
and vice versa, enabling efficient import of datasets with many columns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional, Dict, Any, Union
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Check for pandas availability
|
|
15
|
+
try:
|
|
16
|
+
import pandas as pd
|
|
17
|
+
PANDAS_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
PANDAS_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ReshapeResult:
    """
    Result of a data reshaping operation.

    Attributes:
        data: Reshaped DataFrame.
        original_shape: Original (rows, cols) shape before reshaping.
        new_shape: New (rows, cols) shape after reshaping.
        id_columns: Columns used as identifiers.
        variable_column: Name of the variable column (set by melt, None for pivot).
        value_column: Name of the value column (set by melt, None for pivot).
        warnings: Non-fatal notes collected during reshaping (never None after
            __post_init__; defaults to an empty list).
    """
    data: 'pd.DataFrame'
    original_shape: tuple
    new_shape: tuple
    id_columns: List[str]
    variable_column: Optional[str] = None
    value_column: Optional[str] = None
    # Annotated Optional because the default is None; __post_init__ normalizes
    # it to a fresh list so each instance gets its own (avoids a shared
    # mutable default and lets callers pass warnings=None explicitly).
    warnings: Optional[List[str]] = None

    def __post_init__(self):
        # Normalize None to a per-instance empty list.
        if self.warnings is None:
            self.warnings = []
|
|
49
|
+
class DataReshaping:
    """
    Utility class for reshaping structured data.

    Provides static helpers to convert between wide and long formats,
    enabling normalized graph structures from wide datasets. All methods
    raise ImportError when pandas is not installed (PANDAS_AVAILABLE is False).
    """

    @staticmethod
    def melt(
        df: 'pd.DataFrame',
        id_vars: List[str],
        value_vars: Optional[List[str]] = None,
        var_name: str = 'variable',
        value_name: str = 'value',
        dropna: bool = True,
    ) -> ReshapeResult:
        """
        Convert wide format to long format (melt operation).

        Transforms data from wide format (many columns) to long format
        (fewer columns, more rows), which is ideal for normalized graph
        structures.

        Args:
            df: DataFrame to reshape.
            id_vars: Columns to use as identifier variables.
            value_vars: Columns to unpivot (default: all columns except id_vars).
            var_name: Name for the variable column (default: 'variable').
            value_name: Name for the value column (default: 'value').
            dropna: Whether to drop rows whose value column is missing
                (default: True).

        Returns:
            ReshapeResult with reshaped data and metadata (original/new shape,
            id columns, variable/value column names, warnings).

        Raises:
            ImportError: If pandas is not available.

        Example:
            ```python
            # Wide format: sample_id, option1, option2, option3
            # Long format: sample_id, variable, value

            result = DataReshaping.melt(
                df,
                id_vars=['sample_id'],
                value_vars=['option1', 'option2', 'option3'],
                var_name='option_name',
                value_name='option_value'
            )
            ```
        """
        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for data reshaping")

        original_shape = df.shape
        warnings = []

        # Default: unpivot every column that is not an identifier.
        if value_vars is None:
            value_vars = [col for col in df.columns if col not in id_vars]
            warnings.append(f"Auto-detected {len(value_vars)} value columns")

        melted = pd.melt(
            df,
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=var_name,
            value_name=value_name,
        )

        # Optionally discard rows whose value ended up missing after the melt.
        if dropna:
            rows_before = len(melted)
            melted = melted.dropna(subset=[value_name])
            rows_dropped = rows_before - len(melted)
            if rows_dropped > 0:
                warnings.append(f"Dropped {rows_dropped} rows with missing values")

        return ReshapeResult(
            data=melted,
            original_shape=original_shape,
            new_shape=melted.shape,
            id_columns=id_vars,
            variable_column=var_name,
            value_column=value_name,
            warnings=warnings,
        )

    @staticmethod
    def pivot(
        df: 'pd.DataFrame',
        index: Union[str, List[str]],
        columns: str,
        values: str,
        aggfunc: str = 'first',
        fill_value: Optional[Any] = None,
    ) -> ReshapeResult:
        """
        Convert long format to wide format (pivot operation).

        Transforms data from long format to wide format, creating columns
        from unique values in the specified column.

        Args:
            df: DataFrame to reshape.
            index: Column(s) to use as index (identifier).
            columns: Column whose unique values become new columns.
            values: Column containing values to populate the new columns.
            aggfunc: Aggregation function if multiple values per group
                (default: 'first').
            fill_value: Value to use for missing data (default: None).

        Returns:
            ReshapeResult with reshaped data and metadata. The variable/value
            column fields are left as None (they only apply to melt).

        Raises:
            ImportError: If pandas is not available.
            ValueError: If the underlying pivot_table call fails; the original
                pandas exception is attached as __cause__.

        Example:
            ```python
            # Long format: sample_id, option_name, option_value
            # Wide format: sample_id, option1, option2, option3

            result = DataReshaping.pivot(
                df,
                index='sample_id',
                columns='option_name',
                values='option_value'
            )
            ```
        """
        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for data reshaping")

        original_shape = df.shape
        warnings = []

        try:
            pivoted = df.pivot_table(
                index=index,
                columns=columns,
                values=values,
                aggfunc=aggfunc,
                fill_value=fill_value,
            )

            # Move the index back into regular columns so downstream code
            # sees a flat DataFrame.
            pivoted = pivoted.reset_index()

            # pivot_table can yield a MultiIndex on columns; flatten it into
            # single underscore-joined names.
            if isinstance(pivoted.columns, pd.MultiIndex):
                pivoted.columns = ['_'.join(map(str, col)).strip('_') for col in pivoted.columns.values]
                warnings.append("Flattened multi-level column names")

        except Exception as e:
            # Chain the original exception so the pandas traceback is kept.
            raise ValueError(f"Pivot operation failed: {e}") from e

        # Normalize index into a list of identifier column names.
        if isinstance(index, str):
            id_columns = [index]
        else:
            id_columns = list(index)

        return ReshapeResult(
            data=pivoted,
            original_shape=original_shape,
            new_shape=pivoted.shape,
            id_columns=id_columns,
            warnings=warnings,
        )

    @staticmethod
    def detect_wide_format(
        df: 'pd.DataFrame',
        threshold_columns: int = 50,
    ) -> bool:
        """
        Detect if a DataFrame is in wide format.

        Wide format is characterized by many columns relative to rows.

        Args:
            df: DataFrame to analyze.
            threshold_columns: Minimum number of columns to consider wide
                (default: 50).

        Returns:
            True if the DataFrame appears to be in wide format. Always False
            when pandas is unavailable.
        """
        if not PANDAS_AVAILABLE:
            return False

        num_cols = len(df.columns)
        num_rows = len(df)

        # Wide format indicators:
        # 1. Many columns (>= threshold), or
        # 2. At least 20 columns AND more columns than half the row count.
        is_wide = num_cols >= threshold_columns or (num_cols >= 20 and num_cols > num_rows * 0.5)

        return is_wide

    @staticmethod
    def suggest_melt_config(
        df: 'pd.DataFrame',
        id_column_patterns: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Suggest a melt configuration for wide-format data.

        Analyzes the DataFrame's column names to suggest appropriate
        id_vars and value_vars.

        Args:
            df: DataFrame to analyze.
            id_column_patterns: Substring patterns (matched case-insensitively)
                that identify ID columns
                (default: ['id', 'key', 'name', 'sample', 'subject']).

        Returns:
            Dictionary with suggested melt configuration: 'id_vars',
            'value_vars', 'var_name', 'value_name', and a heuristic
            'confidence' score (0.8 when ID columns were matched by pattern,
            0.5 when the first column was used as a fallback).

        Raises:
            ImportError: If pandas is not available.
        """
        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for data reshaping")

        if id_column_patterns is None:
            id_column_patterns = ['id', 'key', 'name', 'sample', 'subject']

        # Collect columns whose (lowercased) name contains any ID pattern.
        id_vars = []
        for col in df.columns:
            col_lower = col.lower()
            if any(pattern in col_lower for pattern in id_column_patterns):
                id_vars.append(col)

        # Fall back to the first column when nothing matched.
        if not id_vars and len(df.columns) > 0:
            id_vars = [df.columns[0]]

        # Everything that is not an identifier becomes a value column.
        value_vars = [col for col in df.columns if col not in id_vars]

        return {
            'id_vars': id_vars,
            'value_vars': value_vars,
            'var_name': 'variable',
            'value_name': 'value',
            'confidence': 0.8 if id_vars else 0.5,
        }
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Graph Builder
|
|
3
|
+
|
|
4
|
+
Builds knowledge graphs from documents (PDF, DOCX, TXT, etc.).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Dict, Any, Union
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
from aiecs.application.knowledge_graph.builder.graph_builder import (
|
|
13
|
+
GraphBuilder,
|
|
14
|
+
BuildResult,
|
|
15
|
+
)
|
|
16
|
+
from aiecs.application.knowledge_graph.builder.text_chunker import TextChunker
|
|
17
|
+
from aiecs.tools.docs.document_parser_tool import (
|
|
18
|
+
DocumentParserTool,
|
|
19
|
+
ParsingStrategy,
|
|
20
|
+
OutputFormat,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class DocumentBuildResult:
    """
    Result of a document-to-graph build operation.

    Extends BuildResult with document-specific information: the source
    document, chunking statistics, and the per-chunk build results.
    """

    document_path: str
    document_type: str
    total_chunks: int = 0
    chunks_processed: int = 0
    chunk_results: List[BuildResult] = field(default_factory=list)
    success: bool = True
    errors: List[str] = field(default_factory=list)

    @property
    def total_entities_added(self) -> int:
        """Total entities added across all chunks."""
        total = 0
        for chunk_result in self.chunk_results:
            total += chunk_result.entities_added
        return total

    @property
    def total_relations_added(self) -> int:
        """Total relations added across all chunks."""
        total = 0
        for chunk_result in self.chunk_results:
            total += chunk_result.relations_added
        return total
|
|
51
|
+
class DocumentGraphBuilder:
    """
    Build knowledge graphs from documents

    Supports multiple document formats:
    - PDF
    - DOCX (Microsoft Word)
    - TXT (Plain text)
    - And more via AIECS DocumentParserTool

    For large documents, automatically chunks text into manageable pieces.

    Example:
        ```python
        builder = DocumentGraphBuilder(
            graph_builder=graph_builder,
            chunk_size=1000
        )

        result = await builder.build_from_document("research_paper.pdf")

        print(f"Processed {result.total_chunks} chunks")
        print(f"Added {result.total_entities_added} entities")
        print(f"Added {result.total_relations_added} relations")
        ```
    """

    def __init__(
        self,
        graph_builder: GraphBuilder,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        enable_chunking: bool = True,
        parallel_chunks: bool = True,
        max_parallel_chunks: int = 3,
    ):
        """
        Initialize document graph builder

        Args:
            graph_builder: GraphBuilder instance for text processing
            chunk_size: Size of text chunks (in characters)
            chunk_overlap: Overlap between chunks
            enable_chunking: Whether to chunk large documents
            parallel_chunks: Process chunks in parallel
            max_parallel_chunks: Maximum parallel chunk processing
        """
        self.graph_builder = graph_builder
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enable_chunking = enable_chunking
        self.parallel_chunks = parallel_chunks
        self.max_parallel_chunks = max_parallel_chunks

        # Initialize document parser (will read config from environment
        # variables)
        self.document_parser = DocumentParserTool()

        # Initialize text chunker
        self.text_chunker = TextChunker(
            chunk_size=chunk_size,
            overlap=chunk_overlap,
            respect_sentences=True,
        )

    async def build_from_document(
        self,
        document_path: Union[str, Path],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> DocumentBuildResult:
        """
        Build knowledge graph from a document

        Args:
            document_path: Path to document file
            metadata: Optional metadata to attach to extracted entities/relations

        Returns:
            DocumentBuildResult with statistics. Partial success is allowed:
            ``success`` is True as long as at least one chunk processed cleanly.
        """
        document_path = str(document_path)
        # Determine document type from the extension up front so it is
        # recorded even when parsing later fails (no extension -> "unknown").
        suffix = Path(document_path).suffix[1:].lower()
        result = DocumentBuildResult(
            document_path=document_path,
            document_type=suffix or "unknown",
        )

        try:
            # Step 1: Parse document to text
            text = await self._parse_document(document_path)

            if not text or not text.strip():
                result.success = False
                result.errors.append("Document parsing returned empty text")
                return result

            # Step 2: Chunk text if needed
            if self.enable_chunking and len(text) > self.chunk_size:
                chunks = self.text_chunker.chunk_text(text, metadata={"document": document_path})
                result.total_chunks = len(chunks)
            else:
                # Single chunk (small document). Imported lazily, matching
                # the original code's deferred dependency on TextChunk.
                from aiecs.application.knowledge_graph.builder.text_chunker import (
                    TextChunk,
                )

                chunks = [
                    TextChunk(
                        text=text,
                        start_char=0,
                        end_char=len(text),
                        chunk_index=0,
                        metadata={"document": document_path},
                    )
                ]
                result.total_chunks = 1

            # Step 3: Process each chunk
            if self.parallel_chunks and len(chunks) > 1:
                chunk_results = await self._process_chunks_parallel(chunks, document_path, metadata)
            else:
                chunk_results = await self._process_chunks_sequential(chunks, document_path, metadata)

            result.chunk_results = chunk_results
            result.chunks_processed = len(chunk_results)

            # Check if all chunks succeeded
            failed_chunks = [r for r in chunk_results if not r.success]
            if failed_chunks:
                result.errors.append(f"{len(failed_chunks)} chunks failed processing")

            result.success = len(failed_chunks) < len(chunks)  # At least some chunks succeeded

        except Exception as e:
            result.success = False
            result.errors.append(f"Document processing failed: {str(e)}")

        return result

    async def build_from_documents(
        self,
        document_paths: List[Union[str, Path]],
        parallel: bool = True,
        max_parallel: int = 3,
    ) -> List[DocumentBuildResult]:
        """
        Build knowledge graph from multiple documents

        Args:
            document_paths: List of document paths
            parallel: Process documents in parallel
            max_parallel: Maximum parallel documents

        Returns:
            List of DocumentBuildResult objects, one per input path, in order.
            Failures are converted to error results rather than raised.
        """
        if not parallel:
            # Sequential processing
            results: List[DocumentBuildResult] = []
            for doc_path in document_paths:
                results.append(await self.build_from_document(doc_path))
            return results

        semaphore = asyncio.Semaphore(max_parallel)

        async def process_one(doc_path):
            async with semaphore:
                return await self.build_from_document(doc_path)

        tasks = [process_one(doc_path) for doc_path in document_paths]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        def _error_result(path: Union[str, Path], message: str) -> DocumentBuildResult:
            # Normalize any failure into a DocumentBuildResult so callers
            # always get one result per input document.
            res = DocumentBuildResult(
                document_path=str(path),
                document_type="unknown",
                success=False,
            )
            res.errors.append(message)
            return res

        # Handle exceptions - convert all to DocumentBuildResult
        results = []
        for path, outcome in zip(document_paths, gather_results):
            if isinstance(outcome, DocumentBuildResult):
                results.append(outcome)
            elif isinstance(outcome, Exception):
                results.append(_error_result(path, str(outcome)))
            else:
                # Fallback for unexpected types
                results.append(_error_result(path, f"Unexpected result type: {type(outcome)}"))

        return results

    async def _parse_document(self, document_path: str) -> str:
        """
        Parse document to text using AIECS document parser

        Args:
            document_path: Path to document

        Returns:
            Extracted text content

        Raises:
            RuntimeError: If both the document parser and the plain-text
                fallback fail. The original parser error is chained as
                ``__cause__`` so the root cause is preserved.
        """
        try:
            # Use document parser tool
            parse_result = self.document_parser.parse_document(
                source=document_path,
                strategy=ParsingStrategy.TEXT_ONLY,
                output_format=OutputFormat.TEXT,
            )

            if isinstance(parse_result, dict):
                return parse_result.get("content", "")
            elif isinstance(parse_result, str):
                return parse_result
            else:
                return ""

        except Exception as parse_error:
            # Fallback: try reading as plain text
            try:
                with open(document_path, "r", encoding="utf-8") as f:
                    return f.read()
            except Exception as fallback_error:
                # Bug fix: chain the original parser error instead of
                # silently discarding it, so the traceback shows why the
                # parser itself failed.
                raise RuntimeError(f"Failed to parse document: {str(fallback_error)}") from parse_error

    def _chunk_metadata(
        self,
        chunk,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Build per-chunk metadata (shared by parallel and sequential paths)."""
        chunk_metadata = {
            "document": document_path,
            "chunk_index": chunk.chunk_index,
            "chunk_start": chunk.start_char,
            "chunk_end": chunk.end_char,
        }
        if metadata:
            chunk_metadata.update(metadata)
        return chunk_metadata

    async def _build_chunk(
        self,
        chunk,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> BuildResult:
        """Run the graph builder on a single chunk."""
        source = f"{document_path}#chunk{chunk.chunk_index}"
        return await self.graph_builder.build_from_text(
            text=chunk.text,
            source=source,
            metadata=self._chunk_metadata(chunk, document_path, metadata),
        )

    async def _process_chunks_parallel(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks in parallel

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects (failures converted to error results)
        """
        semaphore = asyncio.Semaphore(self.max_parallel_chunks)

        async def process_chunk(chunk):
            async with semaphore:
                return await self._build_chunk(chunk, document_path, metadata)

        tasks = [process_chunk(chunk) for chunk in chunks]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions - convert all to BuildResult
        results: List[BuildResult] = []
        for i, outcome in enumerate(gather_results):
            if isinstance(outcome, BuildResult):
                results.append(outcome)
            elif isinstance(outcome, Exception):
                error_result = BuildResult(success=False)
                error_result.errors.append(f"Chunk {i} failed: {str(outcome)}")
                results.append(error_result)
            else:
                # Fallback for unexpected types
                error_result = BuildResult(success=False)
                error_result.errors.append(f"Unexpected result type: {type(outcome)}")
                results.append(error_result)

        return results

    async def _process_chunks_sequential(
        self,
        chunks: List,
        document_path: str,
        metadata: Optional[Dict[str, Any]],
    ) -> List[BuildResult]:
        """
        Process chunks sequentially

        Args:
            chunks: List of TextChunk objects
            document_path: Source document path
            metadata: Optional metadata

        Returns:
            List of BuildResult objects
        """
        results = []
        for chunk in chunks:
            results.append(await self._build_chunk(chunk, document_path, metadata))
        return results
|