aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
aiecs/tools/docs/document_parser_tool.py (new file)
@@ -0,0 +1,1011 @@
import os
import re
import logging
import asyncio
from typing import Dict, Any, List, Optional, Union, Tuple
from enum import Enum
from urllib.parse import urlparse
from pathlib import Path
import tempfile

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

from aiecs.tools.base_tool import BaseTool
from aiecs.tools import register_tool


class DocumentType(str, Enum):
    """Supported document types for parsing"""

    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    PPTX = "pptx"
    TXT = "txt"
    HTML = "html"
    RTF = "rtf"
    CSV = "csv"
    JSON = "json"
    XML = "xml"
    MARKDOWN = "md"
    IMAGE = "image"
    UNKNOWN = "unknown"


class ParsingStrategy(str, Enum):
    """Document parsing strategies"""

    TEXT_ONLY = "text_only"
    STRUCTURED = "structured"
    FULL_CONTENT = "full_content"
    METADATA_ONLY = "metadata_only"


class OutputFormat(str, Enum):
    """Output formats for parsed content"""

    TEXT = "text"
    JSON = "json"
    MARKDOWN = "markdown"
    HTML = "html"


class DocumentParserError(Exception):
    """Base exception for document parser errors"""


class UnsupportedDocumentError(DocumentParserError):
    """Raised when document type is not supported"""


class DownloadError(DocumentParserError):
    """Raised when document download fails"""


class ParseError(DocumentParserError):
    """Raised when document parsing fails"""


@register_tool("document_parser")
class DocumentParserTool(BaseTool):
    """
    Modern high-performance document parsing component that can:
    1. Auto-detect document types from URLs or files
    2. Download documents from URLs
    3. Parse various document formats using existing atomic tools
    4. Output structured content for AI consumption

    Leverages existing tools:
    - ScraperTool for URL downloading
    - OfficeTool for Office document parsing
    - ImageTool for image OCR

    Configuration:
        Configuration is automatically loaded by BaseTool from:
        1. Explicit config dict (highest priority) - passed to constructor
        2. YAML config files - config/tools/document_parser_tool.yaml or config/tools.yaml (see examples/config/tools/ for examples)
        3. Environment variables - from .env files via dotenv (DOC_PARSER_ prefix)
        4. Tool defaults - defined in Config class Field defaults (lowest priority)

    Example usage:
        # Basic usage (automatic configuration)
        tool = get_tool("document_parser")

        # With explicit config override
        tool = get_tool("document_parser", config={"timeout": 120})

        # Configuration files:
        # - Runtime config: config/tools/document_parser_tool.yaml (see examples/config/tools/ for examples)
        # - Sensitive config: .env file with DOC_PARSER_* variables

    See docs/developer/TOOLS/TOOL_CONFIGURATION_EXAMPLES.md for more examples.
    """

    # Configuration schema
    class Config(BaseSettings):
        """Configuration for the document parser tool

        Configuration is automatically loaded by BaseTool using ToolConfigLoader.
        Supports loading from:
        - YAML files: config/tools/document_parser_tool.yaml (see examples/config/tools/ for examples)
        - Environment variables: DOC_PARSER_* (from .env files via dotenv)
        - Explicit config dict: passed to constructor

        Environment variable prefix: DOC_PARSER_
        Example: DOC_PARSER_GCS_PROJECT_ID -> gcs_project_id
        Example: DOC_PARSER_TIMEOUT -> timeout
        """

        model_config = SettingsConfigDict(env_prefix="DOC_PARSER_")

        user_agent: str = Field(
            default="DocumentParser/1.0",
            description="User agent for HTTP requests",
        )
        max_file_size: int = Field(default=50 * 1024 * 1024, description="Maximum file size in bytes")
        temp_dir: str = Field(
            default=os.path.join(tempfile.gettempdir(), "document_parser"),
            description="Temporary directory for document processing",
        )
        default_encoding: str = Field(default="utf-8", description="Default encoding for text files")
        timeout: int = Field(default=30, description="Timeout for HTTP requests in seconds")
        max_pages: int = Field(
            default=1000,
            description="Maximum number of pages to process for large documents",
        )
        enable_cloud_storage: bool = Field(
            default=True,
            description="Whether to enable cloud storage integration",
        )
        gcs_bucket_name: str = Field(
            default="aiecs-documents",
            description="Google Cloud Storage bucket name",
        )
        gcs_project_id: Optional[str] = Field(default=None, description="Google Cloud Storage project ID")

    def __init__(self, config: Optional[Dict] = None, **kwargs):
        """Initialize DocumentParserTool with settings

        Configuration is automatically loaded by BaseTool from:
        1. Explicit config dict (highest priority)
        2. YAML config files (config/tools/document_parser_tool.yaml)
        3. Environment variables (via dotenv from .env files)
        4. Tool defaults (lowest priority)

        Args:
            config: Optional configuration overrides
            **kwargs: Additional arguments passed to BaseTool (e.g., tool_name)
        """
        super().__init__(config, **kwargs)

        # Configuration is automatically loaded by BaseTool into self._config_obj
        # Access config via self._config_obj (BaseSettings instance)
        self.config = self._config_obj if self._config_obj else self.Config()

        self.logger = logging.getLogger(__name__)
        os.makedirs(self.config.temp_dir, exist_ok=True)

        # Initialize dependent tools
        self._init_dependent_tools()

        # Initialize cloud storage
        self._init_cloud_storage()

    def _init_dependent_tools(self):
        """Initialize dependent tools for document processing"""
        try:
            from aiecs.tools.task_tools.scraper_tool import ScraperTool

            self.scraper_tool = ScraperTool()
        except ImportError:
            self.logger.warning("ScraperTool not available")
            self.scraper_tool = None

        try:
            from aiecs.tools.task_tools.office_tool import OfficeTool

            self.office_tool = OfficeTool()
        except ImportError:
            self.logger.warning("OfficeTool not available")
            self.office_tool = None

        try:
            from aiecs.tools.task_tools.image_tool import ImageTool

            self.image_tool = ImageTool()
        except ImportError:
            self.logger.warning("ImageTool not available")
            self.image_tool = None

    def _init_cloud_storage(self):
        """Initialize cloud storage for document retrieval"""
        self.file_storage = None

        if self.config.enable_cloud_storage:
            try:
                from aiecs.infrastructure.persistence.file_storage import (
                    FileStorage,
                )

                storage_config = {
                    "gcs_bucket_name": self.config.gcs_bucket_name,
                    "gcs_project_id": self.config.gcs_project_id,
                    "enable_local_fallback": True,
                    "local_storage_path": self.config.temp_dir,
                }

                self.file_storage = FileStorage(storage_config)
                asyncio.create_task(self._init_storage_async())

            except ImportError:
                self.logger.warning("FileStorage not available, cloud storage disabled")
            except Exception as e:
                self.logger.warning(f"Failed to initialize cloud storage: {e}")

    async def _init_storage_async(self):
        """Async initialization of file storage"""
        try:
            if self.file_storage:
                await self.file_storage.initialize()
                self.logger.info("Cloud storage initialized successfully")
        except Exception as e:
            self.logger.warning(f"Cloud storage initialization failed: {e}")
            self.file_storage = None

    # Schema definitions
    class Parse_documentSchema(BaseModel):
        """Schema for parse_document operation"""

        source: str = Field(description="URL or file path to the document")
        strategy: ParsingStrategy = Field(
            default=ParsingStrategy.FULL_CONTENT,
            description="Parsing strategy",
        )
        output_format: OutputFormat = Field(default=OutputFormat.JSON, description="Output format")
        force_type: Optional[DocumentType] = Field(default=None, description="Force document type detection")
        extract_metadata: bool = Field(default=True, description="Whether to extract metadata")
        chunk_size: Optional[int] = Field(default=None, description="Chunk size for large documents")

    class Detect_document_typeSchema(BaseModel):
        """Schema for detect_document_type operation"""

        source: str = Field(description="URL or file path to analyze")
        download_sample: bool = Field(
            default=True,
            description="Download sample for content-based detection",
        )

    def detect_document_type(self, source: str, download_sample: bool = True) -> Dict[str, Any]:
        """
        Detect document type from URL or file path

        Args:
            source: URL or file path
            download_sample: Whether to download sample for content analysis

        Returns:
            Dict containing detected type and confidence
        """
        try:
            result: Dict[str, Any] = {
                "source": source,
                "is_url": self._is_url(source),
                "detected_type": DocumentType.UNKNOWN,
                "confidence": 0.0,
                "mime_type": None,
                "file_extension": None,
                "file_size": None,
                "detection_methods": [],
            }

            # Method 1: File extension analysis
            extension_type, ext_confidence = self._detect_by_extension(source)
            if extension_type != DocumentType.UNKNOWN:
                result["detected_type"] = extension_type
                result["confidence"] = ext_confidence
                # Extract extension correctly for URLs and local paths
                if self._is_url(source):
                    parsed = urlparse(source)
                    result["file_extension"] = Path(parsed.path).suffix.lower()
                else:
                    result["file_extension"] = Path(source).suffix.lower()
                result["detection_methods"].append("file_extension")

            # Method 2: MIME type detection (for URLs)
            if self._is_url(source) and download_sample:
                mime_type, mime_confidence = self._detect_by_mime_type(source)
                confidence = result.get("confidence", 0.0)
                if isinstance(confidence, (int, float)) and mime_type != DocumentType.UNKNOWN and mime_confidence > confidence:
                    result["detected_type"] = mime_type
                    result["confidence"] = mime_confidence
                    result["detection_methods"].append("mime_type")

            # Method 3: Content-based detection
            if download_sample:
                content_type, content_confidence = self._detect_by_content(source)
                confidence = result.get("confidence", 0.0)
                if isinstance(confidence, (int, float)) and content_type != DocumentType.UNKNOWN and content_confidence > confidence:
                    result["detected_type"] = content_type
                    result["confidence"] = content_confidence
                    result["detection_methods"].append("content_analysis")

            return result

        except Exception as e:
            raise DocumentParserError(f"Document type detection failed: {str(e)}")

    def parse_document(
        self,
        source: str,
        strategy: ParsingStrategy = ParsingStrategy.FULL_CONTENT,
        output_format: OutputFormat = OutputFormat.JSON,
        force_type: Optional[DocumentType] = None,
        extract_metadata: bool = True,
        chunk_size: Optional[int] = None,
    ) -> Dict[str, Any]:
        """
        Parse document from URL or file path

        Args:
            source: URL or file path to document
            strategy: Parsing strategy to use
            output_format: Format for output content
            force_type: Force specific document type
            extract_metadata: Whether to extract metadata
            chunk_size: Chunk size for large documents

        Returns:
            Dict containing parsed content and metadata
        """
        try:
            # Step 1: Detect document type
            if force_type:
                doc_type = force_type
                confidence = 1.0
            else:
                detection_result = self.detect_document_type(source)
                doc_type = detection_result["detected_type"]
                confidence = detection_result["confidence"]

                if confidence < 0.5:
                    raise UnsupportedDocumentError(f"Unable to reliably detect document type for: {source}")

            # Step 2: Download document if it's a URL
            local_path = self._ensure_local_file(source)

            # Step 3: Parse document based on type and strategy
            content = self._parse_by_type(local_path, doc_type, strategy)

            # Step 4: Extract metadata if requested
            metadata = {}
            if extract_metadata:
                metadata = self._extract_metadata(local_path, doc_type)

            # Step 5: Format output
            result = {
                "source": source,
                "document_type": doc_type,
                "detection_confidence": confidence,
                "parsing_strategy": strategy,
                "metadata": metadata,
                "content": content,
                "content_stats": self._calculate_content_stats(content),
                "chunks": [],
            }

            # Step 6: Create chunks if requested
            if chunk_size and isinstance(content, str):
                result["chunks"] = self._create_chunks(content, chunk_size)

            # Step 7: Format output according to requested format
            if output_format == OutputFormat.TEXT:
                return {"text": self._format_as_text(result)}
            elif output_format == OutputFormat.MARKDOWN:
                return {"markdown": self._format_as_markdown(result)}
            elif output_format == OutputFormat.HTML:
                return {"html": self._format_as_html(result)}
            else:
                return result

        except Exception as e:
            if isinstance(e, DocumentParserError):
                raise
            raise ParseError(f"Document parsing failed: {str(e)}")
        finally:
            # Cleanup temporary files
            self._cleanup_temp_files(source)

    async def parse_document_async(
        self,
        source: str,
        strategy: ParsingStrategy = ParsingStrategy.FULL_CONTENT,
        output_format: OutputFormat = OutputFormat.JSON,
        force_type: Optional[DocumentType] = None,
        extract_metadata: bool = True,
        chunk_size: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Async version of parse_document"""
        return await asyncio.to_thread(
            self.parse_document,
            source=source,
            strategy=strategy,
            output_format=output_format,
            force_type=force_type,
            extract_metadata=extract_metadata,
            chunk_size=chunk_size,
        )

    def _is_url(self, source: str) -> bool:
        """Check if source is a URL"""
        try:
            result = urlparse(source)
            return bool(result.scheme and result.netloc)
        except Exception:
            return False

    def _is_cloud_storage_path(self, source: str) -> bool:
        """Check if source is a cloud storage path"""
        # Support various cloud storage path formats:
        # - gs://bucket/path/file.pdf (Google Cloud Storage)
        # - s3://bucket/path/file.pdf (AWS S3)
        # - azure://container/path/file.pdf (Azure Blob Storage)
        # - cloud://path/file.pdf (Generic cloud storage)
        cloud_schemes = ["gs", "s3", "azure", "cloud"]
        try:
            parsed = urlparse(source)
            return parsed.scheme in cloud_schemes
        except Exception:
            return False

    def _is_storage_id(self, source: str) -> bool:
        """Check if source is a storage ID (UUID-like identifier)"""
        # Check for UUID patterns or other storage ID formats
        import re

        uuid_pattern = r"^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$"
        storage_id_pattern = r"^[a-zA-Z0-9_-]{10,}$"  # Generic storage ID

        return bool(re.match(uuid_pattern, source, re.IGNORECASE) or re.match(storage_id_pattern, source))

    def _detect_by_extension(self, source: str) -> Tuple[DocumentType, float]:
        """Detect document type by file extension"""
        try:
            # For URLs, parse the URL first to extract the path without query parameters
            if self._is_url(source):
                parsed = urlparse(source)
                # Extract extension from the URL path, not from the full URL
                path = Path(parsed.path)
                ext = path.suffix.lower()
            else:
                # For local file paths, use Path directly
                path = Path(source)
                ext = path.suffix.lower()

            extension_map = {
                ".pdf": DocumentType.PDF,
                ".docx": DocumentType.DOCX,
                ".doc": DocumentType.DOCX,
                ".xlsx": DocumentType.XLSX,
                ".xls": DocumentType.XLSX,
                ".pptx": DocumentType.PPTX,
                ".ppt": DocumentType.PPTX,
                ".txt": DocumentType.TXT,
                ".html": DocumentType.HTML,
                ".htm": DocumentType.HTML,
                ".rtf": DocumentType.RTF,
                ".csv": DocumentType.CSV,
                ".json": DocumentType.JSON,
                ".xml": DocumentType.XML,
                ".md": DocumentType.MARKDOWN,
                ".markdown": DocumentType.MARKDOWN,
                ".jpg": DocumentType.IMAGE,
                ".jpeg": DocumentType.IMAGE,
                ".png": DocumentType.IMAGE,
                ".gif": DocumentType.IMAGE,
                ".bmp": DocumentType.IMAGE,
                ".tiff": DocumentType.IMAGE,
            }

            doc_type = extension_map.get(ext, DocumentType.UNKNOWN)
            confidence = 0.8 if doc_type != DocumentType.UNKNOWN else 0.0

            return doc_type, confidence

        except Exception:
            return DocumentType.UNKNOWN, 0.0

    def _detect_by_mime_type(self, url: str) -> Tuple[DocumentType, float]:
        """Detect document type by MIME type from URL"""
        try:
            if not self.scraper_tool:
                return DocumentType.UNKNOWN, 0.0

            # Get headers only
            response = asyncio.run(self.scraper_tool.get_httpx(url, method="HEAD", verify_ssl=False))

            content_type = response.get("headers", {}).get("content-type", "").lower()

            mime_map = {
                "application/pdf": DocumentType.PDF,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocumentType.DOCX,
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": DocumentType.XLSX,
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": DocumentType.PPTX,
                "text/plain": DocumentType.TXT,
                "text/html": DocumentType.HTML,
                "application/rtf": DocumentType.RTF,
                "text/csv": DocumentType.CSV,
                "application/json": DocumentType.JSON,
                "application/xml": DocumentType.XML,
                "text/xml": DocumentType.XML,
                "text/markdown": DocumentType.MARKDOWN,
                "image/jpeg": DocumentType.IMAGE,
                "image/png": DocumentType.IMAGE,
                "image/gif": DocumentType.IMAGE,
                "image/bmp": DocumentType.IMAGE,
                "image/tiff": DocumentType.IMAGE,
            }

            for mime_pattern, doc_type in mime_map.items():
                if mime_pattern in content_type:
                    return doc_type, 0.9

            return DocumentType.UNKNOWN, 0.0

        except Exception:
            return DocumentType.UNKNOWN, 0.0

    def _detect_by_content(self, source: str) -> Tuple[DocumentType, float]:
        """Detect document type by content analysis"""
        try:
            # Download a small sample for analysis
            if self._is_url(source):
                sample_path = self._download_sample(source, max_size=1024)  # 1KB sample
            else:
                sample_path = source

            with open(sample_path, "rb") as f:
                header = f.read(512)  # Read first 512 bytes

            # Magic number detection
            if header.startswith(b"%PDF"):
                return DocumentType.PDF, 0.95
            elif header.startswith(b"PK\x03\x04"):  # ZIP-based formats
                if b"word/" in header or b"document.xml" in header:
                    return DocumentType.DOCX, 0.9
                elif b"xl/" in header or b"workbook.xml" in header:
                    return DocumentType.XLSX, 0.9
                elif b"ppt/" in header or b"presentation.xml" in header:
                    return DocumentType.PPTX, 0.9
            elif header.startswith(b"{\\rtf"):
                return DocumentType.RTF, 0.95
            elif header.startswith((b"\xff\xd8\xff", b"\x89PNG", b"GIF8")):
                return DocumentType.IMAGE, 0.95
            elif header.startswith(b"<?xml"):
                return DocumentType.XML, 0.9
            elif header.startswith((b"{", b"[")):
                # Try to parse as JSON
                try:
                    import json

                    json.loads(header.decode("utf-8", errors="ignore"))
                    return DocumentType.JSON, 0.85
                except Exception:
                    pass

            # Text-based detection
            try:
                text_content = header.decode("utf-8", errors="ignore")
                if re.match(r"^#\s+.*$", text_content, re.MULTILINE):
                    return DocumentType.MARKDOWN, 0.7
                elif "<html" in text_content.lower() or "<!doctype html" in text_content.lower():
                    return DocumentType.HTML, 0.85
                elif "," in text_content and "\n" in text_content:
                    # Simple CSV detection
                    lines = text_content.split("\n")[:5]
                    if all("," in line for line in lines if line.strip()):
                        return DocumentType.CSV, 0.6
            except Exception:
                pass

            return DocumentType.UNKNOWN, 0.0

        except Exception:
            return DocumentType.UNKNOWN, 0.0

    def _ensure_local_file(self, source: str) -> str:
        """Ensure we have a local file, download/retrieve if necessary"""
        # Check source type and handle accordingly
        if self._is_cloud_storage_path(source) or self._is_storage_id(source):
            # Download from cloud storage
            return asyncio.run(self._download_from_cloud_storage(source))
        elif self._is_url(source):
            # Download from URL
            return self._download_document(source)
        else:
            # Local file path
            if not os.path.exists(source):
                raise FileNotFoundError(f"File not found: {source}")
            return source

    def _download_document(self, url: str) -> str:
        """Download document from URL"""
        try:
            if not self.scraper_tool:
                raise DownloadError("ScraperTool not available for URL download")

            # Generate temp file path
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path) or "document"
            temp_path = os.path.join(self.config.temp_dir, f"download_{hash(url)}_{filename}")

            # Download using scraper tool
            result = asyncio.run(
                self.scraper_tool.get_httpx(
                    url,
                    content_type="binary",
                    output_path=temp_path,
                    verify_ssl=False,
                )
            )

            if isinstance(result, dict) and "saved_to" in result:
                return result["saved_to"]
            else:
                # Fallback: save content manually
                with open(temp_path, "wb") as f:
                    if isinstance(result, dict) and "content" in result:
                        f.write(result["content"])
                    else:
                        f.write(result)
                return temp_path

        except Exception as e:
            raise DownloadError(f"Failed to download document from {url}: {str(e)}")

    async def _download_from_cloud_storage(self, source: str) -> str:
        """Download document from cloud storage"""
        if not self.file_storage:
            raise DownloadError("Cloud storage not available")

        try:
            # Parse the cloud storage path
            storage_path = self._parse_cloud_storage_path(source)

            # Generate local temp file path
            temp_filename = f"cloud_download_{hash(source)}_{Path(storage_path).name}"
            temp_path = os.path.join(self.config.temp_dir, temp_filename)

            self.logger.info(f"Downloading from cloud storage: {source} -> {temp_path}")

            # Retrieve file from cloud storage
            file_data = await self.file_storage.retrieve(storage_path)

            # Save to local temp file
            if isinstance(file_data, bytes):
                with open(temp_path, "wb") as f:
                    f.write(file_data)
            elif isinstance(file_data, str):
                with open(temp_path, "w", encoding="utf-8") as f:
                    f.write(file_data)
            else:
                # Handle other data types (e.g., dict, list)
                import json

                with open(temp_path, "w", encoding="utf-8") as f:
                    json.dump(file_data, f)

            self.logger.info(f"Successfully downloaded file to: {temp_path}")
            return temp_path

        except Exception as e:
            raise DownloadError(f"Failed to download from cloud storage {source}: {str(e)}")

    def _parse_cloud_storage_path(self, source: str) -> str:
        """Parse cloud storage path to get the storage key"""
        try:
            if self._is_storage_id(source):
                # Direct storage ID
                return source
            elif self._is_cloud_storage_path(source):
                parsed = urlparse(source)
                if parsed.scheme == "gs":
                    # Google Cloud Storage: gs://bucket/path/file.pdf -> path/file.pdf
                    return parsed.path.lstrip("/")
                elif parsed.scheme == "s3":
                    # AWS S3: s3://bucket/path/file.pdf -> path/file.pdf
                    return parsed.path.lstrip("/")
                elif parsed.scheme == "azure":
                    # Azure Blob: azure://container/path/file.pdf -> path/file.pdf
                    return parsed.path.lstrip("/")
                elif parsed.scheme == "cloud":
                    # Generic cloud: cloud://path/file.pdf -> path/file.pdf
                    return parsed.path.lstrip("/")
                else:
                    return parsed.path.lstrip("/")
            else:
                # Assume it's already a storage path
                return source
        except Exception as e:
            self.logger.warning(f"Failed to parse cloud storage path {source}: {e}")
            return source

    def _download_sample(self, url: str, max_size: int = 1024) -> str:
        """Download a small sample of the document for analysis"""
        # This is a simplified version - in practice, you'd implement range requests
        return self._download_document(url)

    def _parse_by_type(self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
        """Parse document based on its type and strategy"""
        try:
            if doc_type == DocumentType.PDF:
                return self._parse_pdf(file_path, strategy)
            elif doc_type in [
                DocumentType.DOCX,
                DocumentType.XLSX,
                DocumentType.PPTX,
            ]:
                return self._parse_office_document(file_path, doc_type, strategy)
            elif doc_type == DocumentType.IMAGE:
                return self._parse_image(file_path, strategy)
            elif doc_type in [
                DocumentType.TXT,
                DocumentType.HTML,
                DocumentType.CSV,
                DocumentType.JSON,
                DocumentType.XML,
                DocumentType.MARKDOWN,
            ]:
                return self._parse_text_document(file_path, doc_type, strategy)
            else:
                raise UnsupportedDocumentError(f"Unsupported document type: {doc_type}")

        except Exception as e:
            raise ParseError(f"Failed to parse {doc_type} document: {str(e)}")

    def _parse_pdf(self, file_path: str, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
        """Parse PDF document"""
        if self.office_tool:
            try:
                text_content = self.office_tool.extract_text(file_path)

                if strategy == ParsingStrategy.TEXT_ONLY:
                    return text_content
                elif strategy == ParsingStrategy.STRUCTURED:
                    # Try to extract structure from PDF
                    return {
                        "text": text_content,
                        "structure": self._extract_pdf_structure(text_content),
                    }
                else:
                    return {
                        "text": text_content,
                        "pages": self._split_into_pages(text_content),
                    }
            except Exception as e:
                self.logger.warning(f"OfficeTool PDF parsing failed: {e}")

        # Fallback to simple text extraction
        return self._extract_text_fallback(file_path)

    def _parse_office_document(self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
        """Parse Office documents (DOCX, XLSX, PPTX)"""
        if not self.office_tool:
            raise UnsupportedDocumentError("OfficeTool not available for Office document parsing")

        try:
            text_content = self.office_tool.extract_text(file_path)

            if strategy == ParsingStrategy.TEXT_ONLY:
                return text_content
            elif strategy == ParsingStrategy.STRUCTURED:
                return {
                    "text": text_content,
                    "structure": self._extract_office_structure(file_path, doc_type),
                }
            else:
                return {"text": text_content, "raw_content": text_content}

        except Exception as e:
            raise ParseError(f"Failed to parse Office document: {str(e)}")

    def _parse_image(self, file_path: str, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
        """Parse image document using OCR"""
        if not self.image_tool:
            raise UnsupportedDocumentError("ImageTool not available for image OCR")

        try:
            # Use image tool for OCR
            ocr_result = self.image_tool.ocr_image(file_path)

            if strategy == ParsingStrategy.TEXT_ONLY:
                return ocr_result.get("text", "")
            else:
                return ocr_result

        except Exception as e:
            raise ParseError(f"Failed to parse image document: {str(e)}")

    def _parse_text_document(self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
        """Parse text-based documents"""
        try:
            with open(
                file_path,
                "r",
                encoding=self.config.default_encoding,
                errors="ignore",
            ) as f:
                content = f.read()

            if strategy == ParsingStrategy.TEXT_ONLY:
                return content
            elif strategy == ParsingStrategy.STRUCTURED:
                return self._extract_text_structure(content, doc_type)
            else:
                return {
                    "text": content,
                    "lines": content.split("\n"),
                    "word_count": len(content.split()),
                }

        except Exception as e:
            raise ParseError(f"Failed to parse text document: {str(e)}")

    def _extract_metadata(self, file_path: str, doc_type: DocumentType) -> Dict[str, Any]:
        """Extract metadata from document"""
        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "file_type": doc_type.value,
            "created_at": os.path.getctime(file_path),
            "modified_at": os.path.getmtime(file_path),
        }

        # Add type-specific metadata extraction here
        # This could leverage existing tools' metadata extraction capabilities

        return metadata

    def _calculate_content_stats(self, content: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate statistics about the parsed content"""
        if isinstance(content, str):
            return {
                "character_count": len(content),
                "word_count": len(content.split()),
                "line_count": len(content.split("\n")),
                "paragraph_count": len([p for p in content.split("\n\n") if p.strip()]),
            }
        else:
            # For structured content, calculate stats on text portion
            text_content = content.get("text", "")
            return self._calculate_content_stats(text_content)

    def _create_chunks(self, content: str, chunk_size: int) -> List[Dict[str, Any]]:
        """Create chunks from content for better AI processing"""
        chunks: List[Dict[str, Any]] = []
        words = content.split()

        for i in range(0, len(words), chunk_size):
            chunk_words = words[i : i + chunk_size]
            chunk_text = " ".join(chunk_words)

            chunks.append(
                {
                    "index": len(chunks),
                    "text": chunk_text,
                    "word_count": len(chunk_words),
                    "start_word": i,
                    "end_word": min(i + chunk_size, len(words)),
                }
            )

        return chunks

    def _format_as_text(self, result: Dict[str, Any]) -> str:
        """Format result as plain text"""
        content = result.get("content", "")
        if isinstance(content, dict):
            return content.get("text", str(content))
        return str(content)

    def _format_as_markdown(self, result: Dict[str, Any]) -> str:
        """Format result as Markdown"""
        content = result.get("content", "")
        result.get("metadata", {})

        md_content = f"# Document: {result.get('source', 'Unknown')}\n\n"
        md_content += f"**Type:** {result.get('document_type', 'Unknown')}\n"
        md_content += f"**Detection Confidence:** {result.get('detection_confidence', 0):.2f}\n\n"

        if isinstance(content, dict):
            md_content += content.get("text", str(content))
        else:
            md_content += str(content)

        return md_content

    def _format_as_html(self, result: Dict[str, Any]) -> str:
        """Format result as HTML"""
        content = result.get("content", "")

        html_content = f"""
        <html>
        <head><title>Parsed Document</title></head>
        <body>
        <h1>Document: {result.get('source', 'Unknown')}</h1>
        <p><strong>Type:</strong> {result.get('document_type', 'Unknown')}</p>
        <p><strong>Detection Confidence:</strong> {result.get('detection_confidence', 0):.2f}</p>
        <div class="content">
        """

        if isinstance(content, dict):
            html_content += f"<pre>{content.get('text', str(content))}</pre>"
        else:
            html_content += f"<pre>{str(content)}</pre>"

        html_content += "</div></body></html>"
        return html_content

    def _cleanup_temp_files(self, source: str):
        """Clean up temporary files"""
        import glob

        if self._is_url(source):
            # Clean up URL downloaded files
            temp_pattern = os.path.join(self.config.temp_dir, f"download_{hash(source)}_*")
            for temp_file in glob.glob(temp_pattern):
                try:
                    os.remove(temp_file)
                    self.logger.debug(f"Cleaned up temp file: {temp_file}")
                except Exception as e:
                    self.logger.warning(f"Failed to clean up temp file {temp_file}: {e}")

        elif self._is_cloud_storage_path(source) or self._is_storage_id(source):
            # Clean up cloud storage downloaded files
            temp_pattern = os.path.join(self.config.temp_dir, f"cloud_download_{hash(source)}_*")
            for temp_file in glob.glob(temp_pattern):
                try:
                    os.remove(temp_file)
                    self.logger.debug(f"Cleaned up cloud temp file: {temp_file}")
                except Exception as e:
                    self.logger.warning(f"Failed to clean up cloud temp file {temp_file}: {e}")

    # Helper methods for structure extraction
    def _extract_pdf_structure(self, text: str) -> Dict[str, Any]:
        """Extract structure from PDF text"""
        # Implement PDF structure extraction logic
        return {"sections": [], "headings": []}

    def _extract_office_structure(self, file_path: str, doc_type: DocumentType) -> Dict[str, Any]:
        """Extract structure from Office documents"""
        # Implement Office document structure extraction
        return {"sections": [], "tables": [], "images": []}

    def _extract_text_structure(self, content: str, doc_type: DocumentType) -> Dict[str, Any]:
        """Extract structure from text documents"""
        result: Dict[str, Any] = {"text": content}

        if doc_type == DocumentType.MARKDOWN:
            # Extract markdown structure
            headings = re.findall(r"^(#{1,6})\s+(.+)$", content, re.MULTILINE)
            result["headings"] = [{"level": len(h[0]), "text": h[1]} for h in headings]
        elif doc_type == DocumentType.HTML:
            # Extract HTML structure (simplified)
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(content, "html.parser")
            result["title"] = soup.title.string if soup.title else ""
            result["headings"] = [{"tag": h.name, "text": h.get_text()} for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
        elif doc_type == DocumentType.JSON:
            import json

            try:
                result["json_data"] = json.loads(content)
            except Exception:
                pass

        return result

    def _split_into_pages(self, text: str) -> List[str]:
        """Split text into pages (simplified)"""
        # This is a simple implementation - could be enhanced
        # Form feed character often indicates page break
        pages = text.split("\f")
        return [page.strip() for page in pages if page.strip()]

    def _extract_text_fallback(self, file_path: str) -> str:
        """Fallback text extraction method"""
        try:
            with open(
                file_path,
                "r",
                encoding=self.config.default_encoding,
                errors="ignore",
            ) as f:
                return f.read()
        except Exception:
            with open(file_path, "rb") as f:
                return f.read().decode("utf-8", errors="ignore")