aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/docs/document_parser_tool.py
@@ -0,0 +1,994 @@
+ import os
+ import re
+ import logging
+ import asyncio
+ from typing import Dict, Any, List, Optional, Union, Tuple
+ from enum import Enum
+ from urllib.parse import urlparse
+ from pathlib import Path
+ import tempfile
+
+ from pydantic import BaseModel, Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools import register_tool
+
+
+ class DocumentType(str, Enum):
+     """Supported document types for parsing"""
+
+     PDF = "pdf"
+     DOCX = "docx"
+     XLSX = "xlsx"
+     PPTX = "pptx"
+     TXT = "txt"
+     HTML = "html"
+     RTF = "rtf"
+     CSV = "csv"
+     JSON = "json"
+     XML = "xml"
+     MARKDOWN = "md"
+     IMAGE = "image"
+     UNKNOWN = "unknown"
+
+
+ class ParsingStrategy(str, Enum):
+     """Document parsing strategies"""
+
+     TEXT_ONLY = "text_only"
+     STRUCTURED = "structured"
+     FULL_CONTENT = "full_content"
+     METADATA_ONLY = "metadata_only"
+
+
+ class OutputFormat(str, Enum):
+     """Output formats for parsed content"""
+
+     TEXT = "text"
+     JSON = "json"
+     MARKDOWN = "markdown"
+     HTML = "html"
+
+
+ class DocumentParserError(Exception):
+     """Base exception for document parser errors"""
+
+
+ class UnsupportedDocumentError(DocumentParserError):
+     """Raised when document type is not supported"""
+
+
+ class DownloadError(DocumentParserError):
+     """Raised when document download fails"""
+
+
+ class ParseError(DocumentParserError):
+     """Raised when document parsing fails"""
+
+
+ @register_tool("document_parser")
+ class DocumentParserTool(BaseTool):
+     """
+     Modern high-performance document parsing component that can:
+     1. Auto-detect document types from URLs or files
+     2. Download documents from URLs
+     3. Parse various document formats using existing atomic tools
+     4. Output structured content for AI consumption
+
+     Leverages existing tools:
+     - ScraperTool for URL downloading
+     - OfficeTool for Office document parsing
+     - ImageTool for image OCR
+     """
+
+     # Configuration schema
+     class Config(BaseSettings):
+         """Configuration for the document parser tool
+
+         Automatically reads from environment variables with DOC_PARSER_ prefix.
+         Example: DOC_PARSER_GCS_PROJECT_ID -> gcs_project_id
+         """
+
+         model_config = SettingsConfigDict(env_prefix="DOC_PARSER_")
+
+         user_agent: str = Field(
+             default="DocumentParser/1.0",
+             description="User agent for HTTP requests",
+         )
+         max_file_size: int = Field(
+             default=50 * 1024 * 1024, description="Maximum file size in bytes"
+         )
+         temp_dir: str = Field(
+             default=os.path.join(tempfile.gettempdir(), "document_parser"),
+             description="Temporary directory for document processing",
+         )
+         default_encoding: str = Field(
+             default="utf-8", description="Default encoding for text files"
+         )
+         timeout: int = Field(default=30, description="Timeout for HTTP requests in seconds")
+         max_pages: int = Field(
+             default=1000,
+             description="Maximum number of pages to process for large documents",
+         )
+         enable_cloud_storage: bool = Field(
+             default=True,
+             description="Whether to enable cloud storage integration",
+         )
+         gcs_bucket_name: str = Field(
+             default="aiecs-documents",
+             description="Google Cloud Storage bucket name",
+         )
+         gcs_project_id: Optional[str] = Field(
+             default=None, description="Google Cloud Storage project ID"
+         )
+
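# Usage sketch (illustrative, not from the package): the Config class above
# reads the DOC_PARSER_ environment prefix, and explicit dict values override
# the environment. All values below are hypothetical.
#
#     import os
#     os.environ["DOC_PARSER_MAX_FILE_SIZE"] = str(10 * 1024 * 1024)
#     os.environ["DOC_PARSER_GCS_PROJECT_ID"] = "my-gcp-project"
#
#     tool = DocumentParserTool()                 # picks up the env vars
#     tool = DocumentParserTool({"timeout": 60})  # dict value wins over env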
+     def __init__(self, config: Optional[Dict] = None):
+         """Initialize DocumentParserTool with settings"""
+         super().__init__(config)
+
+         # Parse configuration with BaseSettings
+         # BaseSettings automatically reads from environment variables with DOC_PARSER_ prefix
+         # Config dict values override environment variables
+         if config:
+             # Filter out None values to allow env vars to be used for missing keys
+             filtered_config = {k: v for k, v in config.items() if v is not None}
+             self.config = self.Config(**filtered_config)
+         else:
+             # No config provided, read entirely from environment variables
+             self.config = self.Config()
+
+         self.logger = logging.getLogger(__name__)
+         os.makedirs(self.config.temp_dir, exist_ok=True)
+
+         # Initialize dependent tools
+         self._init_dependent_tools()
+
+         # Initialize cloud storage
+         self._init_cloud_storage()
+
+     def _init_dependent_tools(self):
+         """Initialize dependent tools for document processing"""
+         try:
+             from aiecs.tools.task_tools.scraper_tool import ScraperTool
+
+             self.scraper_tool = ScraperTool()
+         except ImportError:
+             self.logger.warning("ScraperTool not available")
+             self.scraper_tool = None
+
+         try:
+             from aiecs.tools.task_tools.office_tool import OfficeTool
+
+             self.office_tool = OfficeTool()
+         except ImportError:
+             self.logger.warning("OfficeTool not available")
+             self.office_tool = None
+
+         try:
+             from aiecs.tools.task_tools.image_tool import ImageTool
+
+             self.image_tool = ImageTool()
+         except ImportError:
+             self.logger.warning("ImageTool not available")
+             self.image_tool = None
+
+     def _init_cloud_storage(self):
+         """Initialize cloud storage for document retrieval"""
+         self.file_storage = None
+
+         if self.config.enable_cloud_storage:
+             try:
+                 from aiecs.infrastructure.persistence.file_storage import (
+                     FileStorage,
+                 )
+
+                 storage_config = {
+                     "gcs_bucket_name": self.config.gcs_bucket_name,
+                     "gcs_project_id": self.config.gcs_project_id,
+                     "enable_local_fallback": True,
+                     "local_storage_path": self.config.temp_dir,
+                 }
+
+                 self.file_storage = FileStorage(storage_config)
+                 asyncio.create_task(self._init_storage_async())
+
+             except ImportError:
+                 self.logger.warning("FileStorage not available, cloud storage disabled")
+             except Exception as e:
+                 self.logger.warning(f"Failed to initialize cloud storage: {e}")
+
+     async def _init_storage_async(self):
+         """Async initialization of file storage"""
+         try:
+             if self.file_storage:
+                 await self.file_storage.initialize()
+                 self.logger.info("Cloud storage initialized successfully")
+         except Exception as e:
+             self.logger.warning(f"Cloud storage initialization failed: {e}")
+             self.file_storage = None
+
+     # Schema definitions
+     class ParseDocumentSchema(BaseModel):
+         """Schema for parse_document operation"""
+
+         source: str = Field(description="URL or file path to the document")
+         strategy: ParsingStrategy = Field(
+             default=ParsingStrategy.FULL_CONTENT,
+             description="Parsing strategy",
+         )
+         output_format: OutputFormat = Field(default=OutputFormat.JSON, description="Output format")
+         force_type: Optional[DocumentType] = Field(
+             default=None, description="Force document type detection"
+         )
+         extract_metadata: bool = Field(default=True, description="Whether to extract metadata")
+         chunk_size: Optional[int] = Field(
+             default=None, description="Chunk size for large documents"
+         )
+
+     class DetectTypeSchema(BaseModel):
+         """Schema for detect_document_type operation"""
+
+         source: str = Field(description="URL or file path to analyze")
+         download_sample: bool = Field(
+             default=True,
+             description="Download sample for content-based detection",
+         )
+
+     def detect_document_type(self, source: str, download_sample: bool = True) -> Dict[str, Any]:
+         """
+         Detect document type from URL or file path
+
+         Args:
+             source: URL or file path
+             download_sample: Whether to download sample for content analysis
+
+         Returns:
+             Dict containing detected type and confidence
+         """
+         try:
+             result = {
+                 "source": source,
+                 "is_url": self._is_url(source),
+                 "detected_type": DocumentType.UNKNOWN,
+                 "confidence": 0.0,
+                 "mime_type": None,
+                 "file_extension": None,
+                 "file_size": None,
+                 "detection_methods": [],
+             }
+
+             # Method 1: File extension analysis
+             extension_type, ext_confidence = self._detect_by_extension(source)
+             if extension_type != DocumentType.UNKNOWN:
+                 result["detected_type"] = extension_type
+                 result["confidence"] = ext_confidence
+                 result["file_extension"] = Path(source).suffix.lower()
+                 result["detection_methods"].append("file_extension")
+
+             # Method 2: MIME type detection (for URLs)
+             if self._is_url(source) and download_sample:
+                 mime_type, mime_confidence = self._detect_by_mime_type(source)
+                 if mime_type != DocumentType.UNKNOWN and mime_confidence > result["confidence"]:
+                     result["detected_type"] = mime_type
+                     result["confidence"] = mime_confidence
+                     result["detection_methods"].append("mime_type")
+
+             # Method 3: Content-based detection
+             if download_sample:
+                 content_type, content_confidence = self._detect_by_content(source)
+                 if (
+                     content_type != DocumentType.UNKNOWN
+                     and content_confidence > result["confidence"]
+                 ):
+                     result["detected_type"] = content_type
+                     result["confidence"] = content_confidence
+                     result["detection_methods"].append("content_analysis")
+
+             return result
+
+         except Exception as e:
+             raise DocumentParserError(f"Document type detection failed: {str(e)}")
+
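# Usage sketch (illustrative, not from the package): extension-only detection
# on a local file; the path is hypothetical.
#
#     tool = DocumentParserTool()
#     info = tool.detect_document_type("/tmp/report.pdf", download_sample=False)
#     info["detected_type"]       # DocumentType.PDF
#     info["confidence"]          # 0.8 (extension match)
#     info["detection_methods"]   # ['file_extension']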
+     def parse_document(
+         self,
+         source: str,
+         strategy: ParsingStrategy = ParsingStrategy.FULL_CONTENT,
+         output_format: OutputFormat = OutputFormat.JSON,
+         force_type: Optional[DocumentType] = None,
+         extract_metadata: bool = True,
+         chunk_size: Optional[int] = None,
+     ) -> Dict[str, Any]:
+         """
+         Parse document from URL or file path
+
+         Args:
+             source: URL or file path to document
+             strategy: Parsing strategy to use
+             output_format: Format for output content
+             force_type: Force specific document type
+             extract_metadata: Whether to extract metadata
+             chunk_size: Chunk size for large documents
+
+         Returns:
+             Dict containing parsed content and metadata
+         """
+         try:
+             # Step 1: Detect document type
+             if force_type:
+                 doc_type = force_type
+                 confidence = 1.0
+             else:
+                 detection_result = self.detect_document_type(source)
+                 doc_type = detection_result["detected_type"]
+                 confidence = detection_result["confidence"]
+
+                 if confidence < 0.5:
+                     raise UnsupportedDocumentError(
+                         f"Unable to reliably detect document type for: {source}"
+                     )
+
+             # Step 2: Download document if it's a URL
+             local_path = self._ensure_local_file(source)
+
+             # Step 3: Parse document based on type and strategy
+             content = self._parse_by_type(local_path, doc_type, strategy)
+
+             # Step 4: Extract metadata if requested
+             metadata = {}
+             if extract_metadata:
+                 metadata = self._extract_metadata(local_path, doc_type)
+
+             # Step 5: Format output
+             result = {
+                 "source": source,
+                 "document_type": doc_type,
+                 "detection_confidence": confidence,
+                 "parsing_strategy": strategy,
+                 "metadata": metadata,
+                 "content": content,
+                 "content_stats": self._calculate_content_stats(content),
+                 "chunks": [],
+             }
+
+             # Step 6: Create chunks if requested
+             if chunk_size and isinstance(content, str):
+                 result["chunks"] = self._create_chunks(content, chunk_size)
+
+             # Step 7: Format output according to requested format
+             if output_format == OutputFormat.TEXT:
+                 return {"text": self._format_as_text(result)}
+             elif output_format == OutputFormat.MARKDOWN:
+                 return {"markdown": self._format_as_markdown(result)}
+             elif output_format == OutputFormat.HTML:
+                 return {"html": self._format_as_html(result)}
+             else:
+                 return result
+
+         except Exception as e:
+             if isinstance(e, DocumentParserError):
+                 raise
+             raise ParseError(f"Document parsing failed: {str(e)}")
+         finally:
+             # Cleanup temporary files
+             self._cleanup_temp_files(source)
+
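# Usage sketch (illustrative, not from the package): parsing a remote PDF as
# plain text with 500-word chunks; the URL is hypothetical.
#
#     tool = DocumentParserTool()
#     result = tool.parse_document(
#         "https://example.com/whitepaper.pdf",
#         strategy=ParsingStrategy.TEXT_ONLY,
#         chunk_size=500,
#     )
#     result["document_type"]                # DocumentType.PDF
#     result["content_stats"]["word_count"]  # total words extracted
#     len(result["chunks"])                  # one chunk per 500 words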
+     async def parse_document_async(
+         self,
+         source: str,
+         strategy: ParsingStrategy = ParsingStrategy.FULL_CONTENT,
+         output_format: OutputFormat = OutputFormat.JSON,
+         force_type: Optional[DocumentType] = None,
+         extract_metadata: bool = True,
+         chunk_size: Optional[int] = None,
+     ) -> Dict[str, Any]:
+         """Async version of parse_document"""
+         return await asyncio.to_thread(
+             self.parse_document,
+             source=source,
+             strategy=strategy,
+             output_format=output_format,
+             force_type=force_type,
+             extract_metadata=extract_metadata,
+             chunk_size=chunk_size,
+         )
+
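# Usage sketch (illustrative, not from the package): because
# parse_document_async delegates to asyncio.to_thread, several documents can
# be parsed concurrently; the sources are hypothetical.
#
#     import asyncio
#
#     async def parse_all(sources):
#         tool = DocumentParserTool()
#         return await asyncio.gather(
#             *(tool.parse_document_async(s) for s in sources)
#         )
#
#     results = asyncio.run(parse_all(["a.pdf", "b.docx", "c.md"]))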
+     def _is_url(self, source: str) -> bool:
+         """Check if source is a URL"""
+         try:
+             result = urlparse(source)
+             return bool(result.scheme and result.netloc)
+         except Exception:
+             return False
+
+     def _is_cloud_storage_path(self, source: str) -> bool:
+         """Check if source is a cloud storage path"""
+         # Support various cloud storage path formats:
+         # - gs://bucket/path/file.pdf (Google Cloud Storage)
+         # - s3://bucket/path/file.pdf (AWS S3)
+         # - azure://container/path/file.pdf (Azure Blob Storage)
+         # - cloud://path/file.pdf (Generic cloud storage)
+         cloud_schemes = ["gs", "s3", "azure", "cloud"]
+         try:
+             parsed = urlparse(source)
+             return parsed.scheme in cloud_schemes
+         except Exception:
+             return False
+
+     def _is_storage_id(self, source: str) -> bool:
+         """Check if source is a storage ID (UUID-like identifier)"""
+         # Check for UUID patterns or other storage ID formats
+         uuid_pattern = r"^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$"
+         storage_id_pattern = r"^[a-zA-Z0-9_-]{10,}$"  # Generic storage ID
+
+         return bool(
+             re.match(uuid_pattern, source, re.IGNORECASE) or re.match(storage_id_pattern, source)
+         )
+
+     def _detect_by_extension(self, source: str) -> Tuple[DocumentType, float]:
+         """Detect document type by file extension"""
+         try:
+             path = Path(source)
+             ext = path.suffix.lower()
+
+             extension_map = {
+                 ".pdf": DocumentType.PDF,
+                 ".docx": DocumentType.DOCX,
+                 ".doc": DocumentType.DOCX,
+                 ".xlsx": DocumentType.XLSX,
+                 ".xls": DocumentType.XLSX,
+                 ".pptx": DocumentType.PPTX,
+                 ".ppt": DocumentType.PPTX,
+                 ".txt": DocumentType.TXT,
+                 ".html": DocumentType.HTML,
+                 ".htm": DocumentType.HTML,
+                 ".rtf": DocumentType.RTF,
+                 ".csv": DocumentType.CSV,
+                 ".json": DocumentType.JSON,
+                 ".xml": DocumentType.XML,
+                 ".md": DocumentType.MARKDOWN,
+                 ".markdown": DocumentType.MARKDOWN,
+                 ".jpg": DocumentType.IMAGE,
+                 ".jpeg": DocumentType.IMAGE,
+                 ".png": DocumentType.IMAGE,
+                 ".gif": DocumentType.IMAGE,
+                 ".bmp": DocumentType.IMAGE,
+                 ".tiff": DocumentType.IMAGE,
+             }
+
+             doc_type = extension_map.get(ext, DocumentType.UNKNOWN)
+             confidence = 0.8 if doc_type != DocumentType.UNKNOWN else 0.0
+
+             return doc_type, confidence
+
+         except Exception:
+             return DocumentType.UNKNOWN, 0.0
+
+     def _detect_by_mime_type(self, url: str) -> Tuple[DocumentType, float]:
+         """Detect document type by MIME type from URL"""
+         try:
+             if not self.scraper_tool:
+                 return DocumentType.UNKNOWN, 0.0
+
+             # Get headers only
+             response = asyncio.run(
+                 self.scraper_tool.get_httpx(url, method="HEAD", verify_ssl=False)
+             )
+
+             content_type = response.get("headers", {}).get("content-type", "").lower()
+
+             mime_map = {
+                 "application/pdf": DocumentType.PDF,
+                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocumentType.DOCX,
+                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": DocumentType.XLSX,
+                 "application/vnd.openxmlformats-officedocument.presentationml.presentation": DocumentType.PPTX,
+                 "text/plain": DocumentType.TXT,
+                 "text/html": DocumentType.HTML,
+                 "application/rtf": DocumentType.RTF,
+                 "text/csv": DocumentType.CSV,
+                 "application/json": DocumentType.JSON,
+                 "application/xml": DocumentType.XML,
+                 "text/xml": DocumentType.XML,
+                 "text/markdown": DocumentType.MARKDOWN,
+                 "image/jpeg": DocumentType.IMAGE,
+                 "image/png": DocumentType.IMAGE,
+                 "image/gif": DocumentType.IMAGE,
+                 "image/bmp": DocumentType.IMAGE,
+                 "image/tiff": DocumentType.IMAGE,
+             }
+
+             for mime_pattern, doc_type in mime_map.items():
+                 if mime_pattern in content_type:
+                     return doc_type, 0.9
+
+             return DocumentType.UNKNOWN, 0.0
+
+         except Exception:
+             return DocumentType.UNKNOWN, 0.0
+
+     def _detect_by_content(self, source: str) -> Tuple[DocumentType, float]:
+         """Detect document type by content analysis"""
+         try:
+             # Download a small sample for analysis
+             if self._is_url(source):
+                 sample_path = self._download_sample(source, max_size=1024)  # 1KB sample
+             else:
+                 sample_path = source
+
+             with open(sample_path, "rb") as f:
+                 header = f.read(512)  # Read first 512 bytes
+
+             # Magic number detection
+             if header.startswith(b"%PDF"):
+                 return DocumentType.PDF, 0.95
+             elif header.startswith(b"PK\x03\x04"):  # ZIP-based formats
+                 if b"word/" in header or b"document.xml" in header:
+                     return DocumentType.DOCX, 0.9
+                 elif b"xl/" in header or b"workbook.xml" in header:
+                     return DocumentType.XLSX, 0.9
+                 elif b"ppt/" in header or b"presentation.xml" in header:
+                     return DocumentType.PPTX, 0.9
+             elif header.startswith(b"{\\rtf"):
+                 return DocumentType.RTF, 0.95
+             elif header.startswith((b"\xff\xd8\xff", b"\x89PNG", b"GIF8")):
+                 return DocumentType.IMAGE, 0.95
+             elif header.startswith(b"<?xml"):
+                 return DocumentType.XML, 0.9
+             elif header.startswith((b"{", b"[")):
+                 # Try to parse as JSON
+                 try:
+                     import json
+
+                     json.loads(header.decode("utf-8", errors="ignore"))
+                     return DocumentType.JSON, 0.85
+                 except Exception:
+                     pass
+
+             # Text-based detection
+             try:
+                 text_content = header.decode("utf-8", errors="ignore")
+                 if re.match(r"^#\s+.*$", text_content, re.MULTILINE):
+                     return DocumentType.MARKDOWN, 0.7
+                 elif "<html" in text_content.lower() or "<!doctype html" in text_content.lower():
+                     return DocumentType.HTML, 0.85
+                 elif "," in text_content and "\n" in text_content:
+                     # Simple CSV detection
+                     lines = text_content.split("\n")[:5]
+                     if all("," in line for line in lines if line.strip()):
+                         return DocumentType.CSV, 0.6
+             except Exception:
+                 pass
+
+             return DocumentType.UNKNOWN, 0.0
+
+         except Exception:
+             return DocumentType.UNKNOWN, 0.0
+
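# Illustration (not from the package): the magic numbers this sniffer keys on,
# checked against hand-built headers.
#
#     assert b"%PDF-1.7 ...".startswith(b"%PDF")          # PDF
#     assert b"PK\x03\x04...".startswith(b"PK\x03\x04")   # ZIP container (docx/xlsx/pptx)
#     assert b"{\\rtf1\\ansi ...".startswith(b"{\\rtf")   # RTF
#     assert b"\x89PNG\r\n\x1a\n".startswith(b"\x89PNG")  # PNG image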
+     def _ensure_local_file(self, source: str) -> str:
+         """Ensure we have a local file, download/retrieve if necessary"""
+         # Check source type and handle accordingly
+         if self._is_cloud_storage_path(source) or self._is_storage_id(source):
+             # Download from cloud storage
+             return asyncio.run(self._download_from_cloud_storage(source))
+         elif self._is_url(source):
+             # Download from URL
+             return self._download_document(source)
+         else:
+             # Local file path
+             if not os.path.exists(source):
+                 raise FileNotFoundError(f"File not found: {source}")
+             return source
+
+     def _download_document(self, url: str) -> str:
+         """Download document from URL"""
+         try:
+             if not self.scraper_tool:
+                 raise DownloadError("ScraperTool not available for URL download")
+
+             # Generate temp file path
+             parsed_url = urlparse(url)
+             filename = os.path.basename(parsed_url.path) or "document"
+             temp_path = os.path.join(self.config.temp_dir, f"download_{hash(url)}_{filename}")
+
+             # Download using scraper tool
+             result = asyncio.run(
+                 self.scraper_tool.get_httpx(
+                     url,
+                     content_type="binary",
+                     output_path=temp_path,
+                     verify_ssl=False,
+                 )
+             )
+
+             if isinstance(result, dict) and "saved_to" in result:
+                 return result["saved_to"]
+             else:
+                 # Fallback: save content manually
+                 with open(temp_path, "wb") as f:
+                     if isinstance(result, dict) and "content" in result:
+                         f.write(result["content"])
+                     else:
+                         f.write(result)
+                 return temp_path
+
+         except Exception as e:
+             raise DownloadError(f"Failed to download document from {url}: {str(e)}")
+
+     async def _download_from_cloud_storage(self, source: str) -> str:
+         """Download document from cloud storage"""
+         if not self.file_storage:
+             raise DownloadError("Cloud storage not available")
+
+         try:
+             # Parse the cloud storage path
+             storage_path = self._parse_cloud_storage_path(source)
+
+             # Generate local temp file path
+             temp_filename = f"cloud_download_{hash(source)}_{Path(storage_path).name}"
+             temp_path = os.path.join(self.config.temp_dir, temp_filename)
+
+             self.logger.info(f"Downloading from cloud storage: {source} -> {temp_path}")
+
+             # Retrieve file from cloud storage
+             file_data = await self.file_storage.retrieve(storage_path)
+
+             # Save to local temp file
+             if isinstance(file_data, bytes):
+                 with open(temp_path, "wb") as f:
+                     f.write(file_data)
+             elif isinstance(file_data, str):
+                 with open(temp_path, "w", encoding="utf-8") as f:
+                     f.write(file_data)
+             else:
+                 # Handle other data types (e.g., dict, list)
+                 import json
+
+                 with open(temp_path, "w", encoding="utf-8") as f:
+                     json.dump(file_data, f)
+
+             self.logger.info(f"Successfully downloaded file to: {temp_path}")
+             return temp_path
+
+         except Exception as e:
+             raise DownloadError(f"Failed to download from cloud storage {source}: {str(e)}")
+
+     def _parse_cloud_storage_path(self, source: str) -> str:
+         """Parse cloud storage path to get the storage key"""
+         try:
+             if self._is_storage_id(source):
+                 # Direct storage ID
+                 return source
+             elif self._is_cloud_storage_path(source):
+                 parsed = urlparse(source)
+                 if parsed.scheme == "gs":
+                     # Google Cloud Storage: gs://bucket/path/file.pdf -> path/file.pdf
+                     return parsed.path.lstrip("/")
+                 elif parsed.scheme == "s3":
+                     # AWS S3: s3://bucket/path/file.pdf -> path/file.pdf
+                     return parsed.path.lstrip("/")
+                 elif parsed.scheme == "azure":
+                     # Azure Blob: azure://container/path/file.pdf -> path/file.pdf
+                     return parsed.path.lstrip("/")
+                 elif parsed.scheme == "cloud":
+                     # Generic cloud: cloud://path/file.pdf -> path/file.pdf
+                     return parsed.path.lstrip("/")
+                 else:
+                     return parsed.path.lstrip("/")
+             else:
+                 # Assume it's already a storage path
+                 return source
+         except Exception as e:
+             self.logger.warning(f"Failed to parse cloud storage path {source}: {e}")
+             return source
+
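# Illustration (not from the package): how a cloud URI maps to a storage key
# in _parse_cloud_storage_path; the bucket and path are hypothetical.
#
#     from urllib.parse import urlparse
#
#     parsed = urlparse("gs://my-bucket/reports/2024/q1.pdf")
#     parsed.scheme            # 'gs'
#     parsed.netloc            # 'my-bucket' (the bucket name is dropped)
#     parsed.path.lstrip("/")  # 'reports/2024/q1.pdf' -> key passed to retrieve()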
+     def _download_sample(self, url: str, max_size: int = 1024) -> str:
+         """Download a small sample of the document for analysis"""
+         # This is a simplified version - in practice, you'd implement range requests
+         return self._download_document(url)
+
+     def _parse_by_type(
+         self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy
+     ) -> Union[str, Dict[str, Any]]:
+         """Parse document based on its type and strategy"""
+         try:
+             if doc_type == DocumentType.PDF:
+                 return self._parse_pdf(file_path, strategy)
+             elif doc_type in [
+                 DocumentType.DOCX,
+                 DocumentType.XLSX,
+                 DocumentType.PPTX,
+             ]:
+                 return self._parse_office_document(file_path, doc_type, strategy)
+             elif doc_type == DocumentType.IMAGE:
+                 return self._parse_image(file_path, strategy)
+             elif doc_type in [
+                 DocumentType.TXT,
+                 DocumentType.HTML,
+                 DocumentType.CSV,
+                 DocumentType.JSON,
+                 DocumentType.XML,
+                 DocumentType.MARKDOWN,
+             ]:
+                 return self._parse_text_document(file_path, doc_type, strategy)
+             else:
+                 raise UnsupportedDocumentError(f"Unsupported document type: {doc_type}")
+
+         except Exception as e:
+             raise ParseError(f"Failed to parse {doc_type} document: {str(e)}")
+
+     def _parse_pdf(self, file_path: str, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
+         """Parse PDF document"""
+         if self.office_tool:
+             try:
+                 text_content = self.office_tool.extract_text(file_path)
+
+                 if strategy == ParsingStrategy.TEXT_ONLY:
+                     return text_content
+                 elif strategy == ParsingStrategy.STRUCTURED:
+                     # Try to extract structure from PDF
+                     return {
+                         "text": text_content,
+                         "structure": self._extract_pdf_structure(text_content),
+                     }
+                 else:
+                     return {
+                         "text": text_content,
+                         "pages": self._split_into_pages(text_content),
+                     }
+             except Exception as e:
+                 self.logger.warning(f"OfficeTool PDF parsing failed: {e}")
+
+         # Fallback to simple text extraction
+         return self._extract_text_fallback(file_path)
+
+     def _parse_office_document(
+         self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy
+     ) -> Union[str, Dict[str, Any]]:
+         """Parse Office documents (DOCX, XLSX, PPTX)"""
+         if not self.office_tool:
+             raise UnsupportedDocumentError("OfficeTool not available for Office document parsing")
+
+         try:
+             text_content = self.office_tool.extract_text(file_path)
+
+             if strategy == ParsingStrategy.TEXT_ONLY:
+                 return text_content
+             elif strategy == ParsingStrategy.STRUCTURED:
+                 return {
+                     "text": text_content,
+                     "structure": self._extract_office_structure(file_path, doc_type),
+                 }
+             else:
+                 return {"text": text_content, "raw_content": text_content}
+
+         except Exception as e:
+             raise ParseError(f"Failed to parse Office document: {str(e)}")
+
+     def _parse_image(self, file_path: str, strategy: ParsingStrategy) -> Union[str, Dict[str, Any]]:
+         """Parse image document using OCR"""
+         if not self.image_tool:
+             raise UnsupportedDocumentError("ImageTool not available for image OCR")
+
+         try:
+             # Use image tool for OCR
+             ocr_result = self.image_tool.ocr_image(file_path)
+
+             if strategy == ParsingStrategy.TEXT_ONLY:
+                 return ocr_result.get("text", "")
+             else:
+                 return ocr_result
+
+         except Exception as e:
+             raise ParseError(f"Failed to parse image document: {str(e)}")
+
+     def _parse_text_document(
+         self, file_path: str, doc_type: DocumentType, strategy: ParsingStrategy
+     ) -> Union[str, Dict[str, Any]]:
+         """Parse text-based documents"""
+         try:
+             with open(
+                 file_path,
+                 "r",
+                 encoding=self.config.default_encoding,
+                 errors="ignore",
+             ) as f:
+                 content = f.read()
+
+             if strategy == ParsingStrategy.TEXT_ONLY:
+                 return content
+             elif strategy == ParsingStrategy.STRUCTURED:
+                 return self._extract_text_structure(content, doc_type)
+             else:
+                 return {
+                     "text": content,
+                     "lines": content.split("\n"),
+                     "word_count": len(content.split()),
+                 }
+
+         except Exception as e:
+             raise ParseError(f"Failed to parse text document: {str(e)}")
+
+     def _extract_metadata(self, file_path: str, doc_type: DocumentType) -> Dict[str, Any]:
+         """Extract metadata from document"""
+         metadata = {
+             "file_path": file_path,
+             "file_size": os.path.getsize(file_path),
+             "file_type": doc_type.value,
+             "created_at": os.path.getctime(file_path),
+             "modified_at": os.path.getmtime(file_path),
+         }
+
+         # Add type-specific metadata extraction here
+         # This could leverage existing tools' metadata extraction capabilities
+
+         return metadata
+
+     def _calculate_content_stats(self, content: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
+         """Calculate statistics about the parsed content"""
+         if isinstance(content, str):
+             return {
+                 "character_count": len(content),
+                 "word_count": len(content.split()),
+                 "line_count": len(content.split("\n")),
+                 "paragraph_count": len([p for p in content.split("\n\n") if p.strip()]),
+             }
+         else:
+             # For structured content, calculate stats on text portion
+             text_content = content.get("text", "")
+             return self._calculate_content_stats(text_content)
+
+     def _create_chunks(self, content: str, chunk_size: int) -> List[Dict[str, Any]]:
+         """Create chunks from content for better AI processing"""
+         chunks = []
+         words = content.split()
+
+         for i in range(0, len(words), chunk_size):
+             chunk_words = words[i : i + chunk_size]
+             chunk_text = " ".join(chunk_words)
+
+             chunks.append(
+                 {
+                     "index": len(chunks),
+                     "text": chunk_text,
+                     "word_count": len(chunk_words),
+                     "start_word": i,
+                     "end_word": min(i + chunk_size, len(words)),
+                 }
+             )
+
+         return chunks
+
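# Illustration (not from the package): chunk_size counts words, not
# characters, because _create_chunks splits on whitespace.
#
#     text = " ".join(f"w{i}" for i in range(10))
#     chunks = DocumentParserTool()._create_chunks(text, chunk_size=4)
#     [(c["start_word"], c["end_word"]) for c in chunks]  # [(0, 4), (4, 8), (8, 10)]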
+     def _format_as_text(self, result: Dict[str, Any]) -> str:
+         """Format result as plain text"""
+         content = result.get("content", "")
+         if isinstance(content, dict):
+             return content.get("text", str(content))
+         return str(content)
+
+     def _format_as_markdown(self, result: Dict[str, Any]) -> str:
+         """Format result as Markdown"""
+         content = result.get("content", "")
+
+         md_content = f"# Document: {result.get('source', 'Unknown')}\n\n"
+         md_content += f"**Type:** {result.get('document_type', 'Unknown')}\n"
+         md_content += f"**Detection Confidence:** {result.get('detection_confidence', 0):.2f}\n\n"
+
+         if isinstance(content, dict):
+             md_content += content.get("text", str(content))
+         else:
+             md_content += str(content)
+
+         return md_content
+
+     def _format_as_html(self, result: Dict[str, Any]) -> str:
+         """Format result as HTML"""
+         content = result.get("content", "")
+
+         html_content = f"""
+         <html>
+         <head><title>Parsed Document</title></head>
+         <body>
+         <h1>Document: {result.get('source', 'Unknown')}</h1>
+         <p><strong>Type:</strong> {result.get('document_type', 'Unknown')}</p>
+         <p><strong>Detection Confidence:</strong> {result.get('detection_confidence', 0):.2f}</p>
+         <div class="content">
+         """
+
+         if isinstance(content, dict):
+             html_content += f"<pre>{content.get('text', str(content))}</pre>"
+         else:
+             html_content += f"<pre>{str(content)}</pre>"
+
+         html_content += "</div></body></html>"
+         return html_content
+
+     def _cleanup_temp_files(self, source: str):
+         """Clean up temporary files"""
+         import glob
+
+         if self._is_url(source):
+             # Clean up URL downloaded files
+             temp_pattern = os.path.join(self.config.temp_dir, f"download_{hash(source)}_*")
+             for temp_file in glob.glob(temp_pattern):
+                 try:
+                     os.remove(temp_file)
+                     self.logger.debug(f"Cleaned up temp file: {temp_file}")
+                 except Exception as e:
+                     self.logger.warning(f"Failed to clean up temp file {temp_file}: {e}")
+
+         elif self._is_cloud_storage_path(source) or self._is_storage_id(source):
+             # Clean up cloud storage downloaded files
+             temp_pattern = os.path.join(self.config.temp_dir, f"cloud_download_{hash(source)}_*")
+             for temp_file in glob.glob(temp_pattern):
+                 try:
+                     os.remove(temp_file)
+                     self.logger.debug(f"Cleaned up cloud temp file: {temp_file}")
+                 except Exception as e:
+                     self.logger.warning(f"Failed to clean up cloud temp file {temp_file}: {e}")
+
+     # Helper methods for structure extraction
+     def _extract_pdf_structure(self, text: str) -> Dict[str, Any]:
+         """Extract structure from PDF text"""
+         # Implement PDF structure extraction logic
+         return {"sections": [], "headings": []}
+
+     def _extract_office_structure(self, file_path: str, doc_type: DocumentType) -> Dict[str, Any]:
+         """Extract structure from Office documents"""
+         # Implement Office document structure extraction
+         return {"sections": [], "tables": [], "images": []}
+
+     def _extract_text_structure(self, content: str, doc_type: DocumentType) -> Dict[str, Any]:
+         """Extract structure from text documents"""
+         result = {"text": content}
+
+         if doc_type == DocumentType.MARKDOWN:
+             # Extract markdown structure
+             headings = re.findall(r"^(#{1,6})\s+(.+)$", content, re.MULTILINE)
+             result["headings"] = [{"level": len(h[0]), "text": h[1]} for h in headings]
+         elif doc_type == DocumentType.HTML:
+             # Extract HTML structure (simplified)
+             from bs4 import BeautifulSoup
+
+             soup = BeautifulSoup(content, "html.parser")
+             result["title"] = soup.title.string if soup.title else ""
+             result["headings"] = [
+                 {"tag": h.name, "text": h.get_text()}
+                 for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+             ]
+         elif doc_type == DocumentType.JSON:
+             import json
+
+             try:
+                 result["json_data"] = json.loads(content)
+             except Exception:
+                 pass
+
+         return result
+
+     def _split_into_pages(self, text: str) -> List[str]:
+         """Split text into pages (simplified)"""
+         # This is a simple implementation - could be enhanced
+         # Form feed character often indicates page break
+         pages = text.split("\f")
+         return [page.strip() for page in pages if page.strip()]
+
+     def _extract_text_fallback(self, file_path: str) -> str:
+         """Fallback text extraction method"""
+         try:
+             with open(
+                 file_path,
+                 "r",
+                 encoding=self.config.default_encoding,
+                 errors="ignore",
+             ) as f:
+                 return f.read()
+         except Exception:
+             with open(file_path, "rb") as f:
+                 return f.read().decode("utf-8", errors="ignore")