aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
from aiecs.tools import register_tool
|
|
2
|
+
from aiecs.tools.base_tool import BaseTool
|
|
3
|
+
from pydantic import BaseModel, field_validator, ConfigDict, Field
|
|
4
|
+
from pptx.util import Inches
|
|
5
|
+
from pptx import Presentation
|
|
6
|
+
from docx.shared import Pt
|
|
7
|
+
from docx import Document as DocxDocument
|
|
8
|
+
from tika import parser
|
|
9
|
+
import os
|
|
10
|
+
import logging
|
|
11
|
+
import warnings
|
|
12
|
+
from typing import List, Dict, Optional, Any
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import pdfplumber
|
|
16
|
+
import pytesseract
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
19
|
+
# Configure Tika log path to user-writable directory before importing
# NOTE(review): `from tika import parser` already ran above this point, so
# setting TIKA_LOG_PATH here may be too late to influence that import —
# confirm whether this assignment should move above the tika import.
os.environ["TIKA_LOG_PATH"] = os.path.expanduser("~/.cache/tika")
# Make sure the log directory exists so Tika can write into it.
os.makedirs(os.path.expanduser("~/.cache/tika"), exist_ok=True)

# Suppress pkg_resources deprecation warning from tika
warnings.filterwarnings("ignore", category=UserWarning, module="tika")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Module-level default configuration for validators.
# These are consumed by BaseFileSchema.validate_path, which runs at schema
# level and therefore has no access to a per-instance OfficeTool.Config;
# keep them in sync with the defaults declared on OfficeTool.Config.
_DEFAULT_MAX_FILE_SIZE_MB = 100  # maximum accepted input file size, in megabytes
_DEFAULT_ALLOWED_EXTENSIONS = [
    # Office / document formats
    ".docx",
    ".pptx",
    ".xlsx",
    ".pdf",
    # Image formats (processed via OCR in _extract_image_text)
    ".png",
    ".jpg",
    ".jpeg",
    ".tiff",
    ".bmp",
    ".gif",
]
|
|
41
|
+
|
|
42
|
+
# Exceptions


class OfficeToolError(Exception):
    """Base exception for OfficeTool errors; catch this to handle all tool failures."""


class InputValidationError(OfficeToolError):
    """Raised when input validation fails."""


class FileOperationError(OfficeToolError):
    """Raised when file operations (read/write/extract) fail."""


class SecurityError(OfficeToolError):
    """Raised for security-related issues such as path traversal or disallowed extensions."""


class ContentValidationError(OfficeToolError):
    """Raised when document content validation fails."""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Base schema for common fields
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class BaseFileSchema(BaseModel):
    """
    Base schema providing shared, security-validated file path fields.

    Every path field is normalized to an absolute path and checked for path
    traversal, allowed directories, allowed extension, existence and size
    (for ``file_path``), and accidental overwrite (for ``output_path``).
    """

    file_path: Optional[str] = None  # input file; must exist and satisfy the size limit
    output_path: Optional[str] = None  # output file; must not already exist
    image_path: Optional[str] = None  # auxiliary image file

    @field_validator("file_path", "output_path", "image_path")
    @classmethod
    def validate_path(cls, v: Optional[str], info: Any) -> Optional[str]:
        """Validate file paths for existence, size, extension, and path traversal.

        Fix: the original used the Pydantic V1 validator signature
        ``(cls, v, field)`` and ``field.field_name``; the Pydantic V2
        ``field_validator`` imported by this module passes a ``ValidationInfo``
        object as the second argument instead, so the old form fails.

        Raises:
            SecurityError: On traversal attempts, disallowed directories, or
                disallowed extensions.
            FileOperationError: On missing/oversized inputs or pre-existing outputs.
        """
        if not v:
            return v
        abs_path = os.path.abspath(os.path.normpath(v))
        # Reject traversal / expansion metacharacters in the raw value.
        if ".." in v or "~" in v or "%" in v:
            raise SecurityError(f"Path traversal attempt detected: {v}")
        # Ensure path is in allowed directories.
        base_dir = os.path.abspath(os.getcwd())
        allowed_dirs = [
            os.path.abspath(os.path.normpath(d)) for d in ["/tmp", "./data", "./uploads"]
        ]

        def _inside(path: str, root: str) -> bool:
            # Separator-aware prefix test: a bare startswith() would wrongly
            # accept e.g. "/tmp-evil" as being inside "/tmp".
            return path == root or path.startswith(root + os.sep)

        if not _inside(abs_path, base_dir) and not any(
            _inside(abs_path, d) for d in allowed_dirs
        ):
            raise SecurityError(f"Path not in allowed directories: {abs_path}")
        # Check extension against the module-level whitelist.
        ext = os.path.splitext(abs_path)[1].lower()
        if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
            raise SecurityError(
                f"Extension '{ext}' not allowed for '{info.field_name}', expected {_DEFAULT_ALLOWED_EXTENSIONS}"
            )
        # Check file existence and size for input paths.
        if info.field_name == "file_path":
            if not os.path.isfile(abs_path):
                raise FileOperationError(f"{info.field_name}: File not found: {abs_path}")
            size_mb = os.path.getsize(abs_path) / (1024 * 1024)
            if size_mb > _DEFAULT_MAX_FILE_SIZE_MB:
                raise FileOperationError(
                    f"{info.field_name}: File too large: {size_mb:.1f}MB, max {_DEFAULT_MAX_FILE_SIZE_MB}MB"
                )
        # Refuse to overwrite existing output paths.
        elif info.field_name == "output_path" and os.path.exists(abs_path):
            raise FileOperationError(f"{info.field_name}: File already exists: {abs_path}")
        return abs_path
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Schemas for operations


class ReadDocxSchema(BaseFileSchema):
    """Schema for reading DOCX files."""

    file_path: str  # path to an existing .docx file; validated by BaseFileSchema
    include_tables: bool = False  # when True, table cell text is returned as well


class WriteDocxSchema(BaseFileSchema):
    """Schema for writing DOCX files."""

    text: str  # body text; write_docx turns each line into a paragraph
    output_path: str  # destination path; must not already exist (BaseFileSchema check)
    table_data: Optional[List[List[str]]] = None  # optional rows of cell strings to append as a table


class ReadPptxSchema(BaseFileSchema):
    """Schema for reading PPTX files."""

    file_path: str  # path to an existing .pptx file


class WritePptxSchema(BaseFileSchema):
    """Schema for writing PPTX files."""

    slides: List[str]  # slide text contents, one entry per slide
    output_path: str  # destination path for the new presentation
    image_path: Optional[str] = None  # optional image file path


class ReadXlsxSchema(BaseFileSchema):
    """Schema for reading XLSX files."""

    file_path: str  # path to an existing .xlsx file
    sheet_name: Optional[str] = None  # specific sheet to read; None selects the default


class WriteXlsxSchema(BaseFileSchema):
    """Schema for writing XLSX files."""

    data: List[Dict]  # row records to write
    output_path: str  # destination path for the new workbook
    sheet_name: str = "Sheet1"  # worksheet name to create


class ExtractTextSchema(BaseFileSchema):
    """Schema for extracting text from files."""

    file_path: str  # path to the file whose text should be extracted
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@register_tool("office")
|
|
166
|
+
class OfficeTool(BaseTool):
|
|
167
|
+
"""
|
|
168
|
+
Office document processing tool supporting:
|
|
169
|
+
- read_docx: Read content from DOCX files.
|
|
170
|
+
- write_docx: Write content to DOCX files.
|
|
171
|
+
- read_pptx: Read content from PPTX files.
|
|
172
|
+
- write_pptx: Write content to PPTX files.
|
|
173
|
+
- read_xlsx: Read content from XLSX files.
|
|
174
|
+
- write_xlsx: Write content to XLSX files.
|
|
175
|
+
- extract_text: Extract text from various file formats.
|
|
176
|
+
|
|
177
|
+
Inherits from BaseTool to leverage ToolExecutor for caching, concurrency, and error handling.
|
|
178
|
+
"""
|
|
179
|
+
|
|
180
|
+
# Configuration schema
class Config(BaseModel):
    """Configuration for the office tool."""

    # NOTE(review): `env_prefix` is a pydantic-settings/BaseSettings option;
    # on a plain BaseModel it appears to have no effect — confirm whether
    # environment-variable overrides were intended here.
    model_config = ConfigDict(env_prefix="OFFICE_TOOL_")

    # Hard cap on input document size (see also _DEFAULT_MAX_FILE_SIZE_MB,
    # which the schema-level validator uses independently of this value).
    max_file_size_mb: int = Field(default=100, description="Maximum file size in megabytes")
    # Font applied to the "Normal" style when writing DOCX files.
    default_font: str = Field(default="Arial", description="Default font for documents")
    default_font_size: int = Field(default=12, description="Default font size in points")
    # Extension whitelist; keep in sync with _DEFAULT_ALLOWED_EXTENSIONS.
    allowed_extensions: List[str] = Field(
        default=[
            ".docx",
            ".pptx",
            ".xlsx",
            ".pdf",
            ".png",
            ".jpg",
            ".jpeg",
            ".tiff",
            ".bmp",
            ".gif",
        ],
        description="Allowed document file extensions",
    )
|
|
204
|
+
|
|
205
|
+
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """
    Create an OfficeTool instance.

    Args:
        config (Dict, optional): Overrides applied on top of the default
            Config values.

    Raises:
        ValueError: If the supplied configuration is invalid.
    """
    super().__init__(config)

    # Materialize the validated configuration model from the overrides.
    overrides = config or {}
    self.config = self.Config(**overrides)

    # Attach a stream handler only once so repeated instantiation does not
    # duplicate log output.
    log = logging.getLogger(__name__)
    if not log.handlers:
        stream_handler = logging.StreamHandler()
        fmt = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        stream_handler.setFormatter(fmt)
        log.addHandler(stream_handler)
        log.setLevel(logging.INFO)
    self.logger = log
|
|
226
|
+
|
|
227
|
+
def _validate_document(self, file_path: str, file_type: str) -> None:
    """
    Validate document structure before processing.

    Args:
        file_path (str): Path to the document file.
        file_type (str): Type of document ('docx', 'pptx', 'xlsx', 'pdf', 'image').
            Any other value falls back to Apache Tika parsing.

    Raises:
        ContentValidationError: If the document structure is invalid.
    """
    try:
        if file_type == "docx":
            doc = DocxDocument(file_path)
            if not hasattr(doc, "paragraphs"):
                raise ContentValidationError("Invalid DOCX structure")
        elif file_type == "pptx":
            prs = Presentation(file_path)
            if not hasattr(prs, "slides"):
                raise ContentValidationError("Invalid PPTX structure")
        elif file_type == "xlsx":
            # Just validate that file can be read - don't care about return
            # type
            pd.read_excel(file_path, nrows=5)
        elif file_type == "pdf":
            with pdfplumber.open(file_path) as pdf:
                if len(pdf.pages) == 0:
                    raise ContentValidationError("PDF has no pages")
        elif file_type == "image":
            img = Image.open(file_path)
            img.verify()  # Verify it's a valid image
        else:
            # Use tika as fallback for other formats
            parsed = parser.from_file(file_path)
            if not parsed or not parsed.get("content"):
                raise ContentValidationError("Unable to parse file content")
    except ContentValidationError:
        # Fix: the original generic handler below also caught these and
        # re-wrapped them, turning e.g. "Invalid DOCX structure" into
        # "Invalid DOCX file: Invalid DOCX structure". Propagate unchanged,
        # matching the pattern used by read_docx().
        raise
    except Exception as e:
        raise ContentValidationError(f"Invalid {file_type.upper()} file: {str(e)}")
|
|
265
|
+
|
|
266
|
+
def _sanitize_text(self, text: str) -> str:
|
|
267
|
+
"""
|
|
268
|
+
Sanitize text to remove potentially harmful control characters.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
text (str): Input text.
|
|
272
|
+
|
|
273
|
+
Returns:
|
|
274
|
+
str: Sanitized text.
|
|
275
|
+
"""
|
|
276
|
+
if not text:
|
|
277
|
+
return ""
|
|
278
|
+
return "".join(char for char in text if ord(char) >= 32 or char in "\n\r\t")
|
|
279
|
+
|
|
280
|
+
def _sanitize_table_data(
|
|
281
|
+
self, table_data: Optional[List[List[str]]]
|
|
282
|
+
) -> Optional[List[List[str]]]:
|
|
283
|
+
"""
|
|
284
|
+
Sanitize table data to remove harmful content.
|
|
285
|
+
|
|
286
|
+
Args:
|
|
287
|
+
table_data (Optional[List[List[str]]]): Table data to sanitize.
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
Optional[List[List[str]]]: Sanitized table data.
|
|
291
|
+
"""
|
|
292
|
+
if not table_data:
|
|
293
|
+
return None
|
|
294
|
+
return [[self._sanitize_text(str(cell)) for cell in row] for row in table_data]
|
|
295
|
+
|
|
296
|
+
def _sanitize_data(self, data_list: List[Dict]) -> List[Dict]:
|
|
297
|
+
"""
|
|
298
|
+
Sanitize Excel data to remove harmful content and enforce limits.
|
|
299
|
+
|
|
300
|
+
Args:
|
|
301
|
+
data_list (List[Dict]): List of dictionaries to sanitize.
|
|
302
|
+
|
|
303
|
+
Returns:
|
|
304
|
+
List[Dict]: Sanitized data.
|
|
305
|
+
"""
|
|
306
|
+
if not data_list:
|
|
307
|
+
return []
|
|
308
|
+
sanitized = []
|
|
309
|
+
for item in data_list:
|
|
310
|
+
clean_item = {}
|
|
311
|
+
for k, v in item.items():
|
|
312
|
+
# Excel key limit with sanitization
|
|
313
|
+
clean_key = self._sanitize_text(str(k))[:255]
|
|
314
|
+
if isinstance(v, str):
|
|
315
|
+
clean_value = self._sanitize_text(v)[:32767] # Excel cell limit
|
|
316
|
+
else:
|
|
317
|
+
clean_value = v
|
|
318
|
+
clean_item[clean_key] = clean_value
|
|
319
|
+
sanitized.append(clean_item)
|
|
320
|
+
return sanitized
|
|
321
|
+
|
|
322
|
+
def _extract_pdf_text(self, file_path: str) -> str:
    """
    Extract text from a PDF using pdfplumber.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Newline-joined text of all pages that yielded text.

    Raises:
        FileOperationError: If PDF text extraction fails.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            extracted = [page.extract_text() for page in pdf.pages]
        # Drop pages that produced no text (None or empty string).
        return "\n".join(chunk for chunk in extracted if chunk)
    except Exception as e:
        raise FileOperationError(f"Failed to extract PDF text: {str(e)}")
|
|
345
|
+
|
|
346
|
+
def _extract_image_text(self, file_path: str) -> str:
    """
    Extract text from an image via pytesseract OCR (English + simplified Chinese).

    Args:
        file_path (str): Path to the image file.

    Returns:
        str: Whitespace-stripped OCR output.

    Raises:
        FileOperationError: If image text extraction fails.
    """
    try:
        picture = Image.open(file_path)
        # Normalize non-RGB images to RGB before running OCR.
        rgb_image = picture.convert("RGB") if picture.mode != "RGB" else picture
        return pytesseract.image_to_string(rgb_image, lang="eng+chi_sim").strip()
    except Exception as e:
        raise FileOperationError(f"Failed to extract image text: {str(e)}")
|
|
368
|
+
|
|
369
|
+
def _extract_tika_text(self, file_path: str) -> str:
    """
    Fallback text extraction through Apache Tika.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Stripped text content, or "" when Tika finds none.

    Raises:
        FileOperationError: If Tika text extraction fails.
    """
    try:
        result = parser.from_file(file_path)
        body = result.get("content", "")
        if not body:
            return ""
        return body.strip()
    except Exception as e:
        raise FileOperationError(f"Failed to extract text with Tika: {str(e)}")
|
|
388
|
+
|
|
389
|
+
def read_docx(self, file_path: str, include_tables: bool = False) -> Dict[str, Any]:
    """
    Read content from a DOCX file.

    Args:
        file_path (str): Path to the DOCX file.
        include_tables (bool): Whether to include table data.

    Returns:
        Dict[str, Any]: {'paragraphs': List[str], 'tables': Optional[List[List[List[str]]]]}
        where tables is None unless include_tables is True.

    Raises:
        FileOperationError: If the file cannot be read.
        ContentValidationError: If the document structure is invalid.
    """
    try:
        self._validate_document(file_path, "docx")
        document = DocxDocument(file_path)
        # Keep only paragraphs that carry non-whitespace text.
        paragraphs = [para.text for para in document.paragraphs if para.text.strip()]
        table_dump = None
        if include_tables:
            table_dump = []
            for tbl in document.tables:
                table_dump.append([[cell.text for cell in row.cells] for row in tbl.rows])
        return {"paragraphs": paragraphs, "tables": table_dump}
    except ContentValidationError:
        raise
    except Exception as e:
        raise FileOperationError(f"Failed to read DOCX: {str(e)}")
|
|
419
|
+
|
|
420
|
+
def write_docx(
    self,
    text: str,
    output_path: str,
    table_data: Optional[List[List[str]]] = None,
) -> Dict[str, Any]:
    """
    Write content to a DOCX file.

    Args:
        text (str): Text content to write; each line becomes a paragraph.
        output_path (str): Path to save the DOCX file.
        table_data (Optional[List[List[str]]]): Table data to include.
            Rows may have unequal lengths; short rows are padded with
            empty cells.

    Returns:
        Dict[str, Any]: Status {'success': bool, 'file_path': str}.

    Raises:
        FileOperationError: If file cannot be written.
    """
    try:
        sanitized_text = self._sanitize_text(text)
        sanitized_table_data = self._sanitize_table_data(table_data)
        doc = DocxDocument()
        # Apply the configured default font to the base paragraph style.
        style = doc.styles["Normal"]
        style.font.name = self.config.default_font
        style.font.size = Pt(self.config.default_font_size)
        for line in sanitized_text.splitlines():
            doc.add_paragraph(line)
        if sanitized_table_data:
            # Width is the widest row so irregular table data still fits.
            # Fix: previously the table was skipped whenever the FIRST row
            # was empty, even if later rows contained cells; now only a
            # table with no cells at all is omitted.
            max_cols = max((len(row) for row in sanitized_table_data), default=0)
            if max_cols:
                table = doc.add_table(rows=len(sanitized_table_data), cols=max_cols)
                for i, row in enumerate(sanitized_table_data):
                    for j in range(max_cols):
                        # Pad short rows with empty cells.
                        table.rows[i].cells[j].text = str(row[j]) if j < len(row) else ""
        doc.save(output_path)
        return {"success": True, "file_path": output_path}
    except Exception as e:
        raise FileOperationError(f"Failed to write DOCX: {str(e)}")
|
|
464
|
+
|
|
465
|
+
def read_pptx(self, file_path: str) -> List[str]:
    """
    Read content from a PPTX file.

    Args:
        file_path (str): Path to the PPTX file.

    Returns:
        List[str]: List of text content from slides.

    Raises:
        FileOperationError: If file cannot be read.
        ContentValidationError: If document structure is invalid.
    """
    try:
        self._validate_document(file_path, "pptx")
        presentation = Presentation(file_path)
        collected: List[str] = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Only shapes exposing a text attribute (e.g. text frames)
                # carry readable content.
                if not hasattr(shape, "text"):
                    continue
                stripped = shape.text.strip()
                if stripped:
                    collected.append(stripped)
        return collected
    except ContentValidationError:
        raise
    except Exception as e:
        raise FileOperationError(f"Failed to read PPTX: {str(e)}")
|
|
494
|
+
|
|
495
|
+
def write_pptx(
    self,
    slides: List[str],
    output_path: str,
    image_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Write content to a PPTX file.

    Args:
        slides (List[str]): List of slide contents.
        output_path (str): Path to save the PPTX file.
        image_path (Optional[str]): Path to an image to include on the first slide.

    Returns:
        Dict[str, Any]: Status {'success': bool, 'file_path': str}.

    Raises:
        FileOperationError: If file cannot be written.
    """
    try:
        cleaned = [self._sanitize_text(s) for s in slides]
        presentation = Presentation()
        # Layout index 6 is the built-in blank layout.
        blank_layout = presentation.slide_layouts[6]
        for index, body in enumerate(cleaned):
            new_slide = presentation.slides.add_slide(blank_layout)
            textbox = new_slide.shapes.add_textbox(Inches(1), Inches(1), Inches(8), Inches(5))
            frame = textbox.text_frame
            text_lines = body.splitlines()
            if text_lines:
                # The text frame starts with one empty paragraph; reuse it
                # for the first line instead of adding a new one.
                frame.text = text_lines[0]
                for extra in text_lines[1:]:
                    frame.add_paragraph().text = extra
            # The optional image goes on the first slide only; failure to
            # place it is logged, not fatal.
            if image_path and index == 0:
                try:
                    new_slide.shapes.add_picture(image_path, Inches(1), Inches(6), Inches(4))
                except Exception as img_err:
                    self.logger.warning(f"Could not add image to slide: {img_err}")
        presentation.save(output_path)
        return {"success": True, "file_path": output_path}
    except Exception as e:
        raise FileOperationError(f"Failed to write PPTX: {str(e)}")
|
|
540
|
+
|
|
541
|
+
def read_xlsx(self, file_path: str, sheet_name: Optional[str] = None) -> List[Dict]:
    """
    Read content from an XLSX file.

    Args:
        file_path (str): Path to the XLSX file.
        sheet_name (Optional[str]): Name of the sheet to read.

    Returns:
        List[Dict]: List of dictionaries representing Excel data.

    Raises:
        FileOperationError: If file cannot be read.
        ContentValidationError: If document structure is invalid.
    """
    try:
        self._validate_document(file_path, "xlsx")
        loaded = pd.read_excel(file_path, sheet_name=sheet_name)

        # pd.read_excel returns a single DataFrame for one sheet, or a
        # dict of DataFrames when sheet_name=None selects every sheet.
        if isinstance(loaded, pd.DataFrame):
            frame = loaded
        elif isinstance(loaded, dict):
            # Multiple sheets: fall back to the first one.
            frame = loaded[next(iter(loaded))]
        else:
            raise FileOperationError("Unexpected data type returned from Excel file")
        return frame.to_dict(orient="records")
    except ContentValidationError:
        raise
    except Exception as e:
        raise FileOperationError(f"Failed to read XLSX: {str(e)}")
|
|
576
|
+
|
|
577
|
+
def write_xlsx(
    self, data: List[Dict], output_path: str, sheet_name: str = "Sheet1"
) -> Dict[str, Any]:
    """
    Write content to an XLSX file.

    Args:
        data (List[Dict]): Data to write.
        output_path (str): Path to save the XLSX file.
        sheet_name (str): Name of the sheet.

    Returns:
        Dict[str, Any]: Status {'success': bool, 'file_path': str}.

    Raises:
        FileOperationError: If file cannot be written.
    """
    try:
        cleaned = self._sanitize_data(data)
        # An empty payload still produces a valid workbook containing one
        # empty sheet.
        if cleaned:
            frame = pd.DataFrame(cleaned)
        else:
            frame = pd.DataFrame()
        frame.to_excel(output_path, index=False, sheet_name=sheet_name)
        return {"success": True, "file_path": output_path}
    except Exception as e:
        raise FileOperationError(f"Failed to write XLSX: {str(e)}")
|
|
605
|
+
|
|
606
|
+
def extract_text(self, file_path: str) -> str:
    """
    Extract text from various file formats using combination library approach.

    Dispatches on the file extension: native extractors for PDF, DOCX,
    PPTX, XLSX and images (OCR), with Apache Tika as the fallback for
    everything else.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Extracted (sanitized) text content.

    Raises:
        FileOperationError: If text extraction fails.
        ContentValidationError: If document structure is invalid.
    """
    try:
        file_ext = os.path.splitext(file_path)[1].lower()

        # Lookup table replaces the long if/elif extension ladder;
        # unknown extensions fall through to the Tika path.
        ext_to_type = {
            ".pdf": "pdf",
            ".docx": "docx",
            ".pptx": "pptx",
            ".xlsx": "xlsx",
            ".png": "image",
            ".jpg": "image",
            ".jpeg": "image",
            ".tiff": "image",
            ".bmp": "image",
            ".gif": "image",
        }
        file_type = ext_to_type.get(file_ext, "other")

        # Validate document structure before attempting extraction.
        self._validate_document(file_path, file_type)

        # Extract text based on file type
        if file_type == "pdf":
            return self._sanitize_text(self._extract_pdf_text(file_path))
        elif file_type == "docx":
            doc = DocxDocument(file_path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            return self._sanitize_text("\n".join(paragraphs))
        elif file_type == "pptx":
            prs = Presentation(file_path)
            texts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Fix: append the STRIPPED text, consistent with
                    # read_pptx (previously the raw text was appended
                    # even though the strip() result was what was tested).
                    if hasattr(shape, "text"):
                        txt = shape.text.strip()
                        if txt:
                            texts.append(txt)
            return self._sanitize_text("\n".join(texts))
        elif file_type == "xlsx":
            data = pd.read_excel(file_path)
            # pd.read_excel may return a DataFrame or a dict of them.
            if isinstance(data, pd.DataFrame):
                return self._sanitize_text(data.to_string(index=False))
            elif isinstance(data, dict):
                # Multiple sheets returned as dict - use the first sheet
                first_df = data[next(iter(data))]
                return self._sanitize_text(first_df.to_string(index=False))
            else:
                # Fallback for unexpected data types
                return self._sanitize_text("")
        elif file_type == "image":
            return self._sanitize_text(self._extract_image_text(file_path))
        else:
            # Use Tika as fallback for other formats
            return self._sanitize_text(self._extract_tika_text(file_path))

    except ContentValidationError:
        raise
    except Exception as e:
        raise FileOperationError(f"Failed to extract text: {str(e)}")
|