aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -1,138 +1,109 @@
|
|
|
1
|
+
from aiecs.tools import register_tool
|
|
2
|
+
from aiecs.tools.base_tool import BaseTool
|
|
3
|
+
from pydantic import BaseModel, field_validator, Field
|
|
4
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
5
|
+
from pptx.util import Inches
|
|
6
|
+
from pptx import Presentation
|
|
7
|
+
from docx.shared import Pt
|
|
8
|
+
from docx import Document as DocxDocument
|
|
9
|
+
from tika import parser # type: ignore[import-untyped]
|
|
1
10
|
import os
|
|
2
11
|
import logging
|
|
12
|
+
import warnings
|
|
3
13
|
from typing import List, Dict, Optional, Any
|
|
4
14
|
|
|
5
|
-
import pandas as pd
|
|
15
|
+
import pandas as pd # type: ignore[import-untyped]
|
|
6
16
|
import pdfplumber
|
|
7
|
-
import pytesseract
|
|
17
|
+
import pytesseract # type: ignore[import-untyped]
|
|
8
18
|
from PIL import Image
|
|
9
|
-
from tika import parser
|
|
10
|
-
from docx import Document as DocxDocument
|
|
11
|
-
from docx.shared import Pt
|
|
12
|
-
from pptx import Presentation
|
|
13
|
-
from pptx.util import Inches
|
|
14
|
-
from pydantic import BaseModel, field_validator, ValidationError, ConfigDict
|
|
15
|
-
from pydantic_settings import BaseSettings
|
|
16
19
|
|
|
17
|
-
|
|
18
|
-
|
|
20
|
+
# Tika log path will be configured via Config class
|
|
21
|
+
|
|
22
|
+
# Suppress pkg_resources deprecation warning from tika
|
|
23
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="tika")
|
|
19
24
|
|
|
20
|
-
# Configuration for OfficeTool
|
|
21
|
-
class OfficeSettings(BaseSettings):
|
|
22
|
-
"""
|
|
23
|
-
Configuration for OfficeTool.
|
|
24
|
-
|
|
25
|
-
Attributes:
|
|
26
|
-
max_file_size_mb (int): Maximum file size in megabytes.
|
|
27
|
-
default_font (str): Default font for documents.
|
|
28
|
-
default_font_size (int): Default font size in points.
|
|
29
|
-
allowed_extensions (List[str]): Allowed document file extensions.
|
|
30
|
-
env_prefix (str): Environment variable prefix for settings.
|
|
31
|
-
"""
|
|
32
|
-
max_file_size_mb: int = 100
|
|
33
|
-
default_font: str = "Arial"
|
|
34
|
-
default_font_size: int = 12
|
|
35
|
-
allowed_extensions: List[str] = ['.docx', '.pptx', '.xlsx', '.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']
|
|
36
|
-
env_prefix: str = 'OFFICE_TOOL_'
|
|
37
25
|
|
|
38
|
-
|
|
26
|
+
# Module-level default configuration for validators
|
|
27
|
+
_DEFAULT_MAX_FILE_SIZE_MB = 100
|
|
28
|
+
_DEFAULT_ALLOWED_EXTENSIONS = [
|
|
29
|
+
".docx",
|
|
30
|
+
".pptx",
|
|
31
|
+
".xlsx",
|
|
32
|
+
".pdf",
|
|
33
|
+
".png",
|
|
34
|
+
".jpg",
|
|
35
|
+
".jpeg",
|
|
36
|
+
".tiff",
|
|
37
|
+
".bmp",
|
|
38
|
+
".gif",
|
|
39
|
+
]
|
|
39
40
|
|
|
40
41
|
# Exceptions
|
|
42
|
+
|
|
43
|
+
|
|
41
44
|
class OfficeToolError(Exception):
|
|
42
45
|
"""Base exception for OfficeTool errors."""
|
|
43
|
-
|
|
46
|
+
|
|
44
47
|
|
|
45
48
|
class InputValidationError(OfficeToolError):
|
|
46
49
|
"""Raised when input validation fails."""
|
|
47
|
-
|
|
50
|
+
|
|
48
51
|
|
|
49
52
|
class FileOperationError(OfficeToolError):
|
|
50
53
|
"""Raised when file operations fail."""
|
|
51
|
-
|
|
54
|
+
|
|
52
55
|
|
|
53
56
|
class SecurityError(OfficeToolError):
|
|
54
57
|
"""Raised for security-related issues."""
|
|
55
|
-
|
|
58
|
+
|
|
56
59
|
|
|
57
60
|
class ContentValidationError(OfficeToolError):
|
|
58
61
|
"""Raised when document content validation fails."""
|
|
59
|
-
|
|
62
|
+
|
|
60
63
|
|
|
61
64
|
# Base schema for common fields
|
|
65
|
+
|
|
66
|
+
|
|
62
67
|
class BaseFileSchema(BaseModel):
|
|
63
68
|
file_path: Optional[str] = None
|
|
64
69
|
output_path: Optional[str] = None
|
|
65
70
|
image_path: Optional[str] = None
|
|
66
71
|
|
|
67
|
-
@field_validator(
|
|
72
|
+
@field_validator("file_path", "output_path", "image_path")
|
|
68
73
|
def validate_path(cls, v: Optional[str], field) -> Optional[str]:
|
|
69
74
|
"""Validate file paths for existence, size, extension, and path traversal."""
|
|
70
75
|
if not v:
|
|
71
76
|
return v
|
|
72
|
-
settings = OfficeSettings()
|
|
73
77
|
abs_path = os.path.abspath(os.path.normpath(v))
|
|
74
78
|
# Check for path traversal
|
|
75
|
-
if
|
|
79
|
+
if ".." in v or "~" in v or "%" in v:
|
|
76
80
|
raise SecurityError(f"Path traversal attempt detected: {v}")
|
|
77
81
|
# Ensure path is in allowed directories
|
|
78
82
|
base_dir = os.path.abspath(os.getcwd())
|
|
79
|
-
allowed_dirs = [os.path.abspath(os.path.normpath(d)) for d in [
|
|
83
|
+
allowed_dirs = [os.path.abspath(os.path.normpath(d)) for d in ["/tmp", "./data", "./uploads"]]
|
|
80
84
|
if not abs_path.startswith(base_dir) and not any(abs_path.startswith(d) for d in allowed_dirs):
|
|
81
85
|
raise SecurityError(f"Path not in allowed directories: {abs_path}")
|
|
82
86
|
# Check extension
|
|
83
87
|
ext = os.path.splitext(abs_path)[1].lower()
|
|
84
|
-
if ext not in
|
|
85
|
-
raise SecurityError(f"Extension '{ext}' not allowed for '{field.field_name}', expected {
|
|
88
|
+
if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
|
|
89
|
+
raise SecurityError(f"Extension '{ext}' not allowed for '{field.field_name}', expected {_DEFAULT_ALLOWED_EXTENSIONS}")
|
|
86
90
|
# Check file existence and size for input paths
|
|
87
|
-
if field.field_name ==
|
|
91
|
+
if field.field_name == "file_path":
|
|
88
92
|
if not os.path.isfile(abs_path):
|
|
89
93
|
raise FileOperationError(f"{field.field_name}: File not found: {abs_path}")
|
|
90
94
|
size_mb = os.path.getsize(abs_path) / (1024 * 1024)
|
|
91
|
-
if size_mb >
|
|
92
|
-
raise FileOperationError(f"{field.field_name}: File too large: {size_mb:.1f}MB, max {
|
|
95
|
+
if size_mb > _DEFAULT_MAX_FILE_SIZE_MB:
|
|
96
|
+
raise FileOperationError(f"{field.field_name}: File too large: {size_mb:.1f}MB, max {_DEFAULT_MAX_FILE_SIZE_MB}MB")
|
|
93
97
|
# Check for existing output paths
|
|
94
|
-
elif field.field_name ==
|
|
98
|
+
elif field.field_name == "output_path" and os.path.exists(abs_path):
|
|
95
99
|
raise FileOperationError(f"{field.field_name}: File already exists: {abs_path}")
|
|
96
100
|
return abs_path
|
|
97
101
|
|
|
98
|
-
# Schemas for operations
|
|
99
|
-
class ReadDocxSchema(BaseFileSchema):
|
|
100
|
-
"""Schema for reading DOCX files."""
|
|
101
|
-
file_path: str
|
|
102
|
-
include_tables: bool = False
|
|
103
|
-
|
|
104
|
-
class WriteDocxSchema(BaseFileSchema):
|
|
105
|
-
"""Schema for writing DOCX files."""
|
|
106
|
-
text: str
|
|
107
|
-
output_path: str
|
|
108
|
-
table_data: Optional[List[List[str]]] = None
|
|
109
|
-
|
|
110
|
-
class ReadPptxSchema(BaseFileSchema):
|
|
111
|
-
"""Schema for reading PPTX files."""
|
|
112
|
-
file_path: str
|
|
113
|
-
|
|
114
|
-
class WritePptxSchema(BaseFileSchema):
|
|
115
|
-
"""Schema for writing PPTX files."""
|
|
116
|
-
slides: List[str]
|
|
117
|
-
output_path: str
|
|
118
|
-
image_path: Optional[str] = None
|
|
119
|
-
|
|
120
|
-
class ReadXlsxSchema(BaseFileSchema):
|
|
121
|
-
"""Schema for reading XLSX files."""
|
|
122
|
-
file_path: str
|
|
123
|
-
sheet_name: Optional[str] = None
|
|
124
102
|
|
|
125
|
-
class
|
|
126
|
-
"""Schema for writing XLSX files."""
|
|
127
|
-
data: List[Dict]
|
|
128
|
-
output_path: str
|
|
129
|
-
sheet_name: str = 'Sheet1'
|
|
103
|
+
# Schemas for operations - moved to OfficeTool class as inner classes
|
|
130
104
|
|
|
131
|
-
class ExtractTextSchema(BaseFileSchema):
|
|
132
|
-
"""Schema for extracting text from files."""
|
|
133
|
-
file_path: str
|
|
134
105
|
|
|
135
|
-
@register_tool(
|
|
106
|
+
@register_tool("office")
|
|
136
107
|
class OfficeTool(BaseTool):
|
|
137
108
|
"""
|
|
138
109
|
Office document processing tool supporting:
|
|
@@ -146,27 +117,115 @@ class OfficeTool(BaseTool):
|
|
|
146
117
|
|
|
147
118
|
Inherits from BaseTool to leverage ToolExecutor for caching, concurrency, and error handling.
|
|
148
119
|
"""
|
|
149
|
-
|
|
120
|
+
|
|
121
|
+
# Configuration schema
|
|
122
|
+
class Config(BaseSettings):
|
|
123
|
+
"""Configuration for the office tool
|
|
124
|
+
|
|
125
|
+
Automatically reads from environment variables with OFFICE_TOOL_ prefix.
|
|
126
|
+
Example: OFFICE_TOOL_MAX_FILE_SIZE_MB -> max_file_size_mb
|
|
150
127
|
"""
|
|
151
|
-
|
|
128
|
+
|
|
129
|
+
model_config = SettingsConfigDict(env_prefix="OFFICE_TOOL_")
|
|
130
|
+
|
|
131
|
+
max_file_size_mb: int = Field(default=100, description="Maximum file size in megabytes")
|
|
132
|
+
default_font: str = Field(default="Arial", description="Default font for documents")
|
|
133
|
+
default_font_size: int = Field(default=12, description="Default font size in points")
|
|
134
|
+
tika_log_path: str = Field(
|
|
135
|
+
default=os.path.expanduser("~/.cache/tika"),
|
|
136
|
+
description="Tika log directory path",
|
|
137
|
+
)
|
|
138
|
+
allowed_extensions: List[str] = Field(
|
|
139
|
+
default=[
|
|
140
|
+
".docx",
|
|
141
|
+
".pptx",
|
|
142
|
+
".xlsx",
|
|
143
|
+
".pdf",
|
|
144
|
+
".png",
|
|
145
|
+
".jpg",
|
|
146
|
+
".jpeg",
|
|
147
|
+
".tiff",
|
|
148
|
+
".bmp",
|
|
149
|
+
".gif",
|
|
150
|
+
],
|
|
151
|
+
description="Allowed document file extensions",
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# Schema definitions
|
|
155
|
+
class Read_docxSchema(BaseFileSchema):
|
|
156
|
+
"""Schema for read_docx operation"""
|
|
157
|
+
|
|
158
|
+
file_path: str = Field(description="Path to the DOCX file to read")
|
|
159
|
+
include_tables: bool = Field(default=False, description="Whether to include table data in the output. If True, tables are included as nested lists")
|
|
160
|
+
|
|
161
|
+
class Write_docxSchema(BaseFileSchema):
|
|
162
|
+
"""Schema for write_docx operation"""
|
|
163
|
+
|
|
164
|
+
text: str = Field(description="Text content to write to the DOCX file")
|
|
165
|
+
output_path: str = Field(description="Path where the DOCX file will be saved")
|
|
166
|
+
table_data: Optional[List[List[str]]] = Field(default=None, description="Optional table data to include in the document. Each inner list represents a row, each string represents a cell")
|
|
167
|
+
|
|
168
|
+
class Read_pptxSchema(BaseFileSchema):
|
|
169
|
+
"""Schema for read_pptx operation"""
|
|
170
|
+
|
|
171
|
+
file_path: str = Field(description="Path to the PPTX file to read")
|
|
172
|
+
|
|
173
|
+
class Write_pptxSchema(BaseFileSchema):
|
|
174
|
+
"""Schema for write_pptx operation"""
|
|
175
|
+
|
|
176
|
+
slides: List[str] = Field(description="List of slide content strings. Each string becomes a slide")
|
|
177
|
+
output_path: str = Field(description="Path where the PPTX file will be saved")
|
|
178
|
+
image_path: Optional[str] = Field(default=None, description="Optional path to an image file to include on the first slide")
|
|
179
|
+
|
|
180
|
+
class Read_xlsxSchema(BaseFileSchema):
|
|
181
|
+
"""Schema for read_xlsx operation"""
|
|
182
|
+
|
|
183
|
+
file_path: str = Field(description="Path to the XLSX file to read")
|
|
184
|
+
sheet_name: Optional[str] = Field(default=None, description="Optional name of the sheet to read. If None, reads the first sheet")
|
|
185
|
+
|
|
186
|
+
class Write_xlsxSchema(BaseFileSchema):
|
|
187
|
+
"""Schema for write_xlsx operation"""
|
|
188
|
+
|
|
189
|
+
data: List[Dict[str, Any]] = Field(description="List of dictionaries representing Excel rows. Each dictionary key becomes a column header, values become cell data")
|
|
190
|
+
output_path: str = Field(description="Path where the XLSX file will be saved")
|
|
191
|
+
sheet_name: str = Field(default="Sheet1", description="Name of the Excel sheet to create")
|
|
192
|
+
|
|
193
|
+
class Extract_textSchema(BaseFileSchema):
|
|
194
|
+
"""Schema for extract_text operation"""
|
|
195
|
+
|
|
196
|
+
file_path: str = Field(description="Path to the file to extract text from. Supports DOCX, PPTX, XLSX, PDF, and image formats")
|
|
197
|
+
|
|
198
|
+
def __init__(self, config: Optional[Dict[str, Any]] = None, **kwargs):
|
|
199
|
+
"""
|
|
200
|
+
Initialize OfficeTool with configuration.
|
|
201
|
+
|
|
202
|
+
Configuration is automatically loaded by BaseTool from:
|
|
203
|
+
1. Explicit config dict (highest priority)
|
|
204
|
+
2. YAML config files (config/tools/office_tool.yaml)
|
|
205
|
+
3. Environment variables (via dotenv from .env files)
|
|
206
|
+
4. Tool defaults (lowest priority)
|
|
152
207
|
|
|
153
208
|
Args:
|
|
154
|
-
config (Dict, optional): Configuration overrides for
|
|
209
|
+
config (Dict, optional): Configuration overrides for OfficeTool.
|
|
210
|
+
**kwargs: Additional arguments passed to BaseTool (e.g., tool_name)
|
|
155
211
|
|
|
156
212
|
Raises:
|
|
157
213
|
ValueError: If config contains invalid settings.
|
|
158
214
|
"""
|
|
159
|
-
super().__init__(config)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
215
|
+
super().__init__(config, **kwargs)
|
|
216
|
+
|
|
217
|
+
# Configuration is automatically loaded by BaseTool into self._config_obj
|
|
218
|
+
# Access config via self._config_obj (BaseSettings instance)
|
|
219
|
+
self.config = self._config_obj if self._config_obj else self.Config()
|
|
220
|
+
|
|
221
|
+
# Configure Tika log path from config
|
|
222
|
+
os.environ["TIKA_LOG_PATH"] = self.config.tika_log_path
|
|
223
|
+
os.makedirs(self.config.tika_log_path, exist_ok=True)
|
|
224
|
+
|
|
166
225
|
self.logger = logging.getLogger(__name__)
|
|
167
226
|
if not self.logger.handlers:
|
|
168
227
|
handler = logging.StreamHandler()
|
|
169
|
-
handler.setFormatter(logging.Formatter(
|
|
228
|
+
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
|
|
170
229
|
self.logger.addHandler(handler)
|
|
171
230
|
self.logger.setLevel(logging.INFO)
|
|
172
231
|
|
|
@@ -182,28 +241,29 @@ class OfficeTool(BaseTool):
|
|
|
182
241
|
ContentValidationError: If document structure is invalid.
|
|
183
242
|
"""
|
|
184
243
|
try:
|
|
185
|
-
if file_type ==
|
|
244
|
+
if file_type == "docx":
|
|
186
245
|
doc = DocxDocument(file_path)
|
|
187
|
-
if not hasattr(doc,
|
|
246
|
+
if not hasattr(doc, "paragraphs"):
|
|
188
247
|
raise ContentValidationError("Invalid DOCX structure")
|
|
189
|
-
elif file_type ==
|
|
248
|
+
elif file_type == "pptx":
|
|
190
249
|
prs = Presentation(file_path)
|
|
191
|
-
if not hasattr(prs,
|
|
250
|
+
if not hasattr(prs, "slides"):
|
|
192
251
|
raise ContentValidationError("Invalid PPTX structure")
|
|
193
|
-
elif file_type ==
|
|
194
|
-
# Just validate that file can be read - don't care about return
|
|
252
|
+
elif file_type == "xlsx":
|
|
253
|
+
# Just validate that file can be read - don't care about return
|
|
254
|
+
# type
|
|
195
255
|
pd.read_excel(file_path, nrows=5)
|
|
196
|
-
elif file_type ==
|
|
256
|
+
elif file_type == "pdf":
|
|
197
257
|
with pdfplumber.open(file_path) as pdf:
|
|
198
258
|
if len(pdf.pages) == 0:
|
|
199
259
|
raise ContentValidationError("PDF has no pages")
|
|
200
|
-
elif file_type ==
|
|
260
|
+
elif file_type == "image":
|
|
201
261
|
img = Image.open(file_path)
|
|
202
262
|
img.verify() # Verify it's a valid image
|
|
203
263
|
else:
|
|
204
264
|
# Use tika as fallback for other formats
|
|
205
265
|
parsed = parser.from_file(file_path)
|
|
206
|
-
if not parsed or not parsed.get(
|
|
266
|
+
if not parsed or not parsed.get("content"):
|
|
207
267
|
raise ContentValidationError("Unable to parse file content")
|
|
208
268
|
except Exception as e:
|
|
209
269
|
raise ContentValidationError(f"Invalid {file_type.upper()} file: {str(e)}")
|
|
@@ -220,7 +280,7 @@ class OfficeTool(BaseTool):
|
|
|
220
280
|
"""
|
|
221
281
|
if not text:
|
|
222
282
|
return ""
|
|
223
|
-
return
|
|
283
|
+
return "".join(char for char in text if ord(char) >= 32 or char in "\n\r\t")
|
|
224
284
|
|
|
225
285
|
def _sanitize_table_data(self, table_data: Optional[List[List[str]]]) -> Optional[List[List[str]]]:
|
|
226
286
|
"""
|
|
@@ -252,7 +312,8 @@ class OfficeTool(BaseTool):
|
|
|
252
312
|
for item in data_list:
|
|
253
313
|
clean_item = {}
|
|
254
314
|
for k, v in item.items():
|
|
255
|
-
|
|
315
|
+
# Excel key limit with sanitization
|
|
316
|
+
clean_key = self._sanitize_text(str(k))[:255]
|
|
256
317
|
if isinstance(v, str):
|
|
257
318
|
clean_value = self._sanitize_text(v)[:32767] # Excel cell limit
|
|
258
319
|
else:
|
|
@@ -281,7 +342,7 @@ class OfficeTool(BaseTool):
|
|
|
281
342
|
page_text = page.extract_text()
|
|
282
343
|
if page_text:
|
|
283
344
|
text_content.append(page_text)
|
|
284
|
-
return
|
|
345
|
+
return "\n".join(text_content)
|
|
285
346
|
except Exception as e:
|
|
286
347
|
raise FileOperationError(f"Failed to extract PDF text: {str(e)}")
|
|
287
348
|
|
|
@@ -299,11 +360,11 @@ class OfficeTool(BaseTool):
|
|
|
299
360
|
FileOperationError: If image text extraction fails.
|
|
300
361
|
"""
|
|
301
362
|
try:
|
|
302
|
-
image = Image.open(file_path)
|
|
363
|
+
image: Image.Image = Image.open(file_path)
|
|
303
364
|
# Convert to RGB if necessary
|
|
304
|
-
if image.mode !=
|
|
305
|
-
image = image.convert(
|
|
306
|
-
text = pytesseract.image_to_string(image, lang=
|
|
365
|
+
if image.mode != "RGB":
|
|
366
|
+
image = image.convert("RGB")
|
|
367
|
+
text = pytesseract.image_to_string(image, lang="eng+chi_sim")
|
|
307
368
|
return text.strip()
|
|
308
369
|
except Exception as e:
|
|
309
370
|
raise FileOperationError(f"Failed to extract image text: {str(e)}")
|
|
@@ -323,7 +384,7 @@ class OfficeTool(BaseTool):
|
|
|
323
384
|
"""
|
|
324
385
|
try:
|
|
325
386
|
parsed = parser.from_file(file_path)
|
|
326
|
-
content = parsed.get(
|
|
387
|
+
content = parsed.get("content", "")
|
|
327
388
|
return content.strip() if content else ""
|
|
328
389
|
except Exception as e:
|
|
329
390
|
raise FileOperationError(f"Failed to extract text with Tika: {str(e)}")
|
|
@@ -344,19 +405,24 @@ class OfficeTool(BaseTool):
|
|
|
344
405
|
ContentValidationError: If document structure is invalid.
|
|
345
406
|
"""
|
|
346
407
|
try:
|
|
347
|
-
self._validate_document(file_path,
|
|
408
|
+
self._validate_document(file_path, "docx")
|
|
348
409
|
doc = DocxDocument(file_path)
|
|
349
410
|
paras = [p.text for p in doc.paragraphs if p.text.strip()]
|
|
350
411
|
tables = None
|
|
351
412
|
if include_tables:
|
|
352
413
|
tables = [[[cell.text for cell in row.cells] for row in table.rows] for table in doc.tables]
|
|
353
|
-
return {
|
|
414
|
+
return {"paragraphs": paras, "tables": tables}
|
|
354
415
|
except ContentValidationError:
|
|
355
416
|
raise
|
|
356
417
|
except Exception as e:
|
|
357
418
|
raise FileOperationError(f"Failed to read DOCX: {str(e)}")
|
|
358
419
|
|
|
359
|
-
def write_docx(
|
|
420
|
+
def write_docx(
|
|
421
|
+
self,
|
|
422
|
+
text: str,
|
|
423
|
+
output_path: str,
|
|
424
|
+
table_data: Optional[List[List[str]]] = None,
|
|
425
|
+
) -> Dict[str, Any]:
|
|
360
426
|
"""
|
|
361
427
|
Write content to a DOCX file.
|
|
362
428
|
|
|
@@ -375,9 +441,9 @@ class OfficeTool(BaseTool):
|
|
|
375
441
|
sanitized_text = self._sanitize_text(text)
|
|
376
442
|
sanitized_table_data = self._sanitize_table_data(table_data)
|
|
377
443
|
doc = DocxDocument()
|
|
378
|
-
style = doc.styles[
|
|
379
|
-
style.font.name = self.
|
|
380
|
-
style.font.size = Pt(self.
|
|
444
|
+
style = doc.styles["Normal"]
|
|
445
|
+
style.font.name = self.config.default_font
|
|
446
|
+
style.font.size = Pt(self.config.default_font_size)
|
|
381
447
|
for line in sanitized_text.splitlines():
|
|
382
448
|
doc.add_paragraph(line)
|
|
383
449
|
if sanitized_table_data and sanitized_table_data[0]:
|
|
@@ -389,9 +455,10 @@ class OfficeTool(BaseTool):
|
|
|
389
455
|
if j < len(row):
|
|
390
456
|
table.rows[i].cells[j].text = str(row[j])
|
|
391
457
|
else:
|
|
392
|
-
|
|
458
|
+
# Empty cell for missing data
|
|
459
|
+
table.rows[i].cells[j].text = ""
|
|
393
460
|
doc.save(output_path)
|
|
394
|
-
return {
|
|
461
|
+
return {"success": True, "file_path": output_path}
|
|
395
462
|
except Exception as e:
|
|
396
463
|
raise FileOperationError(f"Failed to write DOCX: {str(e)}")
|
|
397
464
|
|
|
@@ -410,12 +477,12 @@ class OfficeTool(BaseTool):
|
|
|
410
477
|
ContentValidationError: If document structure is invalid.
|
|
411
478
|
"""
|
|
412
479
|
try:
|
|
413
|
-
self._validate_document(file_path,
|
|
480
|
+
self._validate_document(file_path, "pptx")
|
|
414
481
|
prs = Presentation(file_path)
|
|
415
482
|
texts = []
|
|
416
483
|
for slide in prs.slides:
|
|
417
484
|
for shape in slide.shapes:
|
|
418
|
-
if hasattr(shape,
|
|
485
|
+
if hasattr(shape, "text"):
|
|
419
486
|
txt = shape.text.strip()
|
|
420
487
|
if txt:
|
|
421
488
|
texts.append(txt)
|
|
@@ -425,7 +492,12 @@ class OfficeTool(BaseTool):
|
|
|
425
492
|
except Exception as e:
|
|
426
493
|
raise FileOperationError(f"Failed to read PPTX: {str(e)}")
|
|
427
494
|
|
|
428
|
-
def write_pptx(
|
|
495
|
+
def write_pptx(
|
|
496
|
+
self,
|
|
497
|
+
slides: List[str],
|
|
498
|
+
output_path: str,
|
|
499
|
+
image_path: Optional[str] = None,
|
|
500
|
+
) -> Dict[str, Any]:
|
|
429
501
|
"""
|
|
430
502
|
Write content to a PPTX file.
|
|
431
503
|
|
|
@@ -462,7 +534,7 @@ class OfficeTool(BaseTool):
|
|
|
462
534
|
except Exception as img_err:
|
|
463
535
|
self.logger.warning(f"Could not add image to slide: {img_err}")
|
|
464
536
|
prs.save(output_path)
|
|
465
|
-
return {
|
|
537
|
+
return {"success": True, "file_path": output_path}
|
|
466
538
|
except Exception as e:
|
|
467
539
|
raise FileOperationError(f"Failed to write PPTX: {str(e)}")
|
|
468
540
|
|
|
@@ -482,27 +554,27 @@ class OfficeTool(BaseTool):
|
|
|
482
554
|
ContentValidationError: If document structure is invalid.
|
|
483
555
|
"""
|
|
484
556
|
try:
|
|
485
|
-
self._validate_document(file_path,
|
|
557
|
+
self._validate_document(file_path, "xlsx")
|
|
486
558
|
data = pd.read_excel(file_path, sheet_name=sheet_name)
|
|
487
|
-
|
|
559
|
+
|
|
488
560
|
# Handle different return types from pd.read_excel()
|
|
489
561
|
if isinstance(data, pd.DataFrame):
|
|
490
562
|
# Single sheet or specific sheet requested
|
|
491
|
-
return data.to_dict(orient=
|
|
563
|
+
return data.to_dict(orient="records")
|
|
492
564
|
elif isinstance(data, dict):
|
|
493
565
|
# Multiple sheets returned as dict - use the first sheet
|
|
494
566
|
first_sheet_name = list(data.keys())[0]
|
|
495
567
|
first_df = data[first_sheet_name]
|
|
496
|
-
return first_df.to_dict(orient=
|
|
568
|
+
return first_df.to_dict(orient="records")
|
|
497
569
|
else:
|
|
498
570
|
raise FileOperationError("Unexpected data type returned from Excel file")
|
|
499
|
-
|
|
571
|
+
|
|
500
572
|
except ContentValidationError:
|
|
501
573
|
raise
|
|
502
574
|
except Exception as e:
|
|
503
575
|
raise FileOperationError(f"Failed to read XLSX: {str(e)}")
|
|
504
576
|
|
|
505
|
-
def write_xlsx(self, data: List[Dict], output_path: str, sheet_name: str =
|
|
577
|
+
def write_xlsx(self, data: List[Dict], output_path: str, sheet_name: str = "Sheet1") -> Dict[str, Any]:
|
|
506
578
|
"""
|
|
507
579
|
Write content to an XLSX file.
|
|
508
580
|
|
|
@@ -523,7 +595,7 @@ class OfficeTool(BaseTool):
|
|
|
523
595
|
pd.DataFrame().to_excel(output_path, index=False, sheet_name=sheet_name)
|
|
524
596
|
else:
|
|
525
597
|
pd.DataFrame(sanitized_data).to_excel(output_path, index=False, sheet_name=sheet_name)
|
|
526
|
-
return {
|
|
598
|
+
return {"success": True, "file_path": output_path}
|
|
527
599
|
except Exception as e:
|
|
528
600
|
raise FileOperationError(f"Failed to write XLSX: {str(e)}")
|
|
529
601
|
|
|
@@ -545,38 +617,45 @@ class OfficeTool(BaseTool):
|
|
|
545
617
|
file_ext = os.path.splitext(file_path)[1].lower()
|
|
546
618
|
|
|
547
619
|
# Determine file type and validate
|
|
548
|
-
if file_ext ==
|
|
549
|
-
file_type =
|
|
550
|
-
elif file_ext ==
|
|
551
|
-
file_type =
|
|
552
|
-
elif file_ext ==
|
|
553
|
-
file_type =
|
|
554
|
-
elif file_ext ==
|
|
555
|
-
file_type =
|
|
556
|
-
elif file_ext in [
|
|
557
|
-
|
|
620
|
+
if file_ext == ".pdf":
|
|
621
|
+
file_type = "pdf"
|
|
622
|
+
elif file_ext == ".docx":
|
|
623
|
+
file_type = "docx"
|
|
624
|
+
elif file_ext == ".pptx":
|
|
625
|
+
file_type = "pptx"
|
|
626
|
+
elif file_ext == ".xlsx":
|
|
627
|
+
file_type = "xlsx"
|
|
628
|
+
elif file_ext in [
|
|
629
|
+
".png",
|
|
630
|
+
".jpg",
|
|
631
|
+
".jpeg",
|
|
632
|
+
".tiff",
|
|
633
|
+
".bmp",
|
|
634
|
+
".gif",
|
|
635
|
+
]:
|
|
636
|
+
file_type = "image"
|
|
558
637
|
else:
|
|
559
|
-
file_type =
|
|
638
|
+
file_type = "other"
|
|
560
639
|
|
|
561
640
|
# Validate document structure
|
|
562
641
|
self._validate_document(file_path, file_type)
|
|
563
642
|
|
|
564
643
|
# Extract text based on file type
|
|
565
|
-
if file_type ==
|
|
644
|
+
if file_type == "pdf":
|
|
566
645
|
return self._sanitize_text(self._extract_pdf_text(file_path))
|
|
567
|
-
elif file_type ==
|
|
646
|
+
elif file_type == "docx":
|
|
568
647
|
doc = DocxDocument(file_path)
|
|
569
648
|
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
|
570
|
-
return self._sanitize_text(
|
|
571
|
-
elif file_type ==
|
|
649
|
+
return self._sanitize_text("\n".join(paragraphs))
|
|
650
|
+
elif file_type == "pptx":
|
|
572
651
|
prs = Presentation(file_path)
|
|
573
652
|
texts = []
|
|
574
653
|
for slide in prs.slides:
|
|
575
654
|
for shape in slide.shapes:
|
|
576
|
-
if hasattr(shape,
|
|
655
|
+
if hasattr(shape, "text") and shape.text.strip():
|
|
577
656
|
texts.append(shape.text)
|
|
578
|
-
return self._sanitize_text(
|
|
579
|
-
elif file_type ==
|
|
657
|
+
return self._sanitize_text("\n".join(texts))
|
|
658
|
+
elif file_type == "xlsx":
|
|
580
659
|
data = pd.read_excel(file_path)
|
|
581
660
|
# Handle different return types from pd.read_excel()
|
|
582
661
|
if isinstance(data, pd.DataFrame):
|
|
@@ -587,8 +666,9 @@ class OfficeTool(BaseTool):
|
|
|
587
666
|
first_df = data[first_sheet_name]
|
|
588
667
|
return self._sanitize_text(first_df.to_string(index=False))
|
|
589
668
|
else:
|
|
590
|
-
|
|
591
|
-
|
|
669
|
+
# Fallback for unexpected data types
|
|
670
|
+
return self._sanitize_text("")
|
|
671
|
+
elif file_type == "image":
|
|
592
672
|
return self._sanitize_text(self._extract_image_text(file_path))
|
|
593
673
|
else:
|
|
594
674
|
# Use Tika as fallback for other formats
|