aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,564 @@
1
+ """
2
+ Data Loader Tool - Universal data loading from multiple file formats
3
+
4
+ This tool provides comprehensive data loading capabilities with:
5
+ - Auto-detection of file formats
6
+ - Multiple loading strategies (full, streaming, chunked, lazy)
7
+ - Data quality validation on load
8
+ - Schema inference and validation
9
+ - Support for CSV, Excel, JSON, Parquet, and other formats
10
+ """
11
+
12
+ import os
13
+ import logging
14
+ from typing import Dict, Any, List, Optional, Union, Iterator
15
+ from enum import Enum
16
+ from pathlib import Path
17
+
18
+ import pandas as pd
19
+ from pydantic import BaseModel, Field, ConfigDict
20
+
21
+ from aiecs.tools.base_tool import BaseTool
22
+ from aiecs.tools import register_tool
23
+
24
+
25
class DataSourceType(str, Enum):
    """Enumeration of file formats the loader understands.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. ``DataSourceType.CSV == "csv"``).
    """

    CSV = "csv"          # comma-separated values
    EXCEL = "excel"      # .xlsx / .xls workbooks
    JSON = "json"        # JSON documents / JSON-lines
    PARQUET = "parquet"  # Apache Parquet columnar files
    FEATHER = "feather"  # Arrow Feather files
    HDF5 = "hdf5"        # HDF5 stores (.h5 / .hdf)
    STATA = "stata"      # Stata .dta datasets
    SAS = "sas"          # SAS .sas7bdat datasets
    SPSS = "spss"        # SPSS .sav datasets
    AUTO = "auto"        # ask the tool to detect the format itself
38
+
39
+
40
class LoadStrategy(str, Enum):
    """Enumeration of the strategies ``load_data`` can use.

    String-valued so a member compares equal to its raw value
    (e.g. ``LoadStrategy.CHUNKED == "chunked"``).
    """

    FULL_LOAD = "full_load"      # read everything into memory at once
    STREAMING = "streaming"      # hand back a chunk iterator
    CHUNKED = "chunked"          # read in chunks, then combine
    LAZY = "lazy"                # defer reading until accessed
    INCREMENTAL = "incremental"  # load only new data since last run
48
+
49
+
50
class DataLoaderError(Exception):
    """Root of the DataLoader exception hierarchy."""


class FileFormatError(DataLoaderError):
    """The file's format is unsupported or could not be parsed."""


class SchemaValidationError(DataLoaderError):
    """Loaded data does not match the expected schema."""


class DataQualityError(DataLoaderError):
    """Loaded data failed a data-quality check."""
64
+
65
+
66
+ @register_tool("data_loader")
67
+ class DataLoaderTool(BaseTool):
68
+ """
69
+ Universal data loading tool that can:
70
+ 1. Load data from multiple file formats
71
+ 2. Auto-detect data formats and schemas
72
+ 3. Handle large datasets with streaming
73
+ 4. Validate data quality on load
74
+
75
+ Integrates with pandas_tool for core data operations.
76
+ """
77
+
78
    # Configuration schema
    class Config(BaseModel):
        """Configuration for the data loader tool"""

        # NOTE(review): env_prefix is a pydantic-settings (BaseSettings) option;
        # on a plain BaseModel it is ignored, so DATA_LOADER_* environment
        # variables are not actually read here — confirm intent.
        model_config = ConfigDict(env_prefix="DATA_LOADER_")

        # Soft cap: load_data() only logs a warning when this is exceeded.
        max_file_size_mb: int = Field(default=500, description="Maximum file size in megabytes")
        # Used by CHUNKED and STREAMING strategies when no chunk_size is given.
        default_chunk_size: int = Field(
            default=10000, description="Default chunk size for chunked loading"
        )
        # Declared budget; not enforced anywhere in the visible code.
        max_memory_usage_mb: int = Field(
            default=2000, description="Maximum memory usage in megabytes"
        )
        # Gates the schema check inside load_data().
        enable_schema_inference: bool = Field(
            default=True,
            description="Whether to enable automatic schema inference",
        )
        # Gates the quality report generated for DataFrame results.
        enable_quality_validation: bool = Field(
            default=True,
            description="Whether to enable data quality validation",
        )
        # Fallback encoding when callers do not pass one explicitly.
        default_encoding: str = Field(
            default="utf-8",
            description="Default text encoding for file operations",
        )
103
+
104
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
105
+ """
106
+ Initialize DataLoaderTool with settings.
107
+
108
+ Args:
109
+ config: Optional configuration overrides
110
+ """
111
+ super().__init__(config)
112
+
113
+ # Parse configuration
114
+ self.config = self.Config(**(config or {}))
115
+
116
+ self.logger = logging.getLogger(__name__)
117
+ if not self.logger.handlers:
118
+ handler = logging.StreamHandler()
119
+ handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
120
+ self.logger.addHandler(handler)
121
+ self.logger.setLevel(logging.INFO)
122
+
123
+ # Initialize external tools
124
+ self._init_external_tools()
125
+
126
+ def _init_external_tools(self):
127
+ """Initialize external task tools"""
128
+ self.external_tools = {}
129
+
130
+ # Initialize PandasTool for data operations
131
+ try:
132
+ from aiecs.tools.task_tools.pandas_tool import PandasTool
133
+
134
+ self.external_tools["pandas"] = PandasTool()
135
+ self.logger.info("PandasTool initialized successfully")
136
+ except ImportError:
137
+ self.logger.warning("PandasTool not available")
138
+ self.external_tools["pandas"] = None
139
+
140
    # Schema definitions

    class LoadDataSchema(BaseModel):
        """Schema for load_data operation"""

        # Filesystem path; existence is checked inside load_data().
        source: str = Field(description="Path to data source file")
        source_type: Optional[DataSourceType] = Field(
            default=DataSourceType.AUTO, description="Data source type"
        )
        strategy: LoadStrategy = Field(
            default=LoadStrategy.FULL_LOAD, description="Loading strategy"
        )
        # NOTE(review): field is named data_schema while load_data() takes a
        # parameter named `schema` — confirm the dispatch layer renames it.
        data_schema: Optional[Dict[str, Any]] = Field(
            default=None, description="Expected schema for validation"
        )
        validation_rules: Optional[Dict[str, Any]] = Field(
            default=None, description="Data quality validation rules"
        )
        # None means load all rows.
        nrows: Optional[int] = Field(default=None, description="Number of rows to load")
        # Falls back to Config.default_chunk_size when None.
        chunk_size: Optional[int] = Field(
            default=None, description="Chunk size for chunked loading"
        )
        # Falls back to Config.default_encoding when None.
        encoding: Optional[str] = Field(default=None, description="File encoding")
162
+
163
    class DetectFormatSchema(BaseModel):
        """Schema for detect_format operation"""

        # Format detection is extension-based, so only the path is needed.
        source: str = Field(description="Path to data source file")
167
+
168
    class ValidateSchemaSchema(BaseModel):
        """Schema for validate_schema operation"""

        # A single record (dict) or a list of records; converted to a
        # DataFrame inside validate_schema().
        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to validate")
        # Expected column names are read from data_schema["columns"].
        data_schema: Dict[str, Any] = Field(description="Expected schema")
173
+
174
    class StreamDataSchema(BaseModel):
        """Schema for stream_data operation"""

        source: str = Field(description="Path to data source file")
        # Rows per chunk yielded by the streaming iterator.
        chunk_size: int = Field(default=10000, description="Chunk size for streaming")
        # Only CSV and JSON-lines support streaming in stream_data().
        source_type: Optional[DataSourceType] = Field(
            default=DataSourceType.AUTO, description="Data source type"
        )
182
+
183
+ def load_data(
184
+ self,
185
+ source: str,
186
+ source_type: DataSourceType = DataSourceType.AUTO,
187
+ strategy: LoadStrategy = LoadStrategy.FULL_LOAD,
188
+ schema: Optional[Dict[str, Any]] = None,
189
+ validation_rules: Optional[Dict[str, Any]] = None,
190
+ nrows: Optional[int] = None,
191
+ chunk_size: Optional[int] = None,
192
+ encoding: Optional[str] = None,
193
+ ) -> Dict[str, Any]:
194
+ """
195
+ Load data from source with automatic format detection.
196
+
197
+ Args:
198
+ source: Path to data source file
199
+ source_type: Type of data source (auto-detected if not specified)
200
+ strategy: Loading strategy to use
201
+ schema: Expected schema for validation
202
+ validation_rules: Data quality validation rules
203
+ nrows: Number of rows to load (None for all)
204
+ chunk_size: Chunk size for chunked loading
205
+ encoding: File encoding
206
+
207
+ Returns:
208
+ Dict containing:
209
+ - data: Loaded DataFrame or data structure
210
+ - metadata: Metadata about loaded data
211
+ - quality_report: Quality assessment results
212
+
213
+ Raises:
214
+ DataLoaderError: If loading fails
215
+ FileFormatError: If format is unsupported
216
+ """
217
+ try:
218
+ # Validate source exists
219
+ if not os.path.exists(source):
220
+ raise DataLoaderError(f"Source file not found: {source}")
221
+
222
+ # Detect format if auto
223
+ if source_type == DataSourceType.AUTO:
224
+ source_type = self._detect_format(source)
225
+
226
+ # Check file size
227
+ file_size_mb = os.path.getsize(source) / (1024 * 1024)
228
+ if file_size_mb > self.config.max_file_size_mb:
229
+ self.logger.warning(f"File size {file_size_mb:.2f}MB exceeds recommended limit")
230
+
231
+ # Load data based on strategy
232
+ if strategy == LoadStrategy.FULL_LOAD:
233
+ data = self._load_full(source, source_type, nrows, encoding)
234
+ elif strategy == LoadStrategy.CHUNKED:
235
+ data = self._load_chunked(
236
+ source,
237
+ source_type,
238
+ chunk_size or self.config.default_chunk_size,
239
+ encoding,
240
+ )
241
+ elif strategy == LoadStrategy.STREAMING:
242
+ data = self._load_streaming(
243
+ source,
244
+ source_type,
245
+ chunk_size or self.config.default_chunk_size,
246
+ encoding,
247
+ )
248
+ elif strategy == LoadStrategy.LAZY:
249
+ data = self._load_lazy(source, source_type, encoding)
250
+ else:
251
+ raise DataLoaderError(f"Unsupported loading strategy: {strategy}")
252
+
253
+ # Generate metadata
254
+ metadata = self._generate_metadata(data, source, source_type)
255
+
256
+ # Validate schema if provided
257
+ if schema and self.config.enable_schema_inference:
258
+ schema_valid = self._validate_schema_internal(data, schema)
259
+ metadata["schema_valid"] = schema_valid
260
+
261
+ # Validate quality if enabled
262
+ quality_report = {}
263
+ if self.config.enable_quality_validation and isinstance(data, pd.DataFrame):
264
+ quality_report = self._validate_quality(data, validation_rules)
265
+
266
+ self.logger.info(f"Successfully loaded data from {source}")
267
+
268
+ return {
269
+ "data": data,
270
+ "metadata": metadata,
271
+ "quality_report": quality_report,
272
+ "source": source,
273
+ "source_type": source_type.value,
274
+ "strategy": strategy.value,
275
+ }
276
+
277
+ except Exception as e:
278
+ self.logger.error(f"Error loading data from {source}: {e}")
279
+ raise DataLoaderError(f"Failed to load data: {e}")
280
+
281
+ def detect_format(self, source: str) -> Dict[str, Any]:
282
+ """
283
+ Detect file format from source.
284
+
285
+ Args:
286
+ source: Path to data source file
287
+
288
+ Returns:
289
+ Dict containing detected format information
290
+ """
291
+ try:
292
+ detected_type = self._detect_format(source)
293
+
294
+ return {
295
+ "source": source,
296
+ "detected_type": detected_type.value,
297
+ "file_extension": Path(source).suffix.lower(),
298
+ "confidence": "high",
299
+ }
300
+ except Exception as e:
301
+ self.logger.error(f"Error detecting format: {e}")
302
+ raise FileFormatError(f"Failed to detect format: {e}")
303
+
304
+ def validate_schema(
305
+ self,
306
+ data: Union[Dict[str, Any], List[Dict[str, Any]]],
307
+ schema: Dict[str, Any],
308
+ ) -> Dict[str, Any]:
309
+ """
310
+ Validate data against expected schema.
311
+
312
+ Args:
313
+ data: Data to validate
314
+ schema: Expected schema definition
315
+
316
+ Returns:
317
+ Dict containing validation results
318
+ """
319
+ try:
320
+ # Convert to DataFrame if needed
321
+ if isinstance(data, list):
322
+ df = pd.DataFrame(data)
323
+ elif isinstance(data, dict):
324
+ df = pd.DataFrame([data])
325
+ else:
326
+ df = data
327
+
328
+ is_valid = self._validate_schema_internal(df, schema)
329
+
330
+ issues = []
331
+ if not is_valid:
332
+ # Check column presence
333
+ expected_columns = set(schema.get("columns", {}).keys())
334
+ actual_columns = set(df.columns)
335
+ missing = expected_columns - actual_columns
336
+ extra = actual_columns - expected_columns
337
+
338
+ if missing:
339
+ issues.append(f"Missing columns: {missing}")
340
+ if extra:
341
+ issues.append(f"Extra columns: {extra}")
342
+
343
+ return {
344
+ "valid": is_valid,
345
+ "issues": issues,
346
+ "expected_columns": list(schema.get("columns", {}).keys()),
347
+ "actual_columns": list(df.columns),
348
+ }
349
+
350
+ except Exception as e:
351
+ self.logger.error(f"Error validating schema: {e}")
352
+ raise SchemaValidationError(f"Schema validation failed: {e}")
353
+
354
+ def stream_data(
355
+ self,
356
+ source: str,
357
+ chunk_size: int = 10000,
358
+ source_type: DataSourceType = DataSourceType.AUTO,
359
+ ) -> Dict[str, Any]:
360
+ """
361
+ Stream data in chunks for large files.
362
+
363
+ Args:
364
+ source: Path to data source file
365
+ chunk_size: Size of each chunk
366
+ source_type: Type of data source
367
+
368
+ Returns:
369
+ Dict containing streaming iterator information
370
+ """
371
+ try:
372
+ if source_type == DataSourceType.AUTO:
373
+ source_type = self._detect_format(source)
374
+
375
+ # Create iterator based on format
376
+ if source_type == DataSourceType.CSV:
377
+ iterator = pd.read_csv(source, chunksize=chunk_size)
378
+ elif source_type == DataSourceType.JSON:
379
+ iterator = pd.read_json(source, lines=True, chunksize=chunk_size)
380
+ else:
381
+ raise FileFormatError(f"Streaming not supported for format: {source_type}")
382
+
383
+ return {
384
+ "iterator": iterator,
385
+ "chunk_size": chunk_size,
386
+ "source_type": source_type.value,
387
+ "message": "Streaming iterator created successfully",
388
+ }
389
+
390
+ except Exception as e:
391
+ self.logger.error(f"Error creating stream: {e}")
392
+ raise DataLoaderError(f"Failed to create stream: {e}")
393
+
394
+ # Internal helper methods
395
+
396
+ def _detect_format(self, source: str) -> DataSourceType:
397
+ """Detect file format from extension"""
398
+ ext = Path(source).suffix.lower()
399
+
400
+ format_map = {
401
+ ".csv": DataSourceType.CSV,
402
+ ".xlsx": DataSourceType.EXCEL,
403
+ ".xls": DataSourceType.EXCEL,
404
+ ".json": DataSourceType.JSON,
405
+ ".parquet": DataSourceType.PARQUET,
406
+ ".feather": DataSourceType.FEATHER,
407
+ ".h5": DataSourceType.HDF5,
408
+ ".hdf": DataSourceType.HDF5,
409
+ ".dta": DataSourceType.STATA,
410
+ ".sas7bdat": DataSourceType.SAS,
411
+ ".sav": DataSourceType.SPSS,
412
+ }
413
+
414
+ detected = format_map.get(ext)
415
+ if not detected:
416
+ raise FileFormatError(f"Unsupported file format: {ext}")
417
+
418
+ return detected
419
+
420
+ def _load_full(
421
+ self,
422
+ source: str,
423
+ source_type: DataSourceType,
424
+ nrows: Optional[int],
425
+ encoding: Optional[str],
426
+ ) -> pd.DataFrame:
427
+ """Load entire dataset into memory"""
428
+ encoding = encoding or self.config.default_encoding
429
+
430
+ if source_type == DataSourceType.CSV:
431
+ return pd.read_csv(source, nrows=nrows, encoding=encoding)
432
+ elif source_type == DataSourceType.EXCEL:
433
+ return pd.read_excel(source, nrows=nrows)
434
+ elif source_type == DataSourceType.JSON:
435
+ return pd.read_json(source, nrows=nrows, encoding=encoding)
436
+ elif source_type == DataSourceType.PARQUET:
437
+ return pd.read_parquet(source)
438
+ elif source_type == DataSourceType.FEATHER:
439
+ return pd.read_feather(source)
440
+ elif source_type == DataSourceType.HDF5:
441
+ return pd.read_hdf(source)
442
+ elif source_type == DataSourceType.STATA:
443
+ df = pd.read_stata(source)
444
+ if nrows:
445
+ return df.head(nrows)
446
+ return df
447
+ elif source_type == DataSourceType.SAS:
448
+ return pd.read_sas(source)
449
+ elif source_type == DataSourceType.SPSS:
450
+ try:
451
+ import pyreadstat
452
+
453
+ df, meta = pyreadstat.read_sav(source)
454
+ return df
455
+ except ImportError:
456
+ raise DataLoaderError("pyreadstat required for SPSS files")
457
+ else:
458
+ raise FileFormatError(f"Unsupported format for full load: {source_type}")
459
+
460
+ def _load_chunked(
461
+ self,
462
+ source: str,
463
+ source_type: DataSourceType,
464
+ chunk_size: int,
465
+ encoding: Optional[str],
466
+ ) -> pd.DataFrame:
467
+ """Load data in chunks and combine"""
468
+ encoding = encoding or self.config.default_encoding
469
+ chunks = []
470
+
471
+ if source_type == DataSourceType.CSV:
472
+ for chunk in pd.read_csv(source, chunksize=chunk_size, encoding=encoding):
473
+ chunks.append(chunk)
474
+ elif source_type == DataSourceType.JSON:
475
+ for chunk in pd.read_json(source, lines=True, chunksize=chunk_size, encoding=encoding):
476
+ chunks.append(chunk)
477
+ else:
478
+ raise FileFormatError(f"Chunked loading not supported for: {source_type}")
479
+
480
+ return pd.concat(chunks, ignore_index=True)
481
+
482
+ def _load_streaming(
483
+ self,
484
+ source: str,
485
+ source_type: DataSourceType,
486
+ chunk_size: int,
487
+ encoding: Optional[str],
488
+ ) -> Iterator[pd.DataFrame]:
489
+ """Create streaming iterator"""
490
+ encoding = encoding or self.config.default_encoding
491
+
492
+ if source_type == DataSourceType.CSV:
493
+ return pd.read_csv(source, chunksize=chunk_size, encoding=encoding)
494
+ elif source_type == DataSourceType.JSON:
495
+ return pd.read_json(source, lines=True, chunksize=chunk_size, encoding=encoding)
496
+ else:
497
+ raise FileFormatError(f"Streaming not supported for: {source_type}")
498
+
499
+ def _load_lazy(self, source: str, source_type: DataSourceType, encoding: Optional[str]) -> Any:
500
+ """Create lazy loading wrapper"""
501
+ # For now, return full load with warning
502
+ self.logger.warning("Lazy loading not fully implemented, using full load")
503
+ return self._load_full(source, source_type, None, encoding)
504
+
505
+ def _generate_metadata(
506
+ self, data: Any, source: str, source_type: DataSourceType
507
+ ) -> Dict[str, Any]:
508
+ """Generate metadata about loaded data"""
509
+ if isinstance(data, pd.DataFrame):
510
+ return {
511
+ "rows": len(data),
512
+ "columns": len(data.columns),
513
+ "column_names": list(data.columns),
514
+ "dtypes": {col: str(dtype) for col, dtype in data.dtypes.items()},
515
+ "memory_usage_mb": data.memory_usage(deep=True).sum() / (1024 * 1024),
516
+ "file_size_mb": os.path.getsize(source) / (1024 * 1024),
517
+ }
518
+ else:
519
+ return {
520
+ "type": str(type(data)),
521
+ "file_size_mb": os.path.getsize(source) / (1024 * 1024),
522
+ }
523
+
524
+ def _validate_schema_internal(self, data: pd.DataFrame, schema: Dict[str, Any]) -> bool:
525
+ """Internal schema validation"""
526
+ if "columns" not in schema:
527
+ return True
528
+
529
+ expected_columns = set(schema["columns"].keys())
530
+ actual_columns = set(data.columns)
531
+
532
+ return expected_columns.issubset(actual_columns)
533
+
534
+ def _validate_quality(
535
+ self, data: pd.DataFrame, validation_rules: Optional[Dict[str, Any]]
536
+ ) -> Dict[str, Any]:
537
+ """Validate data quality"""
538
+ quality_report = {
539
+ "total_rows": len(data),
540
+ "total_columns": len(data.columns),
541
+ "missing_values": data.isnull().sum().to_dict(),
542
+ "duplicate_rows": data.duplicated().sum(),
543
+ "quality_score": 1.0,
544
+ }
545
+
546
+ # Calculate quality score
547
+ missing_ratio = (
548
+ data.isnull().sum().sum() / (len(data) * len(data.columns)) if len(data) > 0 else 0
549
+ )
550
+ duplicate_ratio = quality_report["duplicate_rows"] / len(data) if len(data) > 0 else 0
551
+
552
+ quality_score = 1.0 - (missing_ratio * 0.5 + duplicate_ratio * 0.5)
553
+ quality_report["quality_score"] = max(0.0, min(1.0, quality_score))
554
+
555
+ # Add issues list
556
+ issues = []
557
+ if missing_ratio > 0.1:
558
+ issues.append(f"High missing value ratio: {missing_ratio:.2%}")
559
+ if duplicate_ratio > 0.05:
560
+ issues.append(f"High duplicate ratio: {duplicate_ratio:.2%}")
561
+
562
+ quality_report["issues"] = issues
563
+
564
+ return quality_report