aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,688 @@
+ import os
+ import logging
+ import tempfile
+ from typing import Dict, Any, List, Optional, Union, Tuple
+ from enum import Enum
+ from dataclasses import dataclass
+
+ import pandas as pd
+ import numpy as np
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools import register_tool
+
+
+ # Enums for configuration options
+ class ScalerType(str, Enum):
+     STANDARD = "standard"
+     MINMAX = "minmax"
+     ROBUST = "robust"
+     NONE = "none"
+
+
+ # Exceptions
+ class StatsToolError(Exception):
+     pass
+
+
+ class FileOperationError(StatsToolError):
+     pass
+
+
+ class AnalysisError(StatsToolError):
+     pass
+
+
+ # Utility dataclass for structured statistical results
+ @dataclass
+ class StatsResult:
+     """Structured statistical result."""
+
+     test_type: str
+     statistic: float
+     pvalue: float
+     significant: bool
+     additional_metrics: Dict[str, Any]
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "test_type": self.test_type,
+             "statistic": self.statistic,
+             "pvalue": self.pvalue,
+             "significant": self.significant,
+             **self.additional_metrics,
+         }
+
+
+ @register_tool("stats")
+ class StatsTool(BaseTool):
+     """Enhanced statistical analysis tool for various data formats and operations."""
+
+     # Configuration schema
+     class Config(BaseModel):
+         """Configuration for the stats tool."""
+
+         model_config = ConfigDict(env_prefix="STATS_TOOL_")
+
+         max_file_size_mb: int = Field(default=200, description="Maximum file size in megabytes")
+         allowed_extensions: List[str] = Field(
+             default=[
+                 ".sav",
+                 ".sas7bdat",
+                 ".por",
+                 ".csv",
+                 ".xlsx",
+                 ".xls",
+                 ".json",
+                 ".parquet",
+                 ".feather",
+             ],
+             description="Allowed file extensions",
+         )
+
+     def __init__(self, config: Dict[str, Any] = None):
+         super().__init__(config)
+
+         # Parse configuration
+         self.config = self.Config(**(config or {}))
+
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             h = logging.StreamHandler()
+             h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+             self.logger.addHandler(h)
+             self.logger.setLevel(logging.INFO)
+
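A minimal instantiation sketch (the import path follows the file listing above; the override value here is illustrative, not a package default):

    from aiecs.tools.task_tools.stats_tool import StatsTool

    tool = StatsTool({"max_file_size_mb": 500})
    print(tool.config.max_file_size_mb)    # 500
    print(tool.config.allowed_extensions)  # ['.sav', '.sas7bdat', ...]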
+     def _load_data(
+         self,
+         file_path: str,
+         nrows: Optional[int] = None,
+         sheet_name: Optional[Union[str, int]] = 0,
+     ) -> pd.DataFrame:
+         """Load data from various file formats into a pandas DataFrame."""
+         try:
+             ext = os.path.splitext(file_path)[1].lower()
+             if ext in [".sav", ".sas7bdat", ".por"]:
+                 import pyreadstat
+
+                 if ext == ".sav":
+                     df, meta = pyreadstat.read_sav(file_path)
+                 elif ext == ".sas7bdat":
+                     df, meta = pyreadstat.read_sas7bdat(file_path)
+                 else:
+                     df, meta = pyreadstat.read_por(file_path)
+                 return df
+             elif ext == ".csv":
+                 return pd.read_csv(file_path, nrows=nrows)
+             elif ext in [".xlsx", ".xls"]:
+                 return pd.read_excel(file_path, sheet_name=sheet_name, nrows=nrows)
+             elif ext == ".json":
+                 return pd.read_json(file_path)
+             elif ext == ".parquet":
+                 return pd.read_parquet(file_path)
+             elif ext == ".feather":
+                 return pd.read_feather(file_path)
+             else:
+                 raise FileOperationError(f"Unsupported file format: {ext}")
+         except Exception as e:
+             raise FileOperationError(f"Error reading file {file_path}: {str(e)}")
+
+     def _validate_variables(self, df: pd.DataFrame, vars_to_check: List[str]) -> None:
+         """Validate that the requested variables exist in the dataset."""
+         if not vars_to_check:
+             return
+         available_vars = df.columns.tolist()
+         missing_vars = [var for var in vars_to_check if var not in available_vars]
+         if missing_vars:
+             raise FileOperationError(f"Variables not found in dataset: {', '.join(missing_vars)}")
+
+     def _interpret_effect_size(self, d: float) -> str:
+         """Interpret Cohen's d or Cramer's V using conventional thresholds."""
+         thresholds = [(0.2, "negligible"), (0.5, "small"), (0.8, "medium")]
+         for threshold, label in thresholds:
+             if abs(d) < threshold:
+                 return label
+         return "large"
+
+     def read_data(
+         self,
+         file_path: str,
+         nrows: Optional[int] = None,
+         sheet_name: Optional[Union[str, int]] = 0,
+     ) -> Dict[str, Any]:
+         """Read data from various file formats and summarize its shape."""
+         df = self._load_data(file_path, nrows, sheet_name)
+         return {
+             "variables": df.columns.tolist(),
+             "observations": len(df),
+             "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+             "memory_usage": df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
+             "preview": df.head(5).to_dict(orient="records"),
+         }
+
+     def describe(
+         self,
+         file_path: str,
+         variables: Optional[List[str]] = None,
+         include_percentiles: bool = False,
+         percentiles: Optional[List[float]] = None,
+     ) -> Dict[str, Any]:
+         """Generate descriptive statistics for variables."""
+         df = self._load_data(file_path)
+         if variables:
+             self._validate_variables(df, variables)
+             df = df[variables]
+         if include_percentiles and percentiles:
+             # Compute describe() once with the union of the default quartiles and the
+             # requested percentiles, so pandas labels every row consistently.
+             desc = df.describe(percentiles=sorted(set(percentiles) | {0.25, 0.5, 0.75}))
+         else:
+             desc = df.describe()
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+         if len(numeric_cols) > 0:
+             desc.loc["skew"] = df[numeric_cols].skew()
+             desc.loc["kurtosis"] = df[numeric_cols].kurt()
+         return {"statistics": desc.to_dict(), "summary": desc.to_string()}
+
+     def ttest(
+         self,
+         file_path: str,
+         var1: str,
+         var2: str,
+         equal_var: bool = True,
+         paired: bool = False,
+     ) -> Dict[str, Any]:
+         """Perform t-tests (independent or paired). Also handles the legacy ttest_ind."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, [var1, var2])
+         import scipy.stats as stats
+
+         a = df[var1].dropna().values
+         b = df[var2].dropna().values
+         if paired:
+             # Truncate to the shorter series so the pairs line up.
+             min_len = min(len(a), len(b))
+             stat, p = stats.ttest_rel(a[:min_len], b[:min_len])
+             test_type = "paired t-test"
+         else:
+             stat, p = stats.ttest_ind(a, b, equal_var=equal_var)
+             test_type = (
+                 "independent t-test (equal variance)"
+                 if equal_var
+                 else "Welch's t-test (unequal variance)"
+             )
+         mean_a = np.mean(a)
+         mean_b = np.mean(b)
+         std_a = np.std(a, ddof=1)
+         std_b = np.std(b, ddof=1)
+         if equal_var:
+             pooled_std = np.sqrt(
+                 ((len(a) - 1) * std_a**2 + (len(b) - 1) * std_b**2) / (len(a) + len(b) - 2)
+             )
+             cohens_d = (mean_a - mean_b) / pooled_std
+         else:
+             cohens_d = (mean_a - mean_b) / np.sqrt((std_a**2 + std_b**2) / 2)
+         return StatsResult(
+             test_type=test_type,
+             statistic=float(stat),
+             pvalue=float(p),
+             significant=p < 0.05,
+             additional_metrics={
+                 "cohens_d": float(cohens_d),
+                 "effect_size_interpretation": self._interpret_effect_size(cohens_d),
+                 "group1_mean": float(mean_a),
+                 "group2_mean": float(mean_b),
+                 "group1_std": float(std_a),
+                 "group2_std": float(std_b),
+                 "group1_n": int(len(a)),
+                 "group2_n": int(len(b)),
+             },
+         ).to_dict()
+
+     # Legacy method (now an alias)
+     ttest_ind = ttest
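For reference, the effect size computed above is Cohen's d; in the equal-variance branch the code uses the pooled standard deviation:

$$d = \frac{\bar{x}_1 - \bar{x}_2}{s_p}, \qquad s_p = \sqrt{\frac{(n_1 - 1)\,s_1^2 + (n_2 - 1)\,s_2^2}{n_1 + n_2 - 2}}$$

and in the Welch (unequal-variance) branch it falls back to $s = \sqrt{(s_1^2 + s_2^2)/2}$.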
+
+     def correlation(
+         self,
+         file_path: str,
+         variables: Optional[List[str]] = None,
+         var1: Optional[str] = None,
+         var2: Optional[str] = None,
+         method: str = "pearson",
+     ) -> Dict[str, Any]:
+         """Perform correlation analysis (a matrix over `variables` or a single pair)."""
+         df = self._load_data(file_path)
+         if variables:
+             self._validate_variables(df, variables)
+         if var1 and var2:
+             self._validate_variables(df, [var1, var2])
+         import scipy.stats as stats
+
+         if variables:
+             corr_matrix = df[variables].corr(method=method)
+             flat_corrs = [
+                 {
+                     "var1": v1,
+                     "var2": v2,
+                     "correlation": corr_matrix.loc[v1, v2],
+                     "abs_correlation": abs(corr_matrix.loc[v1, v2]),
+                 }
+                 for i, v1 in enumerate(variables)
+                 for j, v2 in enumerate(variables)
+                 if i < j
+             ]
+             flat_corrs.sort(key=lambda x: x["abs_correlation"], reverse=True)
+             return {"correlation_matrix": corr_matrix.to_dict(), "pairs": flat_corrs}
+         elif var1 and var2:
+             # Drop rows jointly so x and y stay aligned and equal in length.
+             pair = df[[var1, var2]].dropna()
+             x, y = pair[var1], pair[var2]
+             method_map = {
+                 "pearson": (stats.pearsonr, "Pearson's r"),
+                 "spearman": (stats.spearmanr, "Spearman's rho"),
+                 "kendall": (stats.kendalltau, "Kendall's tau"),
+             }
+             if method not in method_map:
+                 raise AnalysisError(f"Unsupported correlation method: {method}")
+             func, method_name = method_map[method]
+             corr, p = func(x, y)
+             return {
+                 "method": method_name,
+                 "correlation": float(corr),
+                 "pvalue": float(p),
+                 "significant": p < 0.05,
+                 "n": len(x),
+             }
+         raise AnalysisError("Provide either `variables` or both `var1` and `var2`")
+
+     def anova(
+         self,
+         file_path: str,
+         dependent: str,
+         factor: str,
+         post_hoc: bool = False,
+     ) -> Dict[str, Any]:
+         """Perform one-way ANOVA with optional post-hoc tests."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, [dependent, factor])
+         import scipy.stats as stats
+         from statsmodels.stats.multicomp import pairwise_tukeyhsd
+
+         # Drop rows jointly so dependent values stay aligned with their group labels
+         # (dropping each column separately would silently misalign the rows).
+         clean = df[[dependent, factor]].dropna()
+         groups = {name: group[dependent].values for name, group in clean.groupby(factor)}
+         stat, p = stats.f_oneway(*groups.values())
+         result = {
+             "F": float(stat),
+             "pvalue": float(p),
+             "significant": p < 0.05,
+             "groups": len(groups),
+             "group_sizes": {name: len(values) for name, values in groups.items()},
+             "group_means": {name: float(np.mean(values)) for name, values in groups.items()},
+             "group_std": {name: float(np.std(values, ddof=1)) for name, values in groups.items()},
+         }
+         if post_hoc:
+             tukey = pairwise_tukeyhsd(clean[dependent], clean[factor])
+             from itertools import combinations
+
+             group_pairs = list(combinations(tukey.groupsunique, 2))
+             tukey_results = [
+                 {
+                     "group1": str(group1),
+                     "group2": str(group2),
+                     "mean_difference": float(mean_diff),
+                     "p_adjusted": float(p_adj),
+                     "significant": bool(reject),
+                     "conf_lower": float(lower),
+                     "conf_upper": float(upper),
+                 }
+                 for (group1, group2), mean_diff, p_adj, lower, upper, reject in zip(
+                     group_pairs,
+                     tukey.meandiffs,
+                     tukey.pvalues,
+                     tukey.confint[:, 0],
+                     tukey.confint[:, 1],
+                     tukey.reject,
+                 )
+             ]
+             result["post_hoc"] = {
+                 "method": "Tukey HSD",
+                 "alpha": 0.05,  # standard significance level for Tukey HSD
+                 "comparisons": tukey_results,
+             }
+         return result
+
+     def chi_square(
+         self, file_path: str, var1: str, var2: str, correction: bool = True
+     ) -> Dict[str, Any]:
+         """Perform chi-square test of independence."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, [var1, var2])
+         import scipy.stats as stats
+
+         contingency = pd.crosstab(df[var1], df[var2])
+         chi2, p, dof, expected = stats.chi2_contingency(contingency, correction=correction)
+         n = contingency.sum().sum()
+         min_dim = min(contingency.shape) - 1
+         cramers_v = np.sqrt(chi2 / (n * min_dim))
+         return {
+             "chi2": float(chi2),
+             "pvalue": float(p),
+             "dof": int(dof),
+             "significant": p < 0.05,
+             "cramers_v": float(cramers_v),
+             "effect_size_interpretation": self._interpret_effect_size(cramers_v),
+             "contingency_table": contingency.to_dict(),
+             "expected_frequencies": pd.DataFrame(
+                 expected, index=contingency.index, columns=contingency.columns
+             ).to_dict(),
+             "test_type": (
+                 "Chi-square test with Yates correction" if correction else "Chi-square test"
+             ),
+         }
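The effect size reported here is Cramér's V, computed exactly as in the code:

$$V = \sqrt{\frac{\chi^2}{n\,\bigl(\min(r, c) - 1\bigr)}}$$

where $n$ is the total count in the contingency table and $r$, $c$ are its row and column counts.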
+
+     def non_parametric(
+         self,
+         file_path: str,
+         test_type: str,
+         variables: List[str],
+         grouping: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """Perform non-parametric statistical tests."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, variables + ([grouping] if grouping else []))
+         import scipy.stats as stats
+
+         if test_type == "mann_whitney":
+             if len(variables) != 2:
+                 raise AnalysisError("Mann-Whitney U test requires exactly 2 variables")
+             x = df[variables[0]].dropna().values
+             y = df[variables[1]].dropna().values
+             u_stat, p_value = stats.mannwhitneyu(x, y)
+             return StatsResult(
+                 test_type="Mann-Whitney U test",
+                 statistic=float(u_stat),
+                 pvalue=float(p_value),
+                 significant=p_value < 0.05,
+                 additional_metrics={
+                     "n1": len(x),
+                     "n2": len(y),
+                     "median1": float(np.median(x)),
+                     "median2": float(np.median(y)),
+                 },
+             ).to_dict()
+         elif test_type == "wilcoxon":
+             if len(variables) != 2:
+                 raise AnalysisError("Wilcoxon signed-rank test requires exactly 2 variables")
+             x = df[variables[0]].dropna().values
+             y = df[variables[1]].dropna().values
+             # Truncate to the shorter series so the pairs line up.
+             min_len = min(len(x), len(y))
+             x = x[:min_len]
+             y = y[:min_len]
+             w_stat, p_value = stats.wilcoxon(x, y)
+             return StatsResult(
+                 test_type="Wilcoxon signed-rank test",
+                 statistic=float(w_stat),
+                 pvalue=float(p_value),
+                 significant=p_value < 0.05,
+                 additional_metrics={
+                     "n_pairs": min_len,
+                     "median_difference": float(np.median(x - y)),
+                 },
+             ).to_dict()
+         elif test_type == "kruskal":
+             if not grouping:
+                 raise AnalysisError("Kruskal-Wallis test requires a grouping variable")
+             groups = {
+                 f"{var}_{name}": group[var].dropna().values
+                 for name, group in df.groupby(grouping)
+                 for var in variables
+             }
+             h_stat, p_value = stats.kruskal(*groups.values())
+             return StatsResult(
+                 test_type="Kruskal-Wallis H test",
+                 statistic=float(h_stat),
+                 pvalue=float(p_value),
+                 significant=p_value < 0.05,
+                 additional_metrics={
+                     "groups": len(groups),
+                     "group_sizes": {name: len(values) for name, values in groups.items()},
+                     "group_medians": {
+                         name: float(np.median(values)) for name, values in groups.items()
+                     },
+                 },
+             ).to_dict()
+         elif test_type == "friedman":
+             if len(variables) < 2:
+                 raise AnalysisError("Friedman test requires at least 2 variables")
+             data = df[variables].dropna()
+             chi2, p_value = stats.friedmanchisquare(*[data[var].values for var in variables])
+             return StatsResult(
+                 test_type="Friedman test",
+                 statistic=float(chi2),
+                 pvalue=float(p_value),
+                 significant=p_value < 0.05,
+                 additional_metrics={
+                     "n_measures": len(variables),
+                     "n_samples": len(data),
+                     "variable_medians": {var: float(np.median(data[var])) for var in variables},
+                 },
+             ).to_dict()
+         else:
+             raise AnalysisError(
+                 f"Unsupported non-parametric test type: {test_type}. "
+                 "Supported types: mann_whitney, wilcoxon, kruskal, friedman"
+             )
+
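A quick dispatch sketch for the tests above (file and column names are purely illustrative):

    tool = StatsTool()
    res = tool.non_parametric(
        file_path="survey.csv",      # hypothetical file
        test_type="kruskal",
        variables=["satisfaction"],  # hypothetical column
        grouping="region",           # hypothetical grouping column
    )
    print(res["test_type"], res["pvalue"], res["significant"])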
+     def regression(
+         self,
+         file_path: str,
+         formula: str,
+         regression_type: str = "ols",
+         robust: bool = False,
+         structured_output: bool = True,
+     ) -> Dict[str, Any]:
+         """Perform regression analysis with various models."""
+         df = self._load_data(file_path)
+         import statsmodels.formula.api as smf
+
+         model_map = {
+             "ols": smf.ols,
+             "logit": smf.logit,
+             "probit": smf.probit,
+             "poisson": smf.poisson,
+         }
+         if regression_type not in model_map:
+             raise AnalysisError(f"Unsupported regression type: {regression_type}")
+         try:
+             model = model_map[regression_type](formula=formula, data=df)
+             fit = model.fit(cov_type="HC3" if robust else "nonrobust")
+             if structured_output:
+                 result = {
+                     "model_type": regression_type,
+                     "formula": formula,
+                     "n_observations": int(fit.nobs),
+                     "r_squared": float(fit.rsquared) if hasattr(fit, "rsquared") else None,
+                     "adj_r_squared": (
+                         float(fit.rsquared_adj) if hasattr(fit, "rsquared_adj") else None
+                     ),
+                     "aic": float(fit.aic) if hasattr(fit, "aic") else None,
+                     "bic": float(fit.bic) if hasattr(fit, "bic") else None,
+                     "f_statistic": float(fit.fvalue) if hasattr(fit, "fvalue") else None,
+                     "f_pvalue": float(fit.f_pvalue) if hasattr(fit, "f_pvalue") else None,
+                     "log_likelihood": float(fit.llf) if hasattr(fit, "llf") else None,
+                     "coefficients": {
+                         var: {
+                             "coef": float(fit.params[var]),
+                             "std_err": float(fit.bse[var]),
+                             "t_value": (
+                                 float(fit.tvalues[var]) if hasattr(fit, "tvalues") else None
+                             ),
+                             "p_value": float(fit.pvalues[var]),
+                             "significant": fit.pvalues[var] < 0.05,
+                             "conf_lower": float(fit.conf_int().loc[var, 0]),
+                             "conf_upper": float(fit.conf_int().loc[var, 1]),
+                         }
+                         for var in fit.params.index
+                     },
+                 }
+                 return {
+                     "summary_text": fit.summary().as_text(),
+                     "structured": result,
+                 }
+             return {"summary": fit.summary().as_text()}
+         except Exception as e:
+             raise AnalysisError(f"Regression error: {str(e)}")
+
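A hedged usage sketch for the regression entry point (dataset and columns are illustrative; the formula string uses the Patsy syntax that statsmodels' formula API expects):

    tool = StatsTool()
    res = tool.regression(
        file_path="wages.csv",             # hypothetical file
        formula="wage ~ education + age",  # hypothetical columns
        regression_type="ols",
        robust=True,                       # HC3 robust standard errors
    )
    print(res["structured"]["r_squared"])
    print(res["structured"]["coefficients"]["education"])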
+     def time_series(
+         self,
+         file_path: str,
+         variable: str,
+         date_variable: Optional[str] = None,
+         model_type: str = "arima",
+         order: Optional[Tuple[int, int, int]] = (1, 1, 1),
+         seasonal_order: Optional[Tuple[int, int, int, int]] = None,
+         forecast_periods: int = 10,
+     ) -> Dict[str, Any]:
+         """Perform time series analysis."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, [variable] + ([date_variable] if date_variable else []))
+         from statsmodels.tsa.arima.model import ARIMA
+         from statsmodels.tsa.statespace.sarimax import SARIMAX
+
+         try:
+             ts_data = df[variable].dropna()
+             if date_variable and date_variable in df.columns:
+                 ts_data.index = df[date_variable]
+             if model_type == "arima":
+                 model = ARIMA(ts_data, order=order)
+                 fit = model.fit()
+                 model_type_name = "ARIMA"
+             elif model_type == "sarima":
+                 if not seasonal_order:
+                     raise AnalysisError("seasonal_order must be provided for SARIMA model")
+                 model = SARIMAX(ts_data, order=order, seasonal_order=seasonal_order)
+                 fit = model.fit(disp=False)
+                 model_type_name = "SARIMA"
+             else:
+                 raise AnalysisError(f"Unsupported time series model: {model_type}")
+             forecast = fit.forecast(steps=forecast_periods)
+             # Build a forecast index: daily dates after the last observation for a
+             # DatetimeIndex, positional labels otherwise (pd.date_range cannot take an
+             # integer start, which the previous fallback passed it).
+             if isinstance(ts_data.index, pd.DatetimeIndex):
+                 forecast_index = pd.date_range(
+                     start=ts_data.index[-1], periods=forecast_periods + 1, freq="D"
+                 )[1:]
+                 index_out = forecast_index.strftime("%Y-%m-%d").tolist()
+             else:
+                 index_out = list(range(len(ts_data), len(ts_data) + forecast_periods))
+             return {
+                 "model_type": model_type_name,
+                 "order": order,
+                 "seasonal_order": seasonal_order if model_type == "sarima" else None,
+                 "aic": float(fit.aic),
+                 "bic": float(fit.bic),
+                 "forecast": {
+                     "values": (
+                         forecast.tolist()
+                         if isinstance(forecast, np.ndarray)
+                         else forecast.values.tolist()
+                     ),
+                     "index": index_out,
+                 },
+                 "summary": str(fit.summary()),
+             }
+         except Exception as e:
+             raise AnalysisError(f"Time series analysis error: {str(e)}")
+
+     def preprocess(
+         self,
+         file_path: str,
+         variables: List[str],
+         operation: str,
+         scaler_type: ScalerType = ScalerType.STANDARD,
+         output_path: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """Preprocess data with various operations."""
+         df = self._load_data(file_path)
+         self._validate_variables(df, variables)
+         data = df[variables].copy()
+         result = {"operation": operation}
+         if operation == "scale":
+             from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
+
+             scaler_map = {
+                 ScalerType.STANDARD: (StandardScaler, "StandardScaler"),
+                 ScalerType.MINMAX: (MinMaxScaler, "MinMaxScaler"),
+                 ScalerType.ROBUST: (RobustScaler, "RobustScaler"),
+             }
+             if scaler_type == ScalerType.NONE:
+                 # ScalerType.NONE is in the enum but had no entry in scaler_map;
+                 # pass the data through unchanged instead of raising KeyError.
+                 scaled_df = data.copy()
+                 scaler_name = "none"
+             else:
+                 scaler_cls, scaler_name = scaler_map[scaler_type]
+                 scaler = scaler_cls()
+                 scaled_data = scaler.fit_transform(data)
+                 scaled_df = pd.DataFrame(
+                     scaled_data,
+                     columns=[f"{col}_scaled" for col in data.columns],
+                     index=data.index,
+                 )
+             result.update(
+                 {
+                     "scaler": scaler_name,
+                     "original_stats": data.describe().to_dict(),
+                     "scaled_stats": scaled_df.describe().to_dict(),
+                     "preview": scaled_df.head(5).to_dict(orient="records"),
+                 }
+             )
+             processed_df = scaled_df
+         elif operation == "impute":
+             imputed_df = data.copy()
+             numeric_cols = data.select_dtypes(include=[np.number]).columns
+             for col in numeric_cols:
+                 imputed_df[col] = data[col].fillna(data[col].mean())
+             cat_cols = data.select_dtypes(exclude=[np.number]).columns
+             for col in cat_cols:
+                 imputed_df[col] = data[col].fillna(
+                     data[col].mode()[0] if not data[col].mode().empty else None
+                 )
+             result.update(
+                 {
+                     "imputation_method": {
+                         "numeric": "mean",
+                         "categorical": "mode",
+                     },
+                     "missing_counts_before": data.isna().sum().to_dict(),
+                     "missing_counts_after": imputed_df.isna().sum().to_dict(),
+                     "preview": imputed_df.head(5).to_dict(orient="records"),
+                 }
+             )
+             processed_df = imputed_df
+         else:
+             # Any other operation previously fell through to an undefined
+             # processed_df (NameError); fail with a clear message instead.
+             raise AnalysisError(f"Unsupported preprocess operation: {operation}")
+         if output_path:
+             output_path = (
+                 os.path.abspath(output_path)
+                 if os.path.isabs(output_path)
+                 else os.path.join(tempfile.gettempdir(), "stats_outputs", output_path)
+             )
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             processed_df.to_csv(output_path)
+             result["output_file"] = output_path
+         return result
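Taken together, a minimal end-to-end sketch of the tool (file and column names are illustrative, not shipped with the package):

    tool = StatsTool()
    print(tool.read_data("data.csv")["variables"])         # inspect columns
    print(tool.describe("data.csv", variables=["score"]))  # summary statistics
    print(tool.ttest("data.csv", var1="score_a", var2="score_b", equal_var=False))
    print(tool.preprocess(
        "data.csv",
        variables=["score"],
        operation="scale",
        scaler_type=ScalerType.MINMAX,
        output_path="scaled.csv",  # written under the system temp dir when relative
    ))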