aiecs-1.0.1-py3-none-any.whl → aiecs-1.7.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic.

Files changed (340)
  1. aiecs/__init__.py +13 -16
  2. aiecs/__main__.py +7 -7
  3. aiecs/aiecs_client.py +269 -75
  4. aiecs/application/executors/operation_executor.py +79 -54
  5. aiecs/application/knowledge_graph/__init__.py +7 -0
  6. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  7. aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
  8. aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
  11. aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
  12. aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
  13. aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
  14. aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
  15. aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
  16. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  17. aiecs/application/knowledge_graph/extractors/base.py +98 -0
  18. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
  19. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
  20. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
  21. aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
  22. aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
  23. aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
  24. aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
  25. aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
  26. aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
  27. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
  28. aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
  29. aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
  30. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
  31. aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
  32. aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
  33. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
  34. aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
  35. aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
  36. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  37. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
  38. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
  39. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  40. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
  41. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  42. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  43. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
  44. aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
  45. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
  46. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  47. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  48. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
  49. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
  50. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
  51. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
  52. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
  53. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
  54. aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
  55. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
  56. aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
  57. aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
  58. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
  59. aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
  60. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  61. aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
  62. aiecs/application/knowledge_graph/search/reranker.py +293 -0
  63. aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
  64. aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
  65. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  66. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
  67. aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
  68. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  69. aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
  70. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  71. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
  72. aiecs/common/__init__.py +9 -0
  73. aiecs/common/knowledge_graph/__init__.py +17 -0
  74. aiecs/common/knowledge_graph/runnable.py +471 -0
  75. aiecs/config/__init__.py +20 -5
  76. aiecs/config/config.py +762 -31
  77. aiecs/config/graph_config.py +131 -0
  78. aiecs/config/tool_config.py +435 -0
  79. aiecs/core/__init__.py +29 -13
  80. aiecs/core/interface/__init__.py +2 -2
  81. aiecs/core/interface/execution_interface.py +22 -22
  82. aiecs/core/interface/storage_interface.py +37 -88
  83. aiecs/core/registry/__init__.py +31 -0
  84. aiecs/core/registry/service_registry.py +92 -0
  85. aiecs/domain/__init__.py +270 -1
  86. aiecs/domain/agent/__init__.py +191 -0
  87. aiecs/domain/agent/base_agent.py +3949 -0
  88. aiecs/domain/agent/exceptions.py +99 -0
  89. aiecs/domain/agent/graph_aware_mixin.py +569 -0
  90. aiecs/domain/agent/hybrid_agent.py +1731 -0
  91. aiecs/domain/agent/integration/__init__.py +29 -0
  92. aiecs/domain/agent/integration/context_compressor.py +216 -0
  93. aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
  94. aiecs/domain/agent/integration/protocols.py +281 -0
  95. aiecs/domain/agent/integration/retry_policy.py +218 -0
  96. aiecs/domain/agent/integration/role_config.py +213 -0
  97. aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
  98. aiecs/domain/agent/lifecycle.py +291 -0
  99. aiecs/domain/agent/llm_agent.py +692 -0
  100. aiecs/domain/agent/memory/__init__.py +12 -0
  101. aiecs/domain/agent/memory/conversation.py +1124 -0
  102. aiecs/domain/agent/migration/__init__.py +14 -0
  103. aiecs/domain/agent/migration/conversion.py +163 -0
  104. aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
  105. aiecs/domain/agent/models.py +894 -0
  106. aiecs/domain/agent/observability.py +479 -0
  107. aiecs/domain/agent/persistence.py +449 -0
  108. aiecs/domain/agent/prompts/__init__.py +29 -0
  109. aiecs/domain/agent/prompts/builder.py +159 -0
  110. aiecs/domain/agent/prompts/formatters.py +187 -0
  111. aiecs/domain/agent/prompts/template.py +255 -0
  112. aiecs/domain/agent/registry.py +253 -0
  113. aiecs/domain/agent/tool_agent.py +444 -0
  114. aiecs/domain/agent/tools/__init__.py +15 -0
  115. aiecs/domain/agent/tools/schema_generator.py +377 -0
  116. aiecs/domain/community/__init__.py +155 -0
  117. aiecs/domain/community/agent_adapter.py +469 -0
  118. aiecs/domain/community/analytics.py +432 -0
  119. aiecs/domain/community/collaborative_workflow.py +648 -0
  120. aiecs/domain/community/communication_hub.py +634 -0
  121. aiecs/domain/community/community_builder.py +320 -0
  122. aiecs/domain/community/community_integration.py +796 -0
  123. aiecs/domain/community/community_manager.py +803 -0
  124. aiecs/domain/community/decision_engine.py +849 -0
  125. aiecs/domain/community/exceptions.py +231 -0
  126. aiecs/domain/community/models/__init__.py +33 -0
  127. aiecs/domain/community/models/community_models.py +234 -0
  128. aiecs/domain/community/resource_manager.py +461 -0
  129. aiecs/domain/community/shared_context_manager.py +589 -0
  130. aiecs/domain/context/__init__.py +40 -10
  131. aiecs/domain/context/context_engine.py +1910 -0
  132. aiecs/domain/context/conversation_models.py +87 -53
  133. aiecs/domain/context/graph_memory.py +582 -0
  134. aiecs/domain/execution/model.py +12 -4
  135. aiecs/domain/knowledge_graph/__init__.py +19 -0
  136. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  137. aiecs/domain/knowledge_graph/models/entity.py +148 -0
  138. aiecs/domain/knowledge_graph/models/evidence.py +178 -0
  139. aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
  140. aiecs/domain/knowledge_graph/models/path.py +171 -0
  141. aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
  142. aiecs/domain/knowledge_graph/models/query.py +261 -0
  143. aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
  144. aiecs/domain/knowledge_graph/models/relation.py +202 -0
  145. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  146. aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
  147. aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
  148. aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
  149. aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
  150. aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
  151. aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
  152. aiecs/domain/task/dsl_processor.py +172 -56
  153. aiecs/domain/task/model.py +20 -8
  154. aiecs/domain/task/task_context.py +27 -24
  155. aiecs/infrastructure/__init__.py +0 -2
  156. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  157. aiecs/infrastructure/graph_storage/base.py +837 -0
  158. aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
  159. aiecs/infrastructure/graph_storage/cache.py +424 -0
  160. aiecs/infrastructure/graph_storage/distributed.py +223 -0
  161. aiecs/infrastructure/graph_storage/error_handling.py +380 -0
  162. aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
  163. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  164. aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
  165. aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
  166. aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
  167. aiecs/infrastructure/graph_storage/metrics.py +344 -0
  168. aiecs/infrastructure/graph_storage/migration.py +400 -0
  169. aiecs/infrastructure/graph_storage/pagination.py +483 -0
  170. aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
  171. aiecs/infrastructure/graph_storage/postgres.py +1563 -0
  172. aiecs/infrastructure/graph_storage/property_storage.py +353 -0
  173. aiecs/infrastructure/graph_storage/protocols.py +76 -0
  174. aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
  175. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  176. aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
  177. aiecs/infrastructure/graph_storage/streaming.py +487 -0
  178. aiecs/infrastructure/graph_storage/tenant.py +412 -0
  179. aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
  180. aiecs/infrastructure/messaging/websocket_manager.py +51 -35
  181. aiecs/infrastructure/monitoring/__init__.py +22 -0
  182. aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
  183. aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
  184. aiecs/infrastructure/monitoring/structured_logger.py +3 -7
  185. aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
  186. aiecs/infrastructure/persistence/__init__.py +14 -1
  187. aiecs/infrastructure/persistence/context_engine_client.py +184 -0
  188. aiecs/infrastructure/persistence/database_manager.py +67 -43
  189. aiecs/infrastructure/persistence/file_storage.py +180 -103
  190. aiecs/infrastructure/persistence/redis_client.py +74 -21
  191. aiecs/llm/__init__.py +73 -25
  192. aiecs/llm/callbacks/__init__.py +11 -0
  193. aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
  194. aiecs/llm/client_factory.py +230 -37
  195. aiecs/llm/client_resolver.py +155 -0
  196. aiecs/llm/clients/__init__.py +38 -0
  197. aiecs/llm/clients/base_client.py +328 -0
  198. aiecs/llm/clients/google_function_calling_mixin.py +415 -0
  199. aiecs/llm/clients/googleai_client.py +314 -0
  200. aiecs/llm/clients/openai_client.py +158 -0
  201. aiecs/llm/clients/openai_compatible_mixin.py +367 -0
  202. aiecs/llm/clients/vertex_client.py +1186 -0
  203. aiecs/llm/clients/xai_client.py +201 -0
  204. aiecs/llm/config/__init__.py +51 -0
  205. aiecs/llm/config/config_loader.py +272 -0
  206. aiecs/llm/config/config_validator.py +206 -0
  207. aiecs/llm/config/model_config.py +143 -0
  208. aiecs/llm/protocols.py +149 -0
  209. aiecs/llm/utils/__init__.py +10 -0
  210. aiecs/llm/utils/validate_config.py +89 -0
  211. aiecs/main.py +140 -121
  212. aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
  213. aiecs/scripts/aid/__init__.py +19 -0
  214. aiecs/scripts/aid/module_checker.py +499 -0
  215. aiecs/scripts/aid/version_manager.py +235 -0
  216. aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
  217. aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
  218. aiecs/scripts/dependance_check/__init__.py +15 -0
  219. aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
  220. aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
  221. aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
  222. aiecs/scripts/dependance_patch/__init__.py +7 -0
  223. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  224. aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
  225. aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
  226. aiecs/scripts/knowledge_graph/__init__.py +3 -0
  227. aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
  228. aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
  229. aiecs/scripts/tools_develop/README.md +671 -0
  230. aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
  231. aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
  232. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  233. aiecs/scripts/tools_develop/__init__.py +21 -0
  234. aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
  235. aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
  236. aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
  237. aiecs/scripts/tools_develop/schema_coverage.py +511 -0
  238. aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
  239. aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
  240. aiecs/scripts/tools_develop/verify_tools.py +352 -0
  241. aiecs/tasks/__init__.py +0 -1
  242. aiecs/tasks/worker.py +115 -47
  243. aiecs/tools/__init__.py +194 -72
  244. aiecs/tools/apisource/__init__.py +99 -0
  245. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  246. aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
  247. aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
  248. aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
  249. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  250. aiecs/tools/apisource/monitoring/metrics.py +330 -0
  251. aiecs/tools/apisource/providers/__init__.py +112 -0
  252. aiecs/tools/apisource/providers/base.py +671 -0
  253. aiecs/tools/apisource/providers/census.py +397 -0
  254. aiecs/tools/apisource/providers/fred.py +535 -0
  255. aiecs/tools/apisource/providers/newsapi.py +409 -0
  256. aiecs/tools/apisource/providers/worldbank.py +352 -0
  257. aiecs/tools/apisource/reliability/__init__.py +12 -0
  258. aiecs/tools/apisource/reliability/error_handler.py +363 -0
  259. aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
  260. aiecs/tools/apisource/tool.py +832 -0
  261. aiecs/tools/apisource/utils/__init__.py +9 -0
  262. aiecs/tools/apisource/utils/validators.py +334 -0
  263. aiecs/tools/base_tool.py +415 -21
  264. aiecs/tools/docs/__init__.py +121 -0
  265. aiecs/tools/docs/ai_document_orchestrator.py +607 -0
  266. aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
  267. aiecs/tools/docs/content_insertion_tool.py +1320 -0
  268. aiecs/tools/docs/document_creator_tool.py +1464 -0
  269. aiecs/tools/docs/document_layout_tool.py +1160 -0
  270. aiecs/tools/docs/document_parser_tool.py +1016 -0
  271. aiecs/tools/docs/document_writer_tool.py +2008 -0
  272. aiecs/tools/knowledge_graph/__init__.py +17 -0
  273. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
  274. aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
  275. aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
  276. aiecs/tools/langchain_adapter.py +300 -138
  277. aiecs/tools/schema_generator.py +455 -0
  278. aiecs/tools/search_tool/__init__.py +100 -0
  279. aiecs/tools/search_tool/analyzers.py +581 -0
  280. aiecs/tools/search_tool/cache.py +264 -0
  281. aiecs/tools/search_tool/constants.py +128 -0
  282. aiecs/tools/search_tool/context.py +224 -0
  283. aiecs/tools/search_tool/core.py +778 -0
  284. aiecs/tools/search_tool/deduplicator.py +119 -0
  285. aiecs/tools/search_tool/error_handler.py +242 -0
  286. aiecs/tools/search_tool/metrics.py +343 -0
  287. aiecs/tools/search_tool/rate_limiter.py +172 -0
  288. aiecs/tools/search_tool/schemas.py +275 -0
  289. aiecs/tools/statistics/__init__.py +80 -0
  290. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
  291. aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
  292. aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
  293. aiecs/tools/statistics/data_loader_tool.py +555 -0
  294. aiecs/tools/statistics/data_profiler_tool.py +638 -0
  295. aiecs/tools/statistics/data_transformer_tool.py +580 -0
  296. aiecs/tools/statistics/data_visualizer_tool.py +498 -0
  297. aiecs/tools/statistics/model_trainer_tool.py +507 -0
  298. aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
  299. aiecs/tools/task_tools/__init__.py +49 -36
  300. aiecs/tools/task_tools/chart_tool.py +200 -184
  301. aiecs/tools/task_tools/classfire_tool.py +268 -267
  302. aiecs/tools/task_tools/image_tool.py +220 -141
  303. aiecs/tools/task_tools/office_tool.py +226 -146
  304. aiecs/tools/task_tools/pandas_tool.py +477 -121
  305. aiecs/tools/task_tools/report_tool.py +390 -142
  306. aiecs/tools/task_tools/research_tool.py +149 -79
  307. aiecs/tools/task_tools/scraper_tool.py +339 -145
  308. aiecs/tools/task_tools/stats_tool.py +448 -209
  309. aiecs/tools/temp_file_manager.py +26 -24
  310. aiecs/tools/tool_executor/__init__.py +18 -16
  311. aiecs/tools/tool_executor/tool_executor.py +364 -52
  312. aiecs/utils/LLM_output_structor.py +74 -48
  313. aiecs/utils/__init__.py +14 -3
  314. aiecs/utils/base_callback.py +0 -3
  315. aiecs/utils/cache_provider.py +696 -0
  316. aiecs/utils/execution_utils.py +50 -31
  317. aiecs/utils/prompt_loader.py +1 -0
  318. aiecs/utils/token_usage_repository.py +37 -11
  319. aiecs/ws/socket_server.py +14 -4
  320. {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
  321. aiecs-1.7.17.dist-info/RECORD +337 -0
  322. aiecs-1.7.17.dist-info/entry_points.txt +13 -0
  323. aiecs/config/registry.py +0 -19
  324. aiecs/domain/context/content_engine.py +0 -982
  325. aiecs/llm/base_client.py +0 -99
  326. aiecs/llm/openai_client.py +0 -125
  327. aiecs/llm/vertex_client.py +0 -186
  328. aiecs/llm/xai_client.py +0 -184
  329. aiecs/scripts/dependency_checker.py +0 -857
  330. aiecs/scripts/quick_dependency_check.py +0 -269
  331. aiecs/tools/task_tools/search_api.py +0 -7
  332. aiecs-1.0.1.dist-info/RECORD +0 -90
  333. aiecs-1.0.1.dist-info/entry_points.txt +0 -7
  334. /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
  335. /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
  336. /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
  337. /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
  338. {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
  339. {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
  340. {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
@@ -5,38 +5,44 @@ from typing import Dict, Any, List, Optional, Union, Tuple
 from enum import Enum
 from dataclasses import dataclass
 
-import pandas as pd
+import pandas as pd  # type: ignore[import-untyped]
 import numpy as np
-from pydantic_settings import BaseSettings
-from pydantic import ValidationError, ConfigDict
+from pydantic import Field, BaseModel
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from aiecs.tools.base_tool import BaseTool
 from aiecs.tools import register_tool
 
 # Enums for configuration options
+
+
 class ScalerType(str, Enum):
     STANDARD = "standard"
     MINMAX = "minmax"
     ROBUST = "robust"
     NONE = "none"
 
-class StatsSettings(BaseSettings):
-    """Configuration for StatsTool."""
-    max_file_size_mb: int = 200
-    allowed_extensions: List[str] = ['.sav', '.sas7bdat', '.por', '.csv', '.xlsx', '.xls', '.json', '.parquet', '.feather']
-    env_prefix: str = 'STATS_TOOL_'
-
-    model_config = ConfigDict(env_prefix='STATS_TOOL_')
 
 # Exceptions
-class StatsToolError(Exception): pass
-class FileOperationError(StatsToolError): pass
-class AnalysisError(StatsToolError): pass
+class StatsToolError(Exception):
+    pass
+
+
+class FileOperationError(StatsToolError):
+    pass
+
+
+class AnalysisError(StatsToolError):
+    pass
+
 
 # Utility Dataclass for Statistical Results
+
+
 @dataclass
 class StatsResult:
     """Structured statistical result."""
+
     test_type: str
     statistic: float
     pvalue: float
@@ -45,53 +51,186 @@ class StatsResult:
 
     def to_dict(self) -> Dict[str, Any]:
         return {
-            'test_type': self.test_type,
-            'statistic': self.statistic,
-            'pvalue': self.pvalue,
-            'significant': self.significant,
-            **self.additional_metrics
+            "test_type": self.test_type,
+            "statistic": self.statistic,
+            "pvalue": self.pvalue,
+            "significant": self.significant,
+            **self.additional_metrics,
         }
 
-@register_tool('stats')
+
+@register_tool("stats")
 class StatsTool(BaseTool):
     """Enhanced statistical analysis tool for various data formats and operations."""
-    def __init__(self, config: Dict[str, Any] = None):
-        super().__init__(config)
-        self.settings = StatsSettings()
-        if config:
-            try:
-                self.settings = self.settings.model_validate({**self.settings.model_dump(), **config})
-            except ValidationError as e:
-                raise ValueError(f"Invalid settings: {e}")
+
+    # Configuration schema
+    class Config(BaseSettings):
+        """Configuration for the stats tool
+
+        Automatically reads from environment variables with STATS_TOOL_ prefix.
+        Example: STATS_TOOL_MAX_FILE_SIZE_MB -> max_file_size_mb
+        """
+
+        model_config = SettingsConfigDict(env_prefix="STATS_TOOL_")
+
+        max_file_size_mb: int = Field(default=200, description="Maximum file size in megabytes")
+        allowed_extensions: List[str] = Field(
+            default=[
+                ".sav",
+                ".sas7bdat",
+                ".por",
+                ".csv",
+                ".xlsx",
+                ".xls",
+                ".json",
+                ".parquet",
+                ".feather",
+            ],
+            description="Allowed file extensions",
+        )
+
+    # Schema definitions
+    class Read_dataSchema(BaseModel):
+        """Schema for read_data operation"""
+
+        file_path: str = Field(description="Path to the data file to read")
+        nrows: Optional[int] = Field(default=None, description="Optional number of rows to read from the file. If None, reads all rows")
+        sheet_name: Optional[Union[str, int]] = Field(default=0, description="Sheet name or index for Excel files. Can be a string name or integer index (0-based)")
+
+    class DescribeSchema(BaseModel):
+        """Schema for describe operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: Optional[List[str]] = Field(default=None, description="Optional list of variable names to describe. If None, describes all variables")
+        include_percentiles: bool = Field(default=False, description="Whether to include custom percentiles in the descriptive statistics")
+        percentiles: Optional[List[float]] = Field(default=None, description="Optional list of percentile values (0.0 to 1.0) to include. Only used if include_percentiles is True")
+
+    class TtestSchema(BaseModel):
+        """Schema for ttest operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        var1: str = Field(description="Name of the first variable for the t-test")
+        var2: str = Field(description="Name of the second variable for the t-test")
+        equal_var: bool = Field(default=True, description="Whether to assume equal variances. If True, uses standard t-test; if False, uses Welch's t-test")
+        paired: bool = Field(default=False, description="Whether to perform a paired t-test. If True, performs paired t-test; if False, performs independent t-test")
+
+    class CorrelationSchema(BaseModel):
+        """Schema for correlation operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: Optional[List[str]] = Field(default=None, description="Optional list of variable names for correlation matrix. If provided, computes correlation matrix for all pairs")
+        var1: Optional[str] = Field(default=None, description="First variable name for pairwise correlation. Must be used together with var2")
+        var2: Optional[str] = Field(default=None, description="Second variable name for pairwise correlation. Must be used together with var1")
+        method: str = Field(default="pearson", description="Correlation method: 'pearson' (linear), 'spearman' (rank-based), or 'kendall' (tau)")
+
+    class AnovaSchema(BaseModel):
+        """Schema for anova operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        dependent: str = Field(description="Name of the dependent variable (continuous)")
+        factor: str = Field(description="Name of the factor/grouping variable (categorical)")
+        post_hoc: bool = Field(default=False, description="Whether to perform post-hoc tests (Tukey HSD) to identify which groups differ significantly")
+
+    class Chi_squareSchema(BaseModel):
+        """Schema for chi_square operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        var1: str = Field(description="Name of the first categorical variable")
+        var2: str = Field(description="Name of the second categorical variable")
+        correction: bool = Field(default=True, description="Whether to apply Yates' correction for continuity. Recommended for 2x2 contingency tables")
+
+    class Non_parametricSchema(BaseModel):
+        """Schema for non_parametric operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        test_type: str = Field(description="Type of non-parametric test: 'mann_whitney' (2 groups), 'wilcoxon' (paired), 'kruskal' (multiple groups), or 'friedman' (repeated measures)")
+        variables: List[str] = Field(description="List of variable names to test. Number of variables depends on test_type")
+        grouping: Optional[str] = Field(default=None, description="Optional grouping variable name. Required for 'kruskal' test, not used for other tests")
+
+    class RegressionSchema(BaseModel):
+        """Schema for regression operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        formula: str = Field(description="Regression formula string (e.g., 'y ~ x1 + x2'). Uses R-style formula syntax")
+        regression_type: str = Field(default="ols", description="Type of regression model: 'ols' (ordinary least squares), 'logit' (logistic), 'probit', or 'poisson'")
+        robust: bool = Field(default=False, description="Whether to use robust standard errors (HC3 heteroscedasticity-consistent)")
+        structured_output: bool = Field(default=True, description="Whether to return structured output with coefficients, p-values, and confidence intervals. If False, returns summary text only")
+
+    class Time_seriesSchema(BaseModel):
+        """Schema for time_series operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variable: str = Field(description="Name of the time series variable to analyze")
+        date_variable: Optional[str] = Field(default=None, description="Optional name of the date/time variable. If provided, uses it as the time index")
+        model_type: str = Field(default="arima", description="Type of time series model: 'arima' or 'sarima' (seasonal ARIMA)")
+        order: Optional[Tuple[int, int, int]] = Field(default=(1, 1, 1), description="ARIMA order tuple (p, d, q) where p=autoregressive, d=differencing, q=moving average")
+        seasonal_order: Optional[Tuple[int, int, int, int]] = Field(default=None, description="Optional SARIMA seasonal order tuple (P, D, Q, s). Required for 'sarima' model type")
+        forecast_periods: int = Field(default=10, description="Number of periods to forecast into the future")
+
+    class PreprocessSchema(BaseModel):
+        """Schema for preprocess operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: List[str] = Field(description="List of variable names to preprocess")
+        operation: str = Field(description="Preprocessing operation: 'scale' (normalize) or 'impute' (fill missing values)")
+        scaler_type: ScalerType = Field(default=ScalerType.STANDARD, description="Type of scaler to use for scaling operation: 'standard' (z-score), 'minmax' (0-1), 'robust' (median/IQR), or 'none'")
+        output_path: Optional[str] = Field(default=None, description="Optional path to save the preprocessed data. If None, data is not saved to file")
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None, **kwargs):
+        """
+        Initialize StatsTool with settings and resources.
+
+        Args:
+            config (Dict, optional): Configuration overrides for StatsTool.
+            **kwargs: Additional arguments passed to BaseTool (e.g., tool_name)
+
+        Configuration is automatically loaded by BaseTool from:
+        1. Explicit config dict (highest priority)
+        2. YAML config files (config/tools/stats.yaml)
+        3. Environment variables (via dotenv from .env files)
+        4. Tool defaults (lowest priority)
+        """
+        super().__init__(config, **kwargs)
+
+        # Configuration is automatically loaded by BaseTool into self._config_obj
+        # Access config via self._config_obj (BaseSettings instance)
+        self.config = self._config_obj if self._config_obj else self.Config()
+
         self.logger = logging.getLogger(__name__)
         if not self.logger.handlers:
             h = logging.StreamHandler()
-            h.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+            h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
             self.logger.addHandler(h)
         self.logger.setLevel(logging.INFO)
 
-    def _load_data(self, file_path: str, nrows: Optional[int] = None, sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
+    def _load_data(
+        self,
+        file_path: str,
+        nrows: Optional[int] = None,
+        sheet_name: Optional[Union[str, int]] = 0,
+    ) -> pd.DataFrame:
         """Load data from various file formats into a pandas DataFrame."""
         try:
             ext = os.path.splitext(file_path)[1].lower()
-            if ext in ['.sav', '.sas7bdat', '.por']:
-                import pyreadstat
-                if ext == '.sav':
+            if ext in [".sav", ".sas7bdat", ".por"]:
+                import pyreadstat  # type: ignore[import-untyped]
+
+                if ext == ".sav":
                     df, meta = pyreadstat.read_sav(file_path)
-                elif ext == '.sas7bdat':
+                elif ext == ".sas7bdat":
                     df, meta = pyreadstat.read_sas7bdat(file_path)
                 else:
                     df, meta = pyreadstat.read_por(file_path)
                 return df
-            elif ext == '.csv':
+            elif ext == ".csv":
                 return pd.read_csv(file_path, nrows=nrows)
-            elif ext in ['.xlsx', '.xls']:
+            elif ext in [".xlsx", ".xls"]:
                 return pd.read_excel(file_path, sheet_name=sheet_name, nrows=nrows)
-            elif ext == '.json':
+            elif ext == ".json":
                 return pd.read_json(file_path)
-            elif ext == '.parquet':
+            elif ext == ".parquet":
                 return pd.read_parquet(file_path)
-            elif ext == '.feather':
+            elif ext == ".feather":
                 return pd.read_feather(file_path)
             else:
                 raise FileOperationError(f"Unsupported file format: {ext}")
@@ -115,18 +254,29 @@ class StatsTool(BaseTool):
                 return label
         return "large"
 
-    def read_data(self, file_path: str, nrows: Optional[int] = None, sheet_name: Optional[Union[str, int]] = 0) -> Dict[str, Any]:
+    def read_data(
+        self,
+        file_path: str,
+        nrows: Optional[int] = None,
+        sheet_name: Optional[Union[str, int]] = 0,
+    ) -> Dict[str, Any]:
         """Read data from various file formats."""
         df = self._load_data(file_path, nrows, sheet_name)
         return {
-            'variables': df.columns.tolist(),
-            'observations': len(df),
-            'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()},
-            'memory_usage': df.memory_usage(deep=True).sum() / (1024 * 1024),
-            'preview': df.head(5).to_dict(orient='records')
+            "variables": df.columns.tolist(),
+            "observations": len(df),
+            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+            "memory_usage": df.memory_usage(deep=True).sum() / (1024 * 1024),
+            "preview": df.head(5).to_dict(orient="records"),
         }
 
-    def describe(self, file_path: str, variables: Optional[List[str]] = None, include_percentiles: bool = False, percentiles: Optional[List[float]] = None) -> Dict[str, Any]:
+    def describe(
+        self,
+        file_path: str,
+        variables: Optional[List[str]] = None,
+        include_percentiles: bool = False,
+        percentiles: Optional[List[float]] = None,
+    ) -> Dict[str, Any]:
         """Generate descriptive statistics for variables."""
         df = self._load_data(file_path)
         if variables:
@@ -137,21 +287,31 @@ class StatsTool(BaseTool):
             additional_percentiles = [p for p in percentiles if p not in [0.25, 0.5, 0.75]]
             if additional_percentiles:
                 additional_desc = df.describe(percentiles=percentiles)
-                desc = pd.concat([desc, additional_desc.loc[[f"{int(p*100)}%" for p in additional_percentiles]]])
+                desc = pd.concat(
+                    [
+                        desc,
+                        additional_desc.loc[[f"{int(p*100)}%" for p in additional_percentiles]],
+                    ]
+                )
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         if numeric_cols.any():
-            desc.loc['skew'] = df[numeric_cols].skew()
-            desc.loc['kurtosis'] = df[numeric_cols].kurt()
-        return {
-            'statistics': desc.to_dict(),
-            'summary': desc.to_string()
-        }
+            desc.loc["skew"] = df[numeric_cols].skew()
+            desc.loc["kurtosis"] = df[numeric_cols].kurt()
+        return {"statistics": desc.to_dict(), "summary": desc.to_string()}
 
-    def ttest(self, file_path: str, var1: str, var2: str, equal_var: bool = True, paired: bool = False) -> Dict[str, Any]:
+    def ttest(
+        self,
+        file_path: str,
+        var1: str,
+        var2: str,
+        equal_var: bool = True,
+        paired: bool = False,
+    ) -> Dict[str, Any]:
         """Perform t-tests (independent or paired). Also handles legacy ttest_ind."""
         df = self._load_data(file_path)
         self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         a = df[var1].dropna().values
         b = df[var2].dropna().values
         if paired:
@@ -176,64 +336,85 @@ class StatsTool(BaseTool):
             pvalue=float(p),
             significant=p < 0.05,
             additional_metrics={
-                'cohens_d': float(cohens_d),
-                'effect_size_interpretation': self._interpret_effect_size(cohens_d),
-                'group1_mean': float(mean_a),
-                'group2_mean': float(mean_b),
-                'group1_std': float(std_a),
-                'group2_std': float(std_b),
-                'group1_n': int(len(a)),
-                'group2_n': int(len(b))
-            }
+                "cohens_d": float(cohens_d),
+                "effect_size_interpretation": self._interpret_effect_size(cohens_d),
+                "group1_mean": float(mean_a),
+                "group2_mean": float(mean_b),
+                "group1_std": float(std_a),
+                "group2_std": float(std_b),
+                "group1_n": int(len(a)),
+                "group2_n": int(len(b)),
+            },
         ).to_dict()
 
     # Legacy method (now an alias)
     ttest_ind = ttest
 
-    def correlation(self, file_path: str, variables: Optional[List[str]] = None, var1: Optional[str] = None, var2: Optional[str] = None, method: str = "pearson") -> Dict[str, Any]:
+    def correlation(
+        self,
+        file_path: str,
+        variables: Optional[List[str]] = None,
+        var1: Optional[str] = None,
+        var2: Optional[str] = None,
+        method: str = "pearson",
+    ) -> Dict[str, Any]:
         """Perform correlation analysis."""
         df = self._load_data(file_path)
         if variables:
             self._validate_variables(df, variables)
         if var1 and var2:
             self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         result = {}
         if variables:
             corr_matrix = df[variables].corr(method=method)
-            result['correlation_matrix'] = corr_matrix.to_dict()
+            result["correlation_matrix"] = corr_matrix.to_dict()
             flat_corrs = [
-                {'var1': v1, 'var2': v2, 'correlation': corr_matrix.loc[v1, v2], 'abs_correlation': abs(corr_matrix.loc[v1, v2])}
+                {
+                    "var1": v1,
+                    "var2": v2,
+                    "correlation": corr_matrix.loc[v1, v2],
+                    "abs_correlation": abs(corr_matrix.loc[v1, v2]),
+                }
                 for i, v1 in enumerate(variables)
-                for j, v2 in enumerate(variables) if i < j
+                for j, v2 in enumerate(variables)
+                if i < j
             ]
-            flat_corrs.sort(key=lambda x: x['abs_correlation'], reverse=True)
-            result['pairs'] = flat_corrs
+            flat_corrs.sort(key=lambda x: x["abs_correlation"], reverse=True)
+            result["pairs"] = flat_corrs
         elif var1 and var2:
             x = df[var1].dropna()
             y = df[var2].dropna()
             method_map = {
-                'pearson': (stats.pearsonr, "Pearson's r"),
-                'spearman': (stats.spearmanr, "Spearman's rho"),
-                'kendall': (stats.kendalltau, "Kendall's tau")
+                "pearson": (stats.pearsonr, "Pearson's r"),
+                "spearman": (stats.spearmanr, "Spearman's rho"),
+                "kendall": (stats.kendalltau, "Kendall's tau"),
             }
             func, method_name = method_map[method]
             corr, p = func(x, y)
             result = {
-                'method': method_name,
-                'correlation': float(corr),
-                'pvalue': float(p),
-                'significant': p < 0.05,
-                'n': len(x)
+                "method": method_name,
+                "correlation": float(corr),
+                "pvalue": float(p),
+                "significant": p < 0.05,
+                "n": len(x),
             }
         return result
 
-    def anova(self, file_path: str, dependent: str, factor: str, post_hoc: bool = False) -> Dict[str, Any]:
+    def anova(
+        self,
+        file_path: str,
+        dependent: str,
+        factor: str,
+        post_hoc: bool = False,
+    ) -> Dict[str, Any]:
         """Perform one-way ANOVA with optional post-hoc tests."""
         df = self._load_data(file_path)
         self._validate_variables(df, [dependent, factor])
-        import scipy.stats as stats
-        from statsmodels.stats.multicomp import pairwise_tukeyhsd
+        import scipy.stats as stats  # type: ignore[import-untyped] # type: ignore[import-untyped]
+        from statsmodels.stats.multicomp import pairwise_tukeyhsd  # type: ignore[import-untyped]
+
         dependent_var = df[dependent].dropna()
         factor_var = df[factor].dropna()
         min_len = min(len(dependent_var), len(factor_var))
@@ -242,42 +423,46 @@ class StatsTool(BaseTool):
         groups = {name: group[dependent].dropna().values for name, group in df.groupby(factor)}
         stat, p = stats.f_oneway(*groups.values())
         result = {
-            'F': float(stat),
-            'pvalue': float(p),
-            'significant': p < 0.05,
-            'groups': len(groups),
-            'group_sizes': {name: len(values) for name, values in groups.items()},
-            'group_means': {name: float(np.mean(values)) for name, values in groups.items()},
-            'group_std': {name: float(np.std(values, ddof=1)) for name, values in groups.items()}
+            "F": float(stat),
+            "pvalue": float(p),
+            "significant": p < 0.05,
+            "groups": len(groups),
+            "group_sizes": {name: len(values) for name, values in groups.items()},
+            "group_means": {name: float(np.mean(values)) for name, values in groups.items()},
+            "group_std": {name: float(np.std(values, ddof=1)) for name, values in groups.items()},
         }
         if post_hoc:
-            post_hoc_df = pd.DataFrame({'value': dependent_var, 'group': factor_var})
-            tukey = pairwise_tukeyhsd(post_hoc_df['value'], post_hoc_df['group'])
+            post_hoc_df = pd.DataFrame({"value": dependent_var, "group": factor_var})
+            tukey = pairwise_tukeyhsd(post_hoc_df["value"], post_hoc_df["group"])
             from itertools import combinations
+
             group_pairs = list(combinations(tukey.groupsunique, 2))
             tukey_results = [
                 {
-                    'group1': str(group1),
-                    'group2': str(group2),
-                    'mean_difference': float(mean_diff),
-                    'p_adjusted': float(p_adj),
-                    'significant': bool(reject),
-                    'conf_lower': float(lower),
-                    'conf_upper': float(upper)
+                    "group1": str(group1),
+                    "group2": str(group2),
+                    "mean_difference": float(mean_diff),
+                    "p_adjusted": float(p_adj),
+                    "significant": bool(reject),
+                    "conf_lower": float(lower),
+                    "conf_upper": float(upper),
                 }
-                for (group1, group2), mean_diff, p_adj, lower, upper, reject in zip(
+                for (
+                    group1,
+                    group2,
+                ), mean_diff, p_adj, lower, upper, reject in zip(
                     group_pairs,
                     tukey.meandiffs,
                     tukey.pvalues,
-                    tukey.confint[:,0],
-                    tukey.confint[:,1],
-                    tukey.reject
+                    tukey.confint[:, 0],
+                    tukey.confint[:, 1],
+                    tukey.reject,
                 )
             ]
-            result['post_hoc'] = {
-                'method': 'Tukey HSD',
-                'alpha': 0.05, # Standard significance level for Tukey HSD
-                'comparisons': tukey_results
+            result["post_hoc"] = {
+                "method": "Tukey HSD",
+                "alpha": 0.05,  # Standard significance level for Tukey HSD
+                "comparisons": tukey_results,
             }
         return result
 
@@ -285,48 +470,56 @@ class StatsTool(BaseTool):
         """Perform chi-square test of independence."""
         df = self._load_data(file_path)
         self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         contingency = pd.crosstab(df[var1], df[var2])
         chi2, p, dof, expected = stats.chi2_contingency(contingency, correction=correction)
         n = contingency.sum().sum()
         min_dim = min(contingency.shape) - 1
         cramers_v = np.sqrt(chi2 / (n * min_dim))
         return {
-            'chi2': float(chi2),
-            'pvalue': float(p),
-            'dof': int(dof),
-            'significant': p < 0.05,
-            'cramers_v': float(cramers_v),
-            'effect_size_interpretation': self._interpret_effect_size(cramers_v),
-            'contingency_table': contingency.to_dict(),
-            'expected_frequencies': pd.DataFrame(expected, index=contingency.index, columns=contingency.columns).to_dict(),
-            'test_type': 'Chi-square test with Yates correction' if correction else 'Chi-square test'
+            "chi2": float(chi2),
+            "pvalue": float(p),
+            "dof": int(dof),
+            "significant": p < 0.05,
+            "cramers_v": float(cramers_v),
+            "effect_size_interpretation": self._interpret_effect_size(cramers_v),
+            "contingency_table": contingency.to_dict(),
+            "expected_frequencies": pd.DataFrame(expected, index=contingency.index, columns=contingency.columns).to_dict(),
+            "test_type": ("Chi-square test with Yates correction" if correction else "Chi-square test"),
         }
 
-    def non_parametric(self, file_path: str, test_type: str, variables: List[str], grouping: Optional[str] = None) -> Dict[str, Any]:
+    def non_parametric(
+        self,
+        file_path: str,
+        test_type: str,
+        variables: List[str],
+        grouping: Optional[str] = None,
+    ) -> Dict[str, Any]:
         """Perform non-parametric statistical tests."""
         df = self._load_data(file_path)
         self._validate_variables(df, variables + ([grouping] if grouping else []))
-        import scipy.stats as stats
-        if test_type == 'mann_whitney':
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
+        if test_type == "mann_whitney":
             if len(variables) != 2:
                 raise AnalysisError("Mann-Whitney U test requires exactly 2 variables")
             x = df[variables[0]].dropna().values
             y = df[variables[1]].dropna().values
             u_stat, p_value = stats.mannwhitneyu(x, y)
             return StatsResult(
-                test_type='Mann-Whitney U test',
+                test_type="Mann-Whitney U test",
                 statistic=float(u_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-                    'n1': len(x),
-                    'n2': len(y),
-                    'median1': float(np.median(x)),
-                    'median2': float(np.median(y))
-                }
+                    "n1": len(x),
+                    "n2": len(y),
+                    "median1": float(np.median(x)),
+                    "median2": float(np.median(y)),
+                },
             ).to_dict()
-        elif test_type == 'wilcoxon':
+        elif test_type == "wilcoxon":
             if len(variables) != 2:
                 raise AnalysisError("Wilcoxon signed-rank test requires exactly 2 variables")
             x = df[variables[0]].dropna().values
@@ -336,161 +529,202 @@ class StatsTool(BaseTool):
             y = y[:min_len]
             w_stat, p_value = stats.wilcoxon(x, y)
             return StatsResult(
-                test_type='Wilcoxon signed-rank test',
+                test_type="Wilcoxon signed-rank test",
                 statistic=float(w_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-                    'n_pairs': min_len,
-                    'median_difference': float(np.median(x - y))
-                }
+                    "n_pairs": min_len,
+                    "median_difference": float(np.median(x - y)),
+                },
             ).to_dict()
-        elif test_type == 'kruskal':
+        elif test_type == "kruskal":
             if not grouping:
                 raise AnalysisError("Kruskal-Wallis test requires a grouping variable")
             groups = {f"{var}_{name}": group[var].dropna().values for name, group in df.groupby(grouping) for var in variables}
             h_stat, p_value = stats.kruskal(*groups.values())
             return StatsResult(
-                test_type='Kruskal-Wallis H test',
+                test_type="Kruskal-Wallis H test",
                 statistic=float(h_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-                    'groups': len(groups),
-                    'group_sizes': {name: len(values) for name, values in groups.items()},
-                    'group_medians': {name: float(np.median(values)) for name, values in groups.items()}
-                }
+                    "groups": len(groups),
+                    "group_sizes": {name: len(values) for name, values in groups.items()},
+                    "group_medians": {name: float(np.median(values)) for name, values in groups.items()},
+                },
             ).to_dict()
-        elif test_type == 'friedman':
+        elif test_type == "friedman":
             if len(variables) < 2:
                 raise AnalysisError("Friedman test requires at least 2 variables")
             data = df[variables].dropna()
             chi2, p_value = stats.friedmanchisquare(*[data[var].values for var in variables])
             return StatsResult(
-                test_type='Friedman test',
+                test_type="Friedman test",
                 statistic=float(chi2),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-                    'n_measures': len(variables),
-                    'n_samples': len(data),
-                    'variable_medians': {var: float(np.median(data[var])) for var in variables}
-                }
+                    "n_measures": len(variables),
+                    "n_samples": len(data),
+                    "variable_medians": {var: float(np.median(data[var])) for var in variables},
+                },
             ).to_dict()
         else:
             raise AnalysisError(f"Unsupported non-parametric test type: {test_type}. Supported types: mann_whitney, wilcoxon, kruskal, friedman")
 
-    def regression(self, file_path: str, formula: str, regression_type: str = "ols", robust: bool = False, structured_output: bool = True) -> Dict[str, Any]:
+    def regression(
+        self,
+        file_path: str,
+        formula: str,
+        regression_type: str = "ols",
+        robust: bool = False,
+        structured_output: bool = True,
+    ) -> Dict[str, Any]:
         """Perform regression analysis with various models."""
         df = self._load_data(file_path)
-        import statsmodels.formula.api as smf
+        import statsmodels.formula.api as smf  # type: ignore[import-untyped]
+
         try:
             model_map = {
-                'ols': smf.ols,
-                'logit': smf.logit,
-                'probit': smf.probit,
-                'poisson': smf.poisson
+                "ols": smf.ols,
+                "logit": smf.logit,
+                "probit": smf.probit,
+                "poisson": smf.poisson,
             }
             model = model_map[regression_type](formula=formula, data=df)
-            fit = model.fit(cov_type='HC3' if robust else 'nonrobust')
+            fit = model.fit(cov_type="HC3" if robust else "nonrobust")
             if structured_output:
                 result = {
-                    'model_type': regression_type,
-                    'formula': formula,
-                    'n_observations': int(fit.nobs),
-                    'r_squared': float(fit.rsquared) if hasattr(fit, 'rsquared') else None,
-                    'adj_r_squared': float(fit.rsquared_adj) if hasattr(fit, 'rsquared_adj') else None,
-                    'aic': float(fit.aic) if hasattr(fit, 'aic') else None,
-                    'bic': float(fit.bic) if hasattr(fit, 'bic') else None,
-                    'f_statistic': float(fit.fvalue) if hasattr(fit, 'fvalue') else None,
-                    'f_pvalue': float(fit.f_pvalue) if hasattr(fit, 'f_pvalue') else None,
-                    'log_likelihood': float(fit.llf) if hasattr(fit, 'llf') else None,
-                    'coefficients': {
+                    "model_type": regression_type,
+                    "formula": formula,
+                    "n_observations": int(fit.nobs),
+                    "r_squared": (float(fit.rsquared) if hasattr(fit, "rsquared") else None),
+                    "adj_r_squared": (float(fit.rsquared_adj) if hasattr(fit, "rsquared_adj") else None),
+                    "aic": float(fit.aic) if hasattr(fit, "aic") else None,
+                    "bic": float(fit.bic) if hasattr(fit, "bic") else None,
+                    "f_statistic": (float(fit.fvalue) if hasattr(fit, "fvalue") else None),
+                    "f_pvalue": (float(fit.f_pvalue) if hasattr(fit, "f_pvalue") else None),
+                    "log_likelihood": (float(fit.llf) if hasattr(fit, "llf") else None),
+                    "coefficients": {
                         var: {
-                            'coef': float(fit.params[var]),
-                            'std_err': float(fit.bse[var]),
-                            't_value': float(fit.tvalues[var]) if hasattr(fit, 'tvalues') else None,
-                            'p_value': float(fit.pvalues[var]),
-                            'significant': fit.pvalues[var] < 0.05,
-                            'conf_lower': float(fit.conf_int().loc[var, 0]),
-                            'conf_upper': float(fit.conf_int().loc[var, 1])
-                        } for var in fit.params.index
-                    }
+                            "coef": float(fit.params[var]),
+                            "std_err": float(fit.bse[var]),
+                            "t_value": (float(fit.tvalues[var]) if hasattr(fit, "tvalues") else None),
+                            "p_value": float(fit.pvalues[var]),
+                            "significant": fit.pvalues[var] < 0.05,
+                            "conf_lower": float(fit.conf_int().loc[var, 0]),
+                            "conf_upper": float(fit.conf_int().loc[var, 1]),
+                        }
+                        for var in fit.params.index
+                    },
                 }
-                return {'summary_text': fit.summary().as_text(), 'structured': result}
-            return {'summary': fit.summary().as_text()}
+                return {
+                    "summary_text": fit.summary().as_text(),
+                    "structured": result,
+                }
+            return {"summary": fit.summary().as_text()}
         except Exception as e:
             raise AnalysisError(f"Regression error: {str(e)}")
 
-    def time_series(self, file_path: str, variable: str, date_variable: Optional[str] = None, model_type: str = "arima", order: Optional[Tuple[int, int, int]] = (1, 1, 1), seasonal_order: Optional[Tuple[int, int, int, int]] = None, forecast_periods: int = 10) -> Dict[str, Any]:
+    def time_series(
+        self,
+        file_path: str,
+        variable: str,
+        date_variable: Optional[str] = None,
+        model_type: str = "arima",
+        order: Optional[Tuple[int, int, int]] = (1, 1, 1),
+        seasonal_order: Optional[Tuple[int, int, int, int]] = None,
+        forecast_periods: int = 10,
+    ) -> Dict[str, Any]:
         """Perform time series analysis."""
         df = self._load_data(file_path)
         self._validate_variables(df, [variable] + ([date_variable] if date_variable else []))
-        from statsmodels.tsa.arima.model import ARIMA
-        from statsmodels.tsa.statespace.sarimax import SARIMAX
+        from statsmodels.tsa.arima.model import ARIMA  # type: ignore[import-untyped]
+        from statsmodels.tsa.statespace.sarimax import SARIMAX  # type: ignore[import-untyped]
+
         try:
             ts_data = df[variable].dropna()
             if date_variable and date_variable in df.columns:
                 ts_data.index = df[date_variable]
-            if model_type == 'arima':
+            if model_type == "arima":
                 model = ARIMA(ts_data, order=order)
                 fit = model.fit()
-                model_type_name = 'ARIMA'
-            elif model_type == 'sarima':
+                model_type_name = "ARIMA"
+            elif model_type == "sarima":
                 if not seasonal_order:
                     raise AnalysisError("seasonal_order must be provided for SARIMA model")
                 model = SARIMAX(ts_data, order=order, seasonal_order=seasonal_order)
                 fit = model.fit(disp=False)
-                model_type_name = 'SARIMA'
+                model_type_name = "SARIMA"
             else:
                 raise AnalysisError(f"Unsupported time series model: {model_type}")
             forecast = fit.forecast(steps=forecast_periods)
             forecast_index = pd.date_range(
-                start=ts_data.index[-1] if isinstance(ts_data.index, pd.DatetimeIndex) else len(ts_data),
+                start=(ts_data.index[-1] if isinstance(ts_data.index, pd.DatetimeIndex) else len(ts_data)),
                 periods=forecast_periods + 1,
-                freq='D'
+                freq="D",
             )[1:]
             return {
-                'model_type': model_type_name,
-                'order': order,
-                'seasonal_order': seasonal_order if model_type == 'sarima' else None,
-                'aic': float(fit.aic),
-                'bic': float(fit.bic),
-                'forecast': {
-                    'values': forecast.tolist() if isinstance(forecast, np.ndarray) else forecast.values.tolist(),
-                    'index': forecast_index.strftime('%Y-%m-%d').tolist() if isinstance(forecast_index, pd.DatetimeIndex) else list(range(len(forecast)))
+                "model_type": model_type_name,
+                "order": order,
+                "seasonal_order": (seasonal_order if model_type == "sarima" else None),
+                "aic": float(fit.aic),
+                "bic": float(fit.bic),
+                "forecast": {
+                    "values": (forecast.tolist() if isinstance(forecast, np.ndarray) else forecast.values.tolist()),
+                    "index": (forecast_index.strftime("%Y-%m-%d").tolist() if isinstance(forecast_index, pd.DatetimeIndex) else list(range(len(forecast)))),
                 },
-                'summary': str(fit.summary())
+                "summary": str(fit.summary()),
             }
         except Exception as e:
             raise AnalysisError(f"Time series analysis error: {str(e)}")
 
-    def preprocess(self, file_path: str, variables: List[str], operation: str, scaler_type: ScalerType = ScalerType.STANDARD, output_path: Optional[str] = None) -> Dict[str, Any]:
+    def preprocess(
+        self,
+        file_path: str,
+        variables: List[str],
+        operation: str,
+        scaler_type: ScalerType = ScalerType.STANDARD,
+        output_path: Optional[str] = None,
+    ) -> Dict[str, Any]:
         """Preprocess data with various operations."""
         df = self._load_data(file_path)
         self._validate_variables(df, variables)
         data = df[variables].copy()
-        result = {'operation': operation}
-        if operation == 'scale':
-            from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
+        result: Dict[str, Any] = {"operation": operation}
+        if operation == "scale":
+            from sklearn.preprocessing import (  # type: ignore[import-untyped]
+                StandardScaler,
+                MinMaxScaler,
+                RobustScaler,
+            )
+
             scaler_map = {
                 ScalerType.STANDARD: (StandardScaler, "StandardScaler"),
                 ScalerType.MINMAX: (MinMaxScaler, "MinMaxScaler"),
-                ScalerType.ROBUST: (RobustScaler, "RobustScaler")
+                ScalerType.ROBUST: (RobustScaler, "RobustScaler"),
             }
             scaler_cls, scaler_name = scaler_map[scaler_type]
             scaler = scaler_cls()
             scaled_data = scaler.fit_transform(data)
-            scaled_df = pd.DataFrame(scaled_data, columns=[f"{col}_scaled" for col in data.columns], index=data.index)
-            result.update({
-                'scaler': scaler_name,
-                'original_stats': data.describe().to_dict(),
-                'scaled_stats': scaled_df.describe().to_dict(),
-                'preview': scaled_df.head(5).to_dict(orient='records')
-            })
+            scaled_df = pd.DataFrame(
+                scaled_data,
+                columns=[f"{col}_scaled" for col in data.columns],
+                index=data.index,
+            )
+            result.update(
+                {
+                    "scaler": scaler_name,
+                    "original_stats": data.describe().to_dict(),
+                    "scaled_stats": scaled_df.describe().to_dict(),
+                    "preview": scaled_df.head(5).to_dict(orient="records"),
+                }
+            )
             processed_df = scaled_df
-        elif operation == 'impute':
+        elif operation == "impute":
             import numpy as np
+
             imputed_df = data.copy()
             numeric_cols = data.select_dtypes(include=[np.number]).columns
             for col in numeric_cols:
@@ -498,16 +732,21 @@ class StatsTool(BaseTool):
             cat_cols = data.select_dtypes(exclude=[np.number]).columns
             for col in cat_cols:
                 imputed_df[col] = data[col].fillna(data[col].mode()[0] if not data[col].mode().empty else None)
-            result.update({
-                'imputation_method': {'numeric': 'mean', 'categorical': 'mode'},
-                'missing_counts_before': data.isna().sum().to_dict(),
-                'missing_counts_after': imputed_df.isna().sum().to_dict(),
-                'preview': imputed_df.head(5).to_dict(orient='records')
-            })
+            result.update(
+                {
+                    "imputation_method": {
+                        "numeric": "mean",
+                        "categorical": "mode",
+                    },
+                    "missing_counts_before": data.isna().sum().to_dict(),
+                    "missing_counts_after": imputed_df.isna().sum().to_dict(),
+                    "preview": imputed_df.head(5).to_dict(orient="records"),
+                }
+            )
            processed_df = imputed_df
         if output_path:
-            output_path = os.path.abspath(output_path) if os.path.isabs(output_path) else os.path.join(tempfile.gettempdir(), 'stats_outputs', output_path)
+            output_path = os.path.abspath(output_path) if os.path.isabs(output_path) else os.path.join(tempfile.gettempdir(), "stats_outputs", output_path)
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
             processed_df.to_csv(output_path)
-            result['output_file'] = output_path
+            result["output_file"] = output_path
         return result
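
For reference, a minimal usage sketch of the reworked StatsTool, inferred from the hunks above. The import path follows the aiecs/tools/task_tools/stats_tool.py entry in the file list; the STATS_TOOL_ env prefix, the Config fields, and the read_data/describe/ttest signatures are taken from the diff. The file sales.csv and the columns price and demand are hypothetical placeholders, and the exact precedence between an explicit config dict, YAML, and environment variables is an assumption based on the new __init__ docstring.

    import os

    # Assumption: picked up by StatsTool.Config via the STATS_TOOL_ env prefix shown in the diff.
    os.environ["STATS_TOOL_MAX_FILE_SIZE_MB"] = "500"

    from aiecs.tools.task_tools.stats_tool import StatsTool

    # Explicit config dict is documented as the highest-priority source.
    tool = StatsTool(config={"allowed_extensions": [".csv", ".parquet"]})
    print(tool.config.max_file_size_mb)  # expected 500 under the assumed env-var loading

    # Methods keep the signatures shown in the hunks; "sales.csv", "price", "demand" are placeholders.
    overview = tool.read_data("sales.csv", nrows=1000)
    summary = tool.describe("sales.csv", variables=["price", "demand"])
    result = tool.ttest("sales.csv", var1="price", var2="demand", equal_var=False)
    print(result["pvalue"], result["cohens_d"])  # keys per StatsResult.to_dict() in the diff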