aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Import Speed Optimization Utilities
|
|
3
|
+
|
|
4
|
+
Provides optimizations for structured data import:
|
|
5
|
+
- Parallel batch processing with worker pools
|
|
6
|
+
- Async I/O for file reading
|
|
7
|
+
- Batch size auto-tuning
|
|
8
|
+
- Performance metrics tracking
|
|
9
|
+
- Streaming import for large files
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import os
|
|
14
|
+
import time
|
|
15
|
+
import psutil
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any, Callable, Dict, List, Optional, TypeVar
|
|
19
|
+
import logging
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class PerformanceMetrics:
    """
    Timing, throughput and resource figures for a single import run.

    The raw fields are plain values set by the caller;
    ``calculate_throughput`` derives the rate and per-batch average from
    them, and ``get_summary`` renders a rounded dictionary for reporting.
    """

    # Timing: start/end timestamps plus seconds accumulated per phase
    start_time: float = 0.0
    end_time: float = 0.0
    read_time_seconds: float = 0.0
    transform_time_seconds: float = 0.0
    write_time_seconds: float = 0.0

    # Throughput
    total_rows: int = 0
    rows_per_second: float = 0.0

    # Memory (megabytes)
    peak_memory_mb: float = 0.0
    initial_memory_mb: float = 0.0

    # Batch info
    batch_count: int = 0
    avg_batch_time_seconds: float = 0.0

    # Parallelism
    worker_count: int = 1
    parallel_speedup: float = 1.0

    def calculate_throughput(self) -> None:
        """Fill in ``rows_per_second`` and ``avg_batch_time_seconds``.

        Each derived value is only written when its divisor is positive;
        otherwise the field keeps whatever value it already had.
        """
        elapsed = self.end_time - self.start_time
        if elapsed > 0:
            self.rows_per_second = self.total_rows / elapsed

        if self.batch_count <= 0:
            return

        phase_seconds = (
            self.read_time_seconds
            + self.transform_time_seconds
            + self.write_time_seconds
        )
        self.avg_batch_time_seconds = phase_seconds / self.batch_count

    def get_summary(self) -> Dict[str, Any]:
        """Return a rounded snapshot of the metrics for logging/reporting.

        Durations are rounded to 2 decimals, the row rate and peak memory
        to 1 decimal; counts are passed through unchanged.
        """
        elapsed = self.end_time - self.start_time
        summary: Dict[str, Any] = {
            "total_rows": self.total_rows,
            "duration_seconds": round(elapsed, 2),
            "rows_per_second": round(self.rows_per_second, 1),
        }
        # The three phase timers share the same rounding rule.
        for phase in ("read", "transform", "write"):
            key = f"{phase}_time_seconds"
            summary[key] = round(getattr(self, key), 2)
        summary["peak_memory_mb"] = round(self.peak_memory_mb, 1)
        summary["batch_count"] = self.batch_count
        summary["worker_count"] = self.worker_count
        return summary
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class BatchSizeOptimizer:
    """
    Auto-tunes batch size from system resources and observed performance.

    Factors considered:
    - Available system memory (via ``psutil``)
    - Number of columns, or a measured row size when one is supplied
    - Recently recorded per-row processing times
    """

    # Batch size limits
    MIN_BATCH_SIZE = 50
    MAX_BATCH_SIZE = 10000
    DEFAULT_BATCH_SIZE = 1000

    # Memory allocation per row (estimated)
    BASE_MEMORY_PER_ROW_BYTES = 1024  # 1KB base
    MEMORY_PER_COLUMN_BYTES = 100  # 100 bytes per column

    # Only this many of the most recent per-row timings are kept and
    # averaged; older samples are dropped so the history cannot grow
    # without bound during long imports.
    _HISTORY_WINDOW = 5

    def __init__(self, target_memory_percent: float = 0.25):
        """
        Initialize batch size optimizer

        Args:
            target_memory_percent: Target fraction of available memory to use
                (0-1). The value is used as given; it is not validated.
        """
        self.target_memory_percent = target_memory_percent
        self._batch_times: List[float] = []
        self._current_batch_size = self.DEFAULT_BATCH_SIZE

    def estimate_batch_size(
        self,
        column_count: int,
        sample_row_size_bytes: Optional[int] = None,
    ) -> int:
        """
        Estimate optimal batch size based on system resources.

        Args:
            column_count: Number of columns in the data. Negative values are
                treated as 0.
            sample_row_size_bytes: Optional measured row size. ``None`` and
                non-positive values fall back to the column-based estimate.

        Returns:
            Recommended batch size, clamped to
            ``[MIN_BATCH_SIZE, MAX_BATCH_SIZE]``. If available memory cannot
            be determined, ``DEFAULT_BATCH_SIZE`` is returned and the current
            batch size is left untouched.
        """
        try:
            available_memory = psutil.virtual_memory().available
        except Exception:
            # Best-effort: fall back if psutil fails
            return self.DEFAULT_BATCH_SIZE

        # Calculate target memory for batches
        target_memory = available_memory * self.target_memory_percent

        # Estimate memory per row. A non-positive measurement is meaningless
        # and would produce a negative batch size, so it is ignored.
        if sample_row_size_bytes is not None and sample_row_size_bytes > 0:
            memory_per_row = sample_row_size_bytes
        else:
            memory_per_row = self.BASE_MEMORY_PER_ROW_BYTES + (
                max(column_count, 0) * self.MEMORY_PER_COLUMN_BYTES
            )

        # Calculate batch size and clamp to the supported range
        batch_size = int(target_memory / memory_per_row)
        batch_size = max(self.MIN_BATCH_SIZE, min(batch_size, self.MAX_BATCH_SIZE))

        self._current_batch_size = batch_size
        logger.debug(
            "Estimated batch size: %s (columns=%s, memory_per_row=%s)",
            batch_size,
            column_count,
            memory_per_row,
        )

        return batch_size

    def record_batch_time(self, batch_time: float, rows_processed: int) -> None:
        """
        Record batch processing time for adaptive tuning.

        The time is stored per row; ``rows_processed`` values below 1 are
        treated as 1 to avoid division by zero.

        Args:
            batch_time: Time to process the batch in seconds
            rows_processed: Number of rows processed in the batch
        """
        self._batch_times.append(batch_time / max(rows_processed, 1))
        # Drop samples that adjust_batch_size() would never look at.
        del self._batch_times[: -self._HISTORY_WINDOW]

    def adjust_batch_size(self) -> int:
        """
        Adjust batch size based on historical performance.

        Needs at least 3 recorded batches; until then the current size is
        returned unchanged. The size is doubled when the recent average is
        under 1ms per row and halved when it is over 10ms per row, always
        staying within ``[MIN_BATCH_SIZE, MAX_BATCH_SIZE]``.

        Returns:
            Adjusted batch size
        """
        if len(self._batch_times) < 3:
            return self._current_batch_size

        # Calculate average time per row over the recent window
        recent_times = self._batch_times[-self._HISTORY_WINDOW:]
        avg_time_per_row = sum(recent_times) / len(recent_times)

        if avg_time_per_row < 0.001:  # < 1ms per row: processing is fast
            self._current_batch_size = min(
                self._current_batch_size * 2,
                self.MAX_BATCH_SIZE,
            )
        elif avg_time_per_row > 0.01:  # > 10ms per row: processing is slow
            self._current_batch_size = max(
                self._current_batch_size // 2,
                self.MIN_BATCH_SIZE,
            )

        return self._current_batch_size
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class ParallelBatchProcessor:
    """
    Processes batches in parallel using a worker pool.

    Uses ThreadPoolExecutor for I/O-bound work (default) or
    ProcessPoolExecutor for CPU-bound work. The pool only exists while the
    instance is used as an async context manager::

        async with ParallelBatchProcessor(max_workers=4) as processor:
            results = await processor.process_batches_parallel(batches, func)
    """

    def __init__(
        self,
        max_workers: Optional[int] = None,
        use_processes: bool = False,
    ):
        """
        Initialize parallel batch processor.

        Args:
            max_workers: Maximum number of workers. Default: CPU count - 1
                (at least 1; 1 when the CPU count cannot be determined)
            use_processes: Use ProcessPoolExecutor instead of ThreadPoolExecutor
        """
        if max_workers is None:
            cpu_total = os.cpu_count()
            max_workers = max(1, cpu_total - 1) if cpu_total else 1

        self.max_workers = max_workers
        self.use_processes = use_processes
        # Holds a ThreadPoolExecutor or ProcessPoolExecutor while the
        # context manager is active, otherwise None.
        self._executor: Optional[Any] = None
        # Created in __aenter__ so the lock is always built inside the event
        # loop that will use it (before Python 3.10 asyncio.Lock binds to a
        # loop at construction time).
        self._progress_lock: Optional[asyncio.Lock] = None
        self._processed_rows = 0
        self._total_rows = 0

    async def __aenter__(self):
        """Enter async context manager: create the worker pool."""
        if self.use_processes:
            self._executor = ProcessPoolExecutor(max_workers=self.max_workers)
        else:
            self._executor = ThreadPoolExecutor(max_workers=self.max_workers)
        self._progress_lock = asyncio.Lock()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit async context manager: shut the pool down, waiting for workers."""
        if self._executor:
            self._executor.shutdown(wait=True)
            self._executor = None

    async def process_batches_parallel(
        self,
        batches: List[List[Dict[str, Any]]],
        process_func: Callable[[List[Dict[str, Any]]], Any],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Any]:
        """
        Process multiple batches in parallel.

        Args:
            batches: List of batch data (each batch is a list of row dicts)
            process_func: Function to process each batch. It runs inside the
                worker pool; with ``use_processes=True`` it is handed to a
                ProcessPoolExecutor.
            progress_callback: Optional callback(processed_rows, total_rows),
                invoked on the event loop after each successful batch

        Returns:
            One entry per batch, in the same order as ``batches``. Failures
            are NOT raised: if ``process_func`` (or ``progress_callback``)
            raises for a batch, that batch's entry is the exception object,
            so callers must check entries with ``isinstance(x, Exception)``.

        Raises:
            RuntimeError: If called outside the ``async with`` block.
        """
        if not self._executor:
            raise RuntimeError("ParallelBatchProcessor must be used as async context manager")

        if self._progress_lock is None:
            # Only reachable if _executor was assigned without __aenter__.
            self._progress_lock = asyncio.Lock()
        progress_lock = self._progress_lock

        self._total_rows = sum(len(batch) for batch in batches)
        self._processed_rows = 0

        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()

        async def process_with_progress(batch: List[Dict[str, Any]]) -> Any:
            # Run in the worker pool
            result = await loop.run_in_executor(self._executor, process_func, batch)

            # Update progress
            async with progress_lock:
                self._processed_rows += len(batch)
                if progress_callback:
                    progress_callback(self._processed_rows, self._total_rows)

            return result

        # Create tasks for all batches
        tasks = [process_with_progress(batch) for batch in batches]

        # Process in parallel; exceptions are collected rather than raised so
        # one failing batch does not abandon the others.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        return results

    @property
    def worker_count(self) -> int:
        """Get the number of workers"""
        return self.max_workers
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class MemoryTracker:
    """
    Tracks memory usage during import.

    Readings are the resident set size (RSS) of the current process as
    reported by ``psutil``. Tracking is best-effort: if a reading cannot be
    taken, the previous values are kept and the failure is logged at debug
    level instead of being raised.
    """

    def __init__(self):
        # All values are in bytes; they stay 0 until a reading succeeds.
        self._initial_memory = 0
        self._peak_memory = 0
        self._current_memory = 0

    def _read_rss(self) -> Optional[int]:
        """Return the current process RSS in bytes, or None if unavailable."""
        try:
            return psutil.Process().memory_info().rss
        except Exception:
            # Memory tracking must never break an import; record why the
            # sample is missing rather than swallowing the error silently.
            logger.debug("Unable to read process memory usage", exc_info=True)
            return None

    def start_tracking(self) -> None:
        """Start memory tracking: record the baseline and reset the peak to it."""
        rss = self._read_rss()
        if rss is None:
            return
        self._initial_memory = rss
        self._peak_memory = rss

    def update(self) -> None:
        """Update memory tracking: sample current usage and raise the peak if needed."""
        rss = self._read_rss()
        if rss is None:
            return
        self._current_memory = rss
        self._peak_memory = max(self._peak_memory, rss)

    @property
    def initial_memory_mb(self) -> float:
        """Get initial memory in MB"""
        return self._initial_memory / (1024 * 1024)

    @property
    def peak_memory_mb(self) -> float:
        """Get peak memory in MB"""
        return self._peak_memory / (1024 * 1024)

    @property
    def current_memory_mb(self) -> float:
        """Get current memory in MB"""
        return self._current_memory / (1024 * 1024)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class StreamingCSVReader:
    """
    Streaming CSV reader for large files.

    Reads CSV file in chunks without loading entire file into memory.
    """

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 10000,
        encoding: str = "utf-8",
        delimiter: str = ",",
    ):
        """
        Initialize streaming CSV reader.

        Args:
            file_path: Path to CSV file
            chunk_size: Number of rows per chunk
            encoding: File encoding
            delimiter: CSV delimiter
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.encoding = encoding
        self.delimiter = delimiter

    async def read_chunks(self):
        """
        Async generator that yields chunks of data.

        Each chunk is parsed synchronously by pandas; control is handed back
        to the event loop between chunks.

        Yields:
            pandas DataFrame chunks

        Raises:
            ImportError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError("pandas is required for streaming CSV reading") from exc

        # Use pandas chunked reading
        reader = pd.read_csv(
            self.file_path,
            chunksize=self.chunk_size,
            encoding=self.encoding,
            delimiter=self.delimiter,
        )
        try:
            for chunk in reader:
                yield chunk
                # Allow other async tasks to run
                await asyncio.sleep(0)
        finally:
            # Release the file handle even when the consumer stops iterating
            # early or an error interrupts the stream.
            reader.close()

    def count_rows(self) -> int:
        """
        Count total rows in file (for progress tracking).

        Records are counted with the ``csv`` module, so a quoted field that
        contains line breaks counts as one row and blank lines are ignored.
        The ``csv`` module only accepts single-character delimiters; for any
        other delimiter the method falls back to counting physical lines
        after the first one.

        Returns:
            Total row count (excluding header)
        """
        import csv

        # newline="" lets the csv module handle line endings inside quotes.
        with open(self.file_path, "r", encoding=self.encoding, newline="") as handle:
            if len(self.delimiter) == 1:
                parsed = csv.reader(handle, delimiter=self.delimiter)
                # csv.reader yields [] for blank lines; skip them.
                records = (row for row in parsed if row)
            else:
                records = iter(handle)

            # Skip header
            next(records, None)
            return sum(1 for _ in records)
|
|
396
|
+
|