aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of aiecs has been flagged as potentially problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
aiecs/tools/task_tools/classfire_tool.py

@@ -1,53 +1,63 @@
+from aiecs.tools import register_tool
+from aiecs.tools.tool_executor import (
+    validate_input,
+)
+from aiecs.tools.base_tool import BaseTool
 import os
 import re
 import logging
 import asyncio
 import time
-from typing import Dict, Any, List, Optional,
+from typing import Dict, Any, List, Optional, Tuple
 from enum import Enum

-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field, field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict

 # Lazy imports for heavy dependencies
 rake_nltk = None
 spacy = None

+
 def _init_heavy_dependencies():
     """Initialize heavy dependencies when actually needed"""
     global rake_nltk, spacy
-
+
     if rake_nltk is None:
         try:
-            import rake_nltk as _rake_nltk
+            import rake_nltk as _rake_nltk  # type: ignore[import-untyped]
+
             rake_nltk = _rake_nltk
         except ImportError:
             import logging
+
             logging.getLogger(__name__).error("rake_nltk not available")
-
+
     if spacy is None:
         try:
             import spacy as _spacy
+
             spacy = _spacy
         except ImportError:
             import logging
+
             logging.getLogger(__name__).warning("spacy not available (optional)")

-from aiecs.tools import register_tool
-from aiecs.tools.base_tool import BaseTool
-from aiecs.tools.tool_executor import (
-    validate_input,
-)

 # Enums for configuration options
+
+
 class Language(str, Enum):
     ENGLISH = "en"
     CHINESE = "zh"
     AUTO = "auto"

+
 class ModelType(str, Enum):
     SPACY_ENGLISH = "en_core_web_sm"
     SPACY_CHINESE = "zh_core_web_sm"

+
 @register_tool("classifier")
 class ClassifierTool(BaseTool):
     """
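The refactor moves the aiecs imports to the top of the module but keeps `rake_nltk` and `spacy` behind the `_init_heavy_dependencies()` gate, so importing the tool stays cheap. A minimal standalone sketch of that sentinel pattern (`heavy_lib` is a placeholder name, not an aiecs symbol; `json` stands in for a genuinely slow import):

```python
# Module-level sentinel: stays None until the dependency is first needed.
heavy_lib = None

def _init_heavy_lib():
    """Import the heavy dependency on first use and cache it globally."""
    global heavy_lib
    if heavy_lib is None:
        try:
            import json as _heavy_lib  # stand-in for a slow import
            heavy_lib = _heavy_lib
        except ImportError:
            heavy_lib = None  # caller decides whether this is fatal

def first_use() -> str:
    _init_heavy_lib()  # pays the import cost exactly once
    if heavy_lib is None:
        raise ImportError("heavy_lib not available")
    return heavy_lib.dumps({"ok": True})

print(first_use())  # {"ok": true}
```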
@@ -69,182 +79,126 @@ class ClassifierTool(BaseTool):
     """

     # Configuration schema
-    class Config(
-        """Configuration for the classifier tool
+    class Config(BaseSettings):
+        """Configuration for the classifier tool
+
+        Automatically reads from environment variables with CLASSIFIER_TOOL_ prefix.
+        Example: CLASSIFIER_TOOL_MAX_WORKERS -> max_workers
+        """
+
+        model_config = SettingsConfigDict(env_prefix="CLASSIFIER_TOOL_")
+
         max_workers: int = Field(
             default=min(32, (os.cpu_count() or 4) * 2),
-            description="Maximum number of worker threads"
+            description="Maximum number of worker threads",
         )
         pipeline_cache_ttl: int = Field(
             default=3600,
-            description="Time-to-live for pipeline cache in seconds"
-        )
-        pipeline_cache_size: int = Field(
-            default=10,
-            description="Maximum number of pipeline cache entries"
-        )
-        max_text_length: int = Field(
-            default=10_000,
-            description="Maximum text length in characters"
-        )
-        spacy_model_en: str = Field(
-            default="en_core_web_sm",
-            description="spaCy model for English"
-        )
-        spacy_model_zh: str = Field(
-            default="zh_core_web_sm",
-            description="spaCy model for Chinese"
+            description="Time-to-live for pipeline cache in seconds",
         )
+        pipeline_cache_size: int = Field(default=10, description="Maximum number of pipeline cache entries")
+        max_text_length: int = Field(default=10_000, description="Maximum text length in characters")
+        spacy_model_en: str = Field(default="en_core_web_sm", description="spaCy model for English")
+        spacy_model_zh: str = Field(default="zh_core_web_sm", description="spaCy model for Chinese")
         allowed_models: List[str] = Field(
-            default=[
-
-                "zh_core_web_sm"
-            ],
-            description="List of allowed spaCy models"
-        )
-        rate_limit_enabled: bool = Field(
-            default=True,
-            description="Enable rate limiting"
+            default=["en_core_web_sm", "zh_core_web_sm"],
+            description="List of allowed spaCy models",
         )
-
-
-
-        )
-        rate_limit_window: int = Field(
-            default=60,
-            description="Rate limit window in seconds"
-        )
-        use_rake_for_english: bool = Field(
-            default=True,
-            description="Use RAKE for English phrase extraction"
-        )
-
-        model_config = ConfigDict(env_prefix="CLASSIFIER_TOOL_")
+        rate_limit_enabled: bool = Field(default=True, description="Enable rate limiting")
+        rate_limit_requests: int = Field(default=100, description="Maximum requests per window")
+        rate_limit_window: int = Field(default=60, description="Rate limit window in seconds")
+        use_rake_for_english: bool = Field(default=True, description="Use RAKE for English phrase extraction")

     # Base schema for text operations
     class BaseTextSchema(BaseModel):
         """Base schema for text operations"""
-
-
-        )
+
+        text: str = Field(description="Text to process")

         @field_validator("text")
         @classmethod
         def check_length_and_content(cls, v: str) -> str:
             if len(v) > 10_000:  # Using a constant here for validation
-                raise ValueError(
+                raise ValueError("Text length exceeds 10,000 characters")
             # Check for malicious patterns (e.g., SQL injection)
-            if re.search(
+            if re.search(
+                r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
+                v,
+                re.IGNORECASE,
+            ):
                 raise ValueError("Text contains potentially malicious content")
             return v

     # Input schemas for operations
     class ClassifySchema(BaseTextSchema):
-        """Schema for
-
-
-
-        )
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the text"
-        )
+        """Schema for classify operation"""
+
+        model: Optional[str] = Field(default=None, description="Model to use for classification")
+        language: Optional[Language] = Field(default=None, description="Language of the text")

         @field_validator("model")
         @classmethod
         def check_model(cls, v: Optional[str]) -> Optional[str]:
-            allowed_models = [
-                "en_core_web_sm",
-                "zh_core_web_sm"
-            ]
+            allowed_models = ["en_core_web_sm", "zh_core_web_sm"]
             if v and v not in allowed_models:
                 raise ValueError(f"Model '{v}' not in allowed spaCy models: {allowed_models}")
             return v

     class TokenizeSchema(BaseTextSchema):
-        """Schema for
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the text"
-        )
+        """Schema for tokenize operation"""

-
-
-
-
-
-        )
+        language: Optional[Language] = Field(default=None, description="Language of the text")
+
+    class Pos_tagSchema(BaseTextSchema):
+        """Schema for pos_tag operation"""
+
+        language: Optional[Language] = Field(default=None, description="Language of the text")

     class NERSchema(BaseTextSchema):
-        """Schema for
-
-
-            description="Language of the text"
-        )
+        """Schema for ner operation"""
+
+        language: Optional[Language] = Field(default=None, description="Language of the text")

     class LemmatizeSchema(BaseTextSchema):
-        """Schema for
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the text"
-        )
+        """Schema for lemmatize operation"""

-
-        """Schema for dependency parsing"""
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the text"
-        )
+        language: Optional[Language] = Field(default=None, description="Language of the text")

-    class
-        """Schema for
-
-
-
-
-
-
-
-        )
+    class Dependency_parseSchema(BaseTextSchema):
+        """Schema for dependency_parse operation"""
+
+        language: Optional[Language] = Field(default=None, description="Language of the text")
+
+    class Keyword_extractSchema(BaseTextSchema):
+        """Schema for keyword_extract operation"""
+
+        top_k: int = Field(default=10, description="Number of keywords to extract")
+        language: Optional[Language] = Field(default=None, description="Language of the text")
         extract_phrases: bool = Field(
             default=True,
-            description="Whether to extract phrases or just keywords"
+            description="Whether to extract phrases or just keywords",
         )

     class SummarizeSchema(BaseTextSchema):
-        """Schema for
-
-
-
-        )
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the text"
-        )
+        """Schema for summarize operation"""
+
+        max_length: int = Field(default=150, description="Maximum length of the summary")
+        language: Optional[Language] = Field(default=None, description="Language of the text")

-    class
+    class Batch_processSchema(BaseModel):
         """Schema for batch processing"""
-
-
-        )
-
-
-        )
-        language: Optional[Language] = Field(
-            default=None,
-            description="Language of the texts"
-        )
-        model: Optional[str] = Field(
-            default=None,
-            description="Model to use for processing"
-        )
+
+        texts: List[str] = Field(description="List of texts to process")
+        operation: str = Field(description="Operation to perform on each text")
+        language: Optional[Language] = Field(default=None, description="Language of the texts")
+        model: Optional[str] = Field(default=None, description="Model to use for processing")
         top_k: Optional[int] = Field(
             default=None,
-            description="Number of keywords to extract (for keyword_extract)"
+            description="Number of keywords to extract (for keyword_extract)",
         )
         max_length: Optional[int] = Field(
             default=None,
-            description="Maximum length of the summary (for summarize)"
+            description="Maximum length of the summary (for summarize)",
         )

         @field_validator("texts")
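The central change in this hunk is the move from a plain pydantic `BaseModel` with `ConfigDict` to pydantic-settings' `BaseSettings` with `SettingsConfigDict(env_prefix="CLASSIFIER_TOOL_")`, so every config field can now be overridden by a prefixed environment variable. A minimal sketch of the mechanism (hypothetical `DemoConfig`, not the aiecs class):

```python
import os

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoConfig(BaseSettings):
    """Reads DEMO_TOOL_-prefixed environment variables."""

    model_config = SettingsConfigDict(env_prefix="DEMO_TOOL_")

    max_workers: int = Field(default=8, description="Maximum number of worker threads")


os.environ["DEMO_TOOL_MAX_WORKERS"] = "4"
print(DemoConfig().max_workers)  # 4: the environment variable overrides the default
```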
@@ -252,8 +206,12 @@ class ClassifierTool(BaseTool):
         def check_texts(cls, v: List[str]) -> List[str]:
             for text in v:
                 if len(text) > 10_000:  # Using a constant here for validation
-                    raise ValueError(
-                if re.search(
+                    raise ValueError("Text length exceeds 10,000 characters")
+                if re.search(
+                    r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
+                    text,
+                    re.IGNORECASE,
+                ):
                     raise ValueError("Text contains potentially malicious content")
             return v

@@ -266,24 +224,31 @@ class ClassifierTool(BaseTool):

         Raises:
             ValueError: If config contains invalid settings.
+
+        Configuration is automatically loaded by BaseTool from:
+        1. Explicit config dict (highest priority)
+        2. YAML config files (config/tools/classifier.yaml)
+        3. Environment variables (via dotenv from .env files)
+        4. Tool defaults (lowest priority)
         """
         super().__init__(config)

-        #
-
+        # Configuration is automatically loaded by BaseTool into self._config_obj
+        # Access config via self._config_obj (BaseSettings instance)
+        self.config = self._config_obj if self._config_obj else self.Config()

         # Set up logger
         self.logger = logging.getLogger(__name__)
         if not self.logger.handlers:
             handler = logging.StreamHandler()
-            handler.setFormatter(logging.Formatter(
+            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
             self.logger.addHandler(handler)
             self.logger.setLevel(logging.INFO)

         # Initialize resources
-        self._spacy_nlp = {}  # Language -> spaCy pipeline
-        self._metrics = {
-        self._request_timestamps = []
+        self._spacy_nlp: Dict[str, Any] = {}  # Language -> spaCy pipeline
+        self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}
+        self._request_timestamps: List[float] = []

     def _get_sentiment_lexicon(self, language: str) -> Dict[str, float]:
         """
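The new constructor docstring documents a four-level precedence for tool configuration. The BaseTool loader itself is not part of this diff; a toy dict-merge illustration of the documented order (hypothetical values only):

```python
# Merged last wins, so the explicit config dict takes the highest priority.
defaults = {"max_workers": 8, "rate_limit_enabled": True}  # 4. tool defaults
env_vars = {"max_workers": 16}                             # 3. environment variables
yaml_cfg = {"rate_limit_enabled": False}                   # 2. YAML config file
explicit = {"max_workers": 4}                              # 1. explicit config dict

merged = {**defaults, **env_vars, **yaml_cfg, **explicit}
print(merged)  # {'max_workers': 4, 'rate_limit_enabled': False}
```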
@@ -295,22 +260,66 @@ class ClassifierTool(BaseTool):
         Returns:
             Dict[str, float]: Sentiment lexicon with word -> score mapping.
         """
-        if language ==
+        if language == "en":
             # Simple English sentiment lexicon
             return {
-
-
-
-
-
-
+                "good": 1.0,
+                "great": 1.5,
+                "excellent": 2.0,
+                "amazing": 2.0,
+                "wonderful": 1.5,
+                "fantastic": 2.0,
+                "awesome": 1.5,
+                "perfect": 2.0,
+                "love": 1.5,
+                "like": 1.0,
+                "happy": 1.5,
+                "pleased": 1.0,
+                "satisfied": 1.0,
+                "positive": 1.0,
+                "best": 2.0,
+                "bad": -1.0,
+                "terrible": -2.0,
+                "awful": -2.0,
+                "horrible": -2.0,
+                "hate": -2.0,
+                "dislike": -1.0,
+                "sad": -1.5,
+                "angry": -1.5,
+                "disappointed": -1.5,
+                "negative": -1.0,
+                "worst": -2.0,
+                "poor": -1.0,
+                "fail": -1.5,
+                "wrong": -1.0,
+                "problem": -1.0,
             }
         else:  # Chinese
             return {
-
-
-
-
+                "好": 1.0,
+                "很好": 1.5,
+                "非常好": 2.0,
+                "棒": 1.5,
+                "优秀": 2.0,
+                "完美": 2.0,
+                "喜欢": 1.5,
+                "爱": 2.0,
+                "满意": 1.0,
+                "开心": 1.5,
+                "高兴": 1.5,
+                "积极": 1.0,
+                "坏": -1.0,
+                "很坏": -1.5,
+                "糟糕": -2.0,
+                "讨厌": -2.0,
+                "恨": -2.0,
+                "失望": -1.5,
+                "生气": -1.5,
+                "愤怒": -2.0,
+                "消极": -1.0,
+                "问题": -1.0,
+                "错误": -1.0,
+                "失败": -1.5,
             }

     def _get_spacy(self, language: str) -> Any:
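`classify` (unchanged in this hunk) scores text against these lexicons by summing per-token values and labelling by the sign. An illustrative sketch of that lexicon approach, not the exact aiecs scoring code:

```python
def lexicon_sentiment(tokens: list[str], lexicon: dict[str, float]) -> tuple[str, float]:
    """Sum per-token lexicon scores; the sign of the total gives the label."""
    score = sum(lexicon.get(tok.lower(), 0.0) for tok in tokens)
    label = "positive" if score > 0 else "negative" if score < 0 else "neutral"
    return label, score


lexicon = {"good": 1.0, "terrible": -2.0}
print(lexicon_sentiment(["a", "good", "day"], lexicon))  # ('positive', 1.0)
```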
@@ -327,11 +336,12 @@ class ClassifierTool(BaseTool):
         if spacy is None:
             try:
                 import spacy as spacy_module
+
                 spacy = spacy_module
             except ImportError:
                 raise ImportError("spaCy is required but not installed. Please install it with: pip install spacy")

-        model = self.config.spacy_model_zh if language ==
+        model = self.config.spacy_model_zh if language == "zh" else self.config.spacy_model_en
         return spacy.load(model, disable=["textcat"])

     def _detect_language(self, text: str) -> str:
@@ -346,17 +356,17 @@ class ClassifierTool(BaseTool):
         """
         try:
             # Count Chinese characters (CJK Unified Ideographs)
-            chinese_chars = sum(1 for char in text if
+            chinese_chars = sum(1 for char in text if "\u4e00" <= char <= "\u9fff")
             total_chars = len([char for char in text if char.isalpha()])

             if total_chars == 0:
-                return
+                return "en"

             # If more than 30% are Chinese characters, consider it Chinese
             chinese_ratio = chinese_chars / total_chars
-            return
+            return "zh" if chinese_ratio > 0.3 else "en"
         except Exception:
-            return
+            return "en"

     def _check_rate_limit(self) -> bool:
         """
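The restored string literals make the heuristic explicit: characters in the CJK Unified Ideographs block (U+4E00 to U+9FFF) are counted, and the text is treated as Chinese once they exceed 30% of the alphabetic characters. The same logic as a standalone function:

```python
def detect_language(text: str) -> str:
    """Return "zh" when more than 30% of alphabetic chars are CJK ideographs."""
    chinese_chars = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    total_chars = len([ch for ch in text if ch.isalpha()])
    if total_chars == 0:
        return "en"
    return "zh" if chinese_chars / total_chars > 0.3 else "en"


print(detect_language("今天天气很好"))         # zh
print(detect_language("The weather is nice"))  # en
```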
@@ -373,8 +383,7 @@ class ClassifierTool(BaseTool):
         # Get lock from executor
         with self._executor.get_lock("rate_limit"):
             # Remove timestamps outside the window
-            self._request_timestamps = [ts for ts in self._request_timestamps
-                                        if current_time - ts <= self.config.rate_limit_window]
+            self._request_timestamps = [ts for ts in self._request_timestamps if current_time - ts <= self.config.rate_limit_window]

             # Check if we're at the limit
             if len(self._request_timestamps) >= self.config.rate_limit_requests:
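`_check_rate_limit` is a sliding-window limiter: timestamps older than the window are pruned, then the remainder is compared against `rate_limit_requests`. A self-contained sketch of the same idea (the aiecs version additionally serializes access through a lock obtained from its executor):

```python
import time


class SlidingWindowLimiter:
    def __init__(self, max_requests: int = 100, window_s: int = 60):
        self.max_requests = max_requests
        self.window_s = window_s
        self._timestamps: list[float] = []

    def allow(self) -> bool:
        now = time.time()
        # Drop timestamps that fell out of the window
        self._timestamps = [ts for ts in self._timestamps if now - ts <= self.window_s]
        if len(self._timestamps) >= self.max_requests:
            return False
        self._timestamps.append(now)
        return True


limiter = SlidingWindowLimiter(max_requests=2, window_s=60)
print([limiter.allow() for _ in range(3)])  # [True, True, False]
```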
@@ -398,10 +407,10 @@ class ClassifierTool(BaseTool):
         try:
             # Initialize heavy dependencies if needed
             _init_heavy_dependencies()
-
+
             if rake_nltk is None:
                 raise ImportError("rake_nltk not available")
-
+
             rake = rake_nltk.Rake()
             rake.extract_keywords_from_text(text)
             phrases = rake.get_ranked_phrases()[:top_k]
@@ -409,9 +418,9 @@ class ClassifierTool(BaseTool):
         except Exception as e:
             self.logger.error(f"Error extracting English phrases: {e}")
             # Fallback to simple keyword extraction
-            nlp = self._get_spacy(
+            nlp = self._get_spacy("en")
             doc = nlp(text)
-            keywords = [token.text for token in doc if token.pos_ in (
+            keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
             return keywords

     def _extract_chinese_phrases(self, text: str, top_k: int) -> List[str]:
@@ -426,7 +435,7 @@ class ClassifierTool(BaseTool):
             List[str]: Extracted phrases.
         """
         try:
-            nlp = self._get_spacy(
+            nlp = self._get_spacy("zh")
             doc = nlp(text)

             # Extract noun phrases and named entities
@@ -444,7 +453,7 @@ class ClassifierTool(BaseTool):

             # Add important nouns and proper nouns
             for token in doc:
-                if token.pos_ in (
+                if token.pos_ in ("NOUN", "PROPN") and len(token.text.strip()) > 1:
                     phrases.append(token.text.strip())

             # Remove duplicates and return top_k
@@ -455,9 +464,9 @@ class ClassifierTool(BaseTool):
             self.logger.error(f"Error extracting Chinese phrases with spaCy: {e}")
             # Fallback to simple noun extraction
             try:
-                nlp = self._get_spacy(
+                nlp = self._get_spacy("zh")
                 doc = nlp(text)
-                nouns = [token.text for token in doc if token.pos_ in (
+                nouns = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")]
                 return nouns[:top_k]
             except Exception:
                 return []
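English phrase extraction runs rake_nltk's `Rake` and falls back to spaCy noun and proper-noun tokens on any failure. A compact sketch of that try/fallback shape (assumes the `en_core_web_sm` model and NLTK stopword data are installed):

```python
def extract_phrases(text: str, top_k: int = 10) -> list[str]:
    try:
        from rake_nltk import Rake
        rake = Rake()
        rake.extract_keywords_from_text(text)
        return rake.get_ranked_phrases()[:top_k]
    except Exception:
        # Fallback: keep nouns and proper nouns via spaCy POS tags
        import spacy
        doc = spacy.load("en_core_web_sm")(text)
        return [tok.text for tok in doc if tok.pos_ in ("NOUN", "PROPN")][:top_k]


print(extract_phrases("Rapid keyword extraction works on short documents."))
```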
@@ -478,14 +487,20 @@ class ClassifierTool(BaseTool):
             ValueError: If the pipeline creation fails.
         """
         try:
-            from transformers import pipeline
+            from transformers import pipeline  # type: ignore[import-not-found]
+
             return pipeline(task, model=model)
         except ImportError:
             raise ImportError("transformers library is required for summarization but not installed. Please install it with: pip install transformers")
         except Exception as e:
             raise ValueError(f"Error creating pipeline for task '{task}' with model '{model}': {e}")

-    async def classify(
+    async def classify(
+        self,
+        text: str,
+        model: Optional[str] = None,
+        language: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
         """
         Perform sentiment classification on text using spaCy and lexicon-based approach.

@@ -503,16 +518,12 @@ class ClassifierTool(BaseTool):
         language = language or self._detect_language(text)

         # Get spaCy pipeline and sentiment lexicon
-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

         sentiment_lexicon = self._get_sentiment_lexicon(language)

         # Process text with spaCy
-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

         # Calculate sentiment score
         sentiment_score = 0.0
@@ -556,13 +567,9 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

         return [token.text for token in doc]

@@ -582,18 +589,12 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

         return [(token.text, token.pos_) for token in doc]

-    @validate_input(NERSchema)
-
     async def ner(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
         """
         Perform named entity recognition.
@@ -610,21 +611,20 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

         return [
-            {
+            {
+                "text": ent.text,
+                "label": ent.label_,
+                "start": ent.start_char,
+                "end": ent.end_char,
+            }
             for ent in doc.ents
         ]

-    @validate_input(LemmatizeSchema)
-
     async def lemmatize(self, text: str, language: Optional[str] = None) -> List[str]:
         """
         Lemmatize tokens in text using spaCy.
@@ -641,19 +641,14 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

-        # For Chinese, lemma might be the same as text, but spaCy handles it
+        # For Chinese, lemma might be the same as text, but spaCy handles it
+        # consistently
         return [token.lemma_ for token in doc]

-    @validate_input(DependencyParseSchema)
-
     async def dependency_parse(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
         """
         Perform dependency parsing using spaCy (supports both English and Chinese).
@@ -670,27 +665,27 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        nlp = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_spacy, language
-        )
+        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-        doc = await asyncio.get_event_loop().run_in_executor(
-            None, nlp, text
-        )
+        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

         return [
             {
                 "text": token.text,
                 "head": token.head.text,
                 "dep": token.dep_,
-                "pos": token.pos_
+                "pos": token.pos_,
             }
             for token in doc
         ]

-
-
-
+    async def keyword_extract(
+        self,
+        text: str,
+        top_k: int = 10,
+        language: Optional[str] = None,
+        extract_phrases: bool = True,
+    ) -> List[str]:
         """
         Extract keywords or key phrases from text using spaCy.

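Across these methods the blocking spaCy work was already pushed off the event loop with `run_in_executor`; the refactor collapses each call onto one line and drops several `@validate_input(...)` decorators. A minimal standalone sketch of the executor pattern (`blocking_nlp` is a stand-in for a CPU-bound spaCy call):

```python
import asyncio


def blocking_nlp(text: str) -> list[str]:
    """Stand-in for a CPU-bound spaCy call."""
    return text.split()


async def tokenize(text: str) -> list[str]:
    loop = asyncio.get_event_loop()
    # Runs the sync function in the default thread pool executor
    return await loop.run_in_executor(None, blocking_nlp, text)


print(asyncio.run(tokenize("hello async world")))  # ['hello', 'async', 'world']
```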
@@ -708,42 +703,28 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)

-        if language ==
+        if language == "zh":
             if extract_phrases:
-                return await asyncio.get_event_loop().run_in_executor(
-                    None, self._extract_chinese_phrases, text, top_k
-                )
+                return await asyncio.get_event_loop().run_in_executor(None, self._extract_chinese_phrases, text, top_k)
             else:
                 # Extract simple keywords using spaCy
-                nlp = await asyncio.get_event_loop().run_in_executor(
-                    None, self._get_spacy, language
-                )
+                nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-                doc = await asyncio.get_event_loop().run_in_executor(
-                    None, nlp, text
-                )
+                doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

-                keywords = [token.text for token in doc if token.pos_ in (
+                keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
                 return keywords
         else:  # English or other languages
             if extract_phrases and self.config.use_rake_for_english:
-                return await asyncio.get_event_loop().run_in_executor(
-                    None, self._extract_english_phrases, text, top_k
-                )
+                return await asyncio.get_event_loop().run_in_executor(None, self._extract_english_phrases, text, top_k)
             else:
-                nlp = await asyncio.get_event_loop().run_in_executor(
-                    None, self._get_spacy, language
-                )
+                nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

-                doc = await asyncio.get_event_loop().run_in_executor(
-                    None, nlp, text
-                )
+                doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

-                keywords = [token.text for token in doc if token.pos_ in (
+                keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
                 return keywords

-    @validate_input(SummarizeSchema)
-
     async def summarize(self, text: str, max_length: int = 150, language: Optional[str] = None) -> str:
         """
         Summarize text.
@@ -761,22 +742,21 @@ class ClassifierTool(BaseTool):

         language = language or self._detect_language(text)
         # Use appropriate models for summarization
-        if language ==
+        if language == "en":
             model = "facebook/bart-large-cnn"
         else:
             # For Chinese and other languages, use a multilingual model
-            # For now, use t5-base, but consider using a Chinese-specific model
+            # For now, use t5-base, but consider using a Chinese-specific model
+            # in the future
             model = "t5-base"

-        pipe = await asyncio.get_event_loop().run_in_executor(
-            None, self._get_hf_pipeline, "summarization", model
-        )
+        pipe = await asyncio.get_event_loop().run_in_executor(None, self._get_hf_pipeline, "summarization", model)

         # Different models use different parameter names for length control
         if model.startswith("t5"):
             # T5 models use max_new_tokens instead of max_length
             # For Chinese text, use a more conservative approach
-            if language ==
+            if language == "zh":
                 # Chinese text: use character count and be more conservative
                 input_chars = len(text)
                 max_new_tokens = min(max_length, max(input_chars // 4, 5))
@@ -786,13 +766,21 @@ class ClassifierTool(BaseTool):
                 input_words = len(text.split())
                 max_new_tokens = min(max_length, max(input_words // 2, 10))
                 min_new_tokens = 5
-
+
             result = await asyncio.get_event_loop().run_in_executor(
-                None,
+                None,
+                lambda: pipe(
+                    text,
+                    max_new_tokens=max_new_tokens,
+                    min_new_tokens=min_new_tokens,
+                    do_sample=False,
+                )[
+                    0
+                ]["summary_text"],
             )
         else:
             # BART and other models use max_length
-            if language ==
+            if language == "zh":
                 # Chinese text: use character count
                 input_chars = len(text)
                 max_len = min(max_length, max(input_chars // 4, 10))
@@ -802,18 +790,30 @@ class ClassifierTool(BaseTool):
                 input_words = len(text.split())
                 max_len = min(max_length, max(input_words // 2, 20))
                 min_len = 10
-
+
             result = await asyncio.get_event_loop().run_in_executor(
-                None,
+                None,
+                lambda: pipe(
+                    text,
+                    max_length=max_len,
+                    min_length=min_len,
+                    do_sample=False,
+                )[
+                    0
+                ]["summary_text"],
             )

         return result

-
-
-
-
-
+    async def batch_process(
+        self,
+        texts: List[str],
+        operation: str,
+        language: Optional[str] = None,
+        model: Optional[str] = None,
+        top_k: Optional[int] = None,
+        max_length: Optional[int] = None,
+    ) -> List[Any]:
         """
         Process multiple texts with the specified operation.

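`summarize` branches on the model family because the two summarizers take different length controls: the T5 path passes `max_new_tokens`/`min_new_tokens`, while the BART path keeps `max_length`/`min_length`. A sketch mirroring the two call shapes from the diff (downloads both models on first run):

```python
from transformers import pipeline

text = "Long input text to be summarized " * 20

# BART-style: length bounds cover the total output
bart = pipeline("summarization", model="facebook/bart-large-cnn")
print(bart(text, max_length=40, min_length=10, do_sample=False)[0]["summary_text"])

# T5-style: bound only the newly generated tokens
t5 = pipeline("summarization", model="t5-base")
print(t5(text, max_new_tokens=40, min_new_tokens=5, do_sample=False)[0]["summary_text"])
```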
@@ -834,7 +834,7 @@ class ClassifierTool(BaseTool):
         # Prepare operations to execute in batch
         operations = []
         for text in texts:
-            kwargs = {"text": text}
+            kwargs: Dict[str, Any] = {"text": text}
             if language:
                 kwargs["language"] = language
             if model and operation == "classify":
@@ -861,23 +861,24 @@ class ClassifierTool(BaseTool):
             "metrics": {
                 "requests": self._metrics["requests"],
                 "cache_hits": self._metrics["cache_hits"],
-                "avg_processing_time":
-
+                "avg_processing_time": (
+                    sum(float(t) for t in processing_times) / len(processing_times)
+                    if (processing_times := self._metrics.get("processing_time")) and isinstance(processing_times, list) and len(processing_times) > 0
+                    else 0.0
+                ),
             },
             "config": {
                 "max_workers": self.config.max_workers,
                 "pipeline_cache_size": self.config.pipeline_cache_size,
                 "rate_limit_enabled": self.config.rate_limit_enabled,
                 "rate_limit_requests": self.config.rate_limit_requests,
-                "rate_limit_window": self.config.rate_limit_window
-            }
+                "rate_limit_window": self.config.rate_limit_window,
+            },
         }

         # Check if models can be loaded
         try:
-            await asyncio.get_event_loop().run_in_executor(
-                None, self._get_spacy, "en"
-            )
+            await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, "en")
             result["models"] = {"spacy_en": "ok"}
         except Exception as e:
             result["status"] = "warning"
@@ -893,7 +894,7 @@ class ClassifierTool(BaseTool):
         self._spacy_nlp.clear()

         # Clear metrics
-        self._metrics = {
+        self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}

         # Clear rate limiting data
         self._request_timestamps = []