aiecs-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/task_tools/classfire_tool.py

@@ -0,0 +1,922 @@

from aiecs.tools import register_tool
from aiecs.tools.tool_executor import (
    validate_input,
)
from aiecs.tools.base_tool import BaseTool
import os
import re
import logging
import asyncio
import time
from typing import Dict, Any, List, Optional, Tuple
from enum import Enum

from pydantic import BaseModel, Field, field_validator, ConfigDict

# Lazy imports for heavy dependencies
rake_nltk = None
spacy = None


def _init_heavy_dependencies():
    """Initialize heavy dependencies when actually needed"""
    global rake_nltk, spacy

    if rake_nltk is None:
        try:
            import rake_nltk as _rake_nltk

            rake_nltk = _rake_nltk
        except ImportError:
            import logging

            logging.getLogger(__name__).error("rake_nltk not available")

    if spacy is None:
        try:
            import spacy as _spacy

            spacy = _spacy
        except ImportError:
            import logging

            logging.getLogger(__name__).warning("spacy not available (optional)")


# Enums for configuration options


class Language(str, Enum):
    ENGLISH = "en"
    CHINESE = "zh"
    AUTO = "auto"


class ModelType(str, Enum):
    SPACY_ENGLISH = "en_core_web_sm"
    SPACY_CHINESE = "zh_core_web_sm"


@register_tool("classifier")
class ClassifierTool(BaseTool):
    """
    Text classification, tokenization, POS tagging, NER, lemmatization, dependency parsing,
    keyword extraction, and summarization tool.

    Operations:
    - classify: Sentiment or topic classification.
    - tokenize: Tokenize text.
    - pos_tag: Part-of-speech tagging.
    - ner: Named entity recognition.
    - lemmatize: Lemmatize tokens.
    - dependency_parse: Dependency parsing.
    - keyword_extract: Extract key phrases.
    - summarize: Summarize text.
    - batch_process: Process multiple texts with any operation.

    Supports English (spaCy) and Chinese (Jieba, spaCy).
    """

    # Configuration schema
    class Config(BaseModel):
        """Configuration for the classifier tool"""

        max_workers: int = Field(
            default=min(32, (os.cpu_count() or 4) * 2),
            description="Maximum number of worker threads",
        )
        pipeline_cache_ttl: int = Field(
            default=3600,
            description="Time-to-live for pipeline cache in seconds",
        )
        pipeline_cache_size: int = Field(
            default=10, description="Maximum number of pipeline cache entries"
        )
        max_text_length: int = Field(
            default=10_000, description="Maximum text length in characters"
        )
        spacy_model_en: str = Field(default="en_core_web_sm", description="spaCy model for English")
        spacy_model_zh: str = Field(default="zh_core_web_sm", description="spaCy model for Chinese")
        allowed_models: List[str] = Field(
            default=["en_core_web_sm", "zh_core_web_sm"],
            description="List of allowed spaCy models",
        )
        rate_limit_enabled: bool = Field(default=True, description="Enable rate limiting")
        rate_limit_requests: int = Field(default=100, description="Maximum requests per window")
        rate_limit_window: int = Field(default=60, description="Rate limit window in seconds")
        use_rake_for_english: bool = Field(
            default=True, description="Use RAKE for English phrase extraction"
        )

        model_config = ConfigDict(env_prefix="CLASSIFIER_TOOL_")

    # Base schema for text operations
    class BaseTextSchema(BaseModel):
        """Base schema for text operations"""

        text: str = Field(description="Text to process")

        @field_validator("text")
        @classmethod
        def check_length_and_content(cls, v: str) -> str:
            if len(v) > 10_000:  # Using a constant here for validation
                raise ValueError("Text length exceeds 10,000 characters")
            # Check for malicious patterns (e.g., SQL injection)
            if re.search(
                r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
                v,
                re.IGNORECASE,
            ):
                raise ValueError("Text contains potentially malicious content")
            return v

    # Input schemas for operations
    class ClassifySchema(BaseTextSchema):
        """Schema for text classification"""

        model: Optional[str] = Field(default=None, description="Model to use for classification")
        language: Optional[Language] = Field(default=None, description="Language of the text")

        @field_validator("model")
        @classmethod
        def check_model(cls, v: Optional[str]) -> Optional[str]:
            allowed_models = ["en_core_web_sm", "zh_core_web_sm"]
            if v and v not in allowed_models:
                raise ValueError(f"Model '{v}' not in allowed spaCy models: {allowed_models}")
            return v

    class TokenizeSchema(BaseTextSchema):
        """Schema for text tokenization"""

        language: Optional[Language] = Field(default=None, description="Language of the text")

    class PosTagSchema(BaseTextSchema):
        """Schema for part-of-speech tagging"""

        language: Optional[Language] = Field(default=None, description="Language of the text")

    class NERSchema(BaseTextSchema):
        """Schema for named entity recognition"""

        language: Optional[Language] = Field(default=None, description="Language of the text")

    class LemmatizeSchema(BaseTextSchema):
        """Schema for lemmatization"""

        language: Optional[Language] = Field(default=None, description="Language of the text")

    class DependencyParseSchema(BaseTextSchema):
        """Schema for dependency parsing"""

        language: Optional[Language] = Field(default=None, description="Language of the text")

    class KeywordExtractSchema(BaseTextSchema):
        """Schema for keyword extraction"""

        top_k: int = Field(default=10, description="Number of keywords to extract")
        language: Optional[Language] = Field(default=None, description="Language of the text")
        extract_phrases: bool = Field(
            default=True,
            description="Whether to extract phrases or just keywords",
        )

    class SummarizeSchema(BaseTextSchema):
        """Schema for text summarization"""

        max_length: int = Field(default=150, description="Maximum length of the summary")
        language: Optional[Language] = Field(default=None, description="Language of the text")

    class BatchProcessSchema(BaseModel):
        """Schema for batch processing"""

        texts: List[str] = Field(description="List of texts to process")
        operation: str = Field(description="Operation to perform on each text")
        language: Optional[Language] = Field(default=None, description="Language of the texts")
        model: Optional[str] = Field(default=None, description="Model to use for processing")
        top_k: Optional[int] = Field(
            default=None,
            description="Number of keywords to extract (for keyword_extract)",
        )
        max_length: Optional[int] = Field(
            default=None,
            description="Maximum length of the summary (for summarize)",
        )

        @field_validator("texts")
        @classmethod
        def check_texts(cls, v: List[str]) -> List[str]:
            for text in v:
                if len(text) > 10_000:  # Using a constant here for validation
                    raise ValueError("Text length exceeds 10,000 characters")
                if re.search(
                    r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
                    text,
                    re.IGNORECASE,
                ):
                    raise ValueError("Text contains potentially malicious content")
            return v

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize ClassifierTool with settings and resources.

        Args:
            config (Dict, optional): Configuration overrides for ClassifierSettings.

        Raises:
            ValueError: If config contains invalid settings.
        """
        super().__init__(config)

        # Parse configuration
        self.config = self.Config(**(config or {}))

        # Set up logger
        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

        # Initialize resources
        self._spacy_nlp = {}  # Language -> spaCy pipeline
        self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}
        self._request_timestamps = []

    def _get_sentiment_lexicon(self, language: str) -> Dict[str, float]:
        """
        Get sentiment lexicon for the specified language.

        Args:
            language (str): Language code ('en', 'zh').

        Returns:
            Dict[str, float]: Sentiment lexicon with word -> score mapping.
        """
        if language == "en":
            # Simple English sentiment lexicon
            return {
                "good": 1.0,
                "great": 1.5,
                "excellent": 2.0,
                "amazing": 2.0,
                "wonderful": 1.5,
                "fantastic": 2.0,
                "awesome": 1.5,
                "perfect": 2.0,
                "love": 1.5,
                "like": 1.0,
                "happy": 1.5,
                "pleased": 1.0,
                "satisfied": 1.0,
                "positive": 1.0,
                "best": 2.0,
                "bad": -1.0,
                "terrible": -2.0,
                "awful": -2.0,
                "horrible": -2.0,
                "hate": -2.0,
                "dislike": -1.0,
                "sad": -1.5,
                "angry": -1.5,
                "disappointed": -1.5,
                "negative": -1.0,
                "worst": -2.0,
                "poor": -1.0,
                "fail": -1.5,
                "wrong": -1.0,
                "problem": -1.0,
            }
        else:  # Chinese
            return {
                "好": 1.0,
                "很好": 1.5,
                "非常好": 2.0,
                "棒": 1.5,
                "优秀": 2.0,
                "完美": 2.0,
                "喜欢": 1.5,
                "爱": 2.0,
                "满意": 1.0,
                "开心": 1.5,
                "高兴": 1.5,
                "积极": 1.0,
                "坏": -1.0,
                "很坏": -1.5,
                "糟糕": -2.0,
                "讨厌": -2.0,
                "恨": -2.0,
                "失望": -1.5,
                "生气": -1.5,
                "愤怒": -2.0,
                "消极": -1.0,
                "问题": -1.0,
                "错误": -1.0,
                "失败": -1.5,
            }

    def _get_spacy(self, language: str) -> Any:
        """
        Get a spaCy pipeline for the specified language.

        Args:
            language (str): Language code ('en', 'zh').

        Returns:
            Any: spaCy NLP object.
        """
        global spacy
        if spacy is None:
            try:
                import spacy as spacy_module

                spacy = spacy_module
            except ImportError:
                raise ImportError(
                    "spaCy is required but not installed. Please install it with: pip install spacy"
                )

        model = self.config.spacy_model_zh if language == "zh" else self.config.spacy_model_en
        return spacy.load(model, disable=["textcat"])

    def _detect_language(self, text: str) -> str:
        """
        Detect the language of the input text using character analysis.

        Args:
            text (str): Input text.

        Returns:
            str: Language code ('en', 'zh', or 'en' for unknown).
        """
        try:
            # Count Chinese characters (CJK Unified Ideographs)
            chinese_chars = sum(1 for char in text if "\u4e00" <= char <= "\u9fff")
            total_chars = len([char for char in text if char.isalpha()])

            if total_chars == 0:
                return "en"

            # If more than 30% are Chinese characters, consider it Chinese
            chinese_ratio = chinese_chars / total_chars
            return "zh" if chinese_ratio > 0.3 else "en"
        except Exception:
            return "en"

    def _check_rate_limit(self) -> bool:
        """
        Check if the request is within rate limits.

        Returns:
            bool: True if within limits, False otherwise.
        """
        if not self.config.rate_limit_enabled:
            return True

        current_time = time.time()

        # Get lock from executor
        with self._executor.get_lock("rate_limit"):
            # Remove timestamps outside the window
            self._request_timestamps = [
                ts
                for ts in self._request_timestamps
                if current_time - ts <= self.config.rate_limit_window
            ]

            # Check if we're at the limit
            if len(self._request_timestamps) >= self.config.rate_limit_requests:
                return False

            # Add current timestamp
            self._request_timestamps.append(current_time)
            return True

    def _extract_english_phrases(self, text: str, top_k: int) -> List[str]:
        """
        Extract key phrases from English text using RAKE.

        Args:
            text (str): Input text.
            top_k (int): Number of phrases to extract.

        Returns:
            List[str]: Extracted phrases.
        """
        try:
            # Initialize heavy dependencies if needed
            _init_heavy_dependencies()

            if rake_nltk is None:
                raise ImportError("rake_nltk not available")

            rake = rake_nltk.Rake()
            rake.extract_keywords_from_text(text)
            phrases = rake.get_ranked_phrases()[:top_k]
            return phrases
        except Exception as e:
            self.logger.error(f"Error extracting English phrases: {e}")
            # Fallback to simple keyword extraction
            nlp = self._get_spacy("en")
            doc = nlp(text)
            keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
            return keywords

    def _extract_chinese_phrases(self, text: str, top_k: int) -> List[str]:
        """
        Extract key phrases from Chinese text using spaCy.

        Args:
            text (str): Input text.
            top_k (int): Number of phrases to extract.

        Returns:
            List[str]: Extracted phrases.
        """
        try:
            nlp = self._get_spacy("zh")
            doc = nlp(text)

            # Extract noun phrases and named entities
            phrases = []

            # Add noun chunks
            for chunk in doc.noun_chunks:
                if len(chunk.text.strip()) > 1:
                    phrases.append(chunk.text.strip())

            # Add named entities
            for ent in doc.ents:
                if len(ent.text.strip()) > 1:
                    phrases.append(ent.text.strip())

            # Add important nouns and proper nouns
            for token in doc:
                if token.pos_ in ("NOUN", "PROPN") and len(token.text.strip()) > 1:
                    phrases.append(token.text.strip())

            # Remove duplicates and return top_k
            unique_phrases = list(dict.fromkeys(phrases))  # Preserve order
            return unique_phrases[:top_k]

        except Exception as e:
            self.logger.error(f"Error extracting Chinese phrases with spaCy: {e}")
            # Fallback to simple noun extraction
            try:
                nlp = self._get_spacy("zh")
                doc = nlp(text)
                nouns = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")]
                return nouns[:top_k]
            except Exception:
                return []

    def _get_hf_pipeline(self, task: str, model: str):
        """
        Get a Hugging Face transformers pipeline for the specified task and model.

        Args:
            task (str): The task type (e.g., "summarization").
            model (str): The model name.

        Returns:
            Any: Hugging Face pipeline object.

        Raises:
            ImportError: If transformers library is not available.
            ValueError: If the pipeline creation fails.
        """
        try:
            from transformers import pipeline

            return pipeline(task, model=model)
        except ImportError:
            raise ImportError(
                "transformers library is required for summarization but not installed. Please install it with: pip install transformers"
            )
        except Exception as e:
            raise ValueError(f"Error creating pipeline for task '{task}' with model '{model}': {e}")

    async def classify(
        self,
        text: str,
        model: Optional[str] = None,
        language: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Perform sentiment classification on text using spaCy and lexicon-based approach.

        Args:
            text (str): Text to classify.
            model (Optional[str]): spaCy model to use (optional, auto-detected).
            language (Optional[str]): Language of the text.

        Returns:
            List[Dict[str, Any]]: Classification results [{'label': str, 'score': float}].
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        # Get spaCy pipeline and sentiment lexicon
        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        sentiment_lexicon = self._get_sentiment_lexicon(language)

        # Process text with spaCy
        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        # Calculate sentiment score
        sentiment_score = 0.0
        word_count = 0

        for token in doc:
            if not token.is_stop and not token.is_punct and token.text.lower() in sentiment_lexicon:
                sentiment_score += sentiment_lexicon[token.text.lower()]
                word_count += 1

        # Normalize score
        if word_count > 0:
            sentiment_score = sentiment_score / word_count

        # Determine label and confidence
        if sentiment_score > 0.1:
            label = "POSITIVE"
            confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
        elif sentiment_score < -0.1:
            label = "NEGATIVE"
            confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
        else:
            label = "NEUTRAL"
            confidence = 0.6

        return [{"label": label, "score": confidence}]

    async def tokenize(self, text: str, language: Optional[str] = None) -> List[str]:
        """
        Tokenize text into words or tokens using spaCy.

        Args:
            text (str): Text to tokenize.
            language (Optional[str]): Language of the text.

        Returns:
            List[str]: List of tokens.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        return [token.text for token in doc]

    async def pos_tag(self, text: str, language: Optional[str] = None) -> List[Tuple[str, str]]:
        """
        Perform part-of-speech tagging using spaCy, returning (token, pos) pairs.

        Args:
            text (str): Text to tag.
            language (Optional[str]): Language of the text.

        Returns:
            List[Tuple[str, str]]: List of (token, POS tag) tuples.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        return [(token.text, token.pos_) for token in doc]

    @validate_input(NERSchema)
    async def ner(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Perform named entity recognition.

        Args:
            text (str): Text to analyze.
            language (Optional[str]): Language of the text.

        Returns:
            List[Dict[str, Any]]: List of named entities with text, label, start, and end.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        return [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]

    @validate_input(LemmatizeSchema)
    async def lemmatize(self, text: str, language: Optional[str] = None) -> List[str]:
        """
        Lemmatize tokens in text using spaCy.

        Args:
            text (str): Text to lemmatize.
            language (Optional[str]): Language of the text.

        Returns:
            List[str]: List of lemmatized tokens.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        # For Chinese, lemma might be the same as text, but spaCy handles it
        # consistently
        return [token.lemma_ for token in doc]

    @validate_input(DependencyParseSchema)
    async def dependency_parse(
        self, text: str, language: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Perform dependency parsing using spaCy (supports both English and Chinese).

        Args:
            text (str): Text to parse.
            language (Optional[str]): Language of the text.

        Returns:
            List[Dict[str, Any]]: List of tokens with dependency information.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)

        doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

        return [
            {
                "text": token.text,
                "head": token.head.text,
                "dep": token.dep_,
                "pos": token.pos_,
            }
            for token in doc
        ]

    @validate_input(KeywordExtractSchema)
    async def keyword_extract(
        self,
        text: str,
        top_k: int = 10,
        language: Optional[str] = None,
        extract_phrases: bool = True,
    ) -> List[str]:
        """
        Extract keywords or key phrases from text using spaCy.

        Args:
            text (str): Text to analyze.
            top_k (int): Number of keywords to extract.
            language (Optional[str]): Language of the text.
            extract_phrases (bool): Whether to extract phrases or just keywords.

        Returns:
            List[str]: List of extracted keywords or phrases.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)

        if language == "zh":
            if extract_phrases:
                return await asyncio.get_event_loop().run_in_executor(
                    None, self._extract_chinese_phrases, text, top_k
                )
            else:
                # Extract simple keywords using spaCy
                nlp = await asyncio.get_event_loop().run_in_executor(
                    None, self._get_spacy, language
                )

                doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

                keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
                return keywords
        else:  # English or other languages
            if extract_phrases and self.config.use_rake_for_english:
                return await asyncio.get_event_loop().run_in_executor(
                    None, self._extract_english_phrases, text, top_k
                )
            else:
                nlp = await asyncio.get_event_loop().run_in_executor(
                    None, self._get_spacy, language
                )

                doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)

                keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
                return keywords

    @validate_input(SummarizeSchema)
    async def summarize(
        self, text: str, max_length: int = 150, language: Optional[str] = None
    ) -> str:
        """
        Summarize text.

        Args:
            text (str): Text to summarize.
            max_length (int): Maximum length of the summary.
            language (Optional[str]): Language of the text.

        Returns:
            str: Summarized text.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        language = language or self._detect_language(text)
        # Use appropriate models for summarization
        if language == "en":
            model = "facebook/bart-large-cnn"
        else:
            # For Chinese and other languages, use a multilingual model
            # For now, use t5-base, but consider using a Chinese-specific model
            # in the future
            model = "t5-base"

        pipe = await asyncio.get_event_loop().run_in_executor(
            None, self._get_hf_pipeline, "summarization", model
        )

        # Different models use different parameter names for length control
        if model.startswith("t5"):
            # T5 models use max_new_tokens instead of max_length
            # For Chinese text, use a more conservative approach
            if language == "zh":
                # Chinese text: use character count and be more conservative
                input_chars = len(text)
                max_new_tokens = min(max_length, max(input_chars // 4, 5))
                min_new_tokens = 2
            else:
                # English text: use word count
                input_words = len(text.split())
                max_new_tokens = min(max_length, max(input_words // 2, 10))
                min_new_tokens = 5

            result = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: pipe(
                    text,
                    max_new_tokens=max_new_tokens,
                    min_new_tokens=min_new_tokens,
                    do_sample=False,
                )[0]["summary_text"],
            )
        else:
            # BART and other models use max_length
            if language == "zh":
                # Chinese text: use character count
                input_chars = len(text)
                max_len = min(max_length, max(input_chars // 4, 10))
                min_len = 5
            else:
                # English text: use word count
                input_words = len(text.split())
                max_len = min(max_length, max(input_words // 2, 20))
                min_len = 10

            result = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: pipe(
                    text,
                    max_length=max_len,
                    min_length=min_len,
                    do_sample=False,
                )[0]["summary_text"],
            )

        return result

    @validate_input(BatchProcessSchema)
    async def batch_process(
        self,
        texts: List[str],
        operation: str,
        language: Optional[str] = None,
        model: Optional[str] = None,
        top_k: Optional[int] = None,
        max_length: Optional[int] = None,
    ) -> List[Any]:
        """
        Process multiple texts with the specified operation.

        Args:
            texts (List[str]): List of texts to process.
            operation (str): Operation to perform on each text.
            language (Optional[str]): Language of the texts.
            model (Optional[str]): Model to use for processing.
            top_k (Optional[int]): Number of keywords to extract (for keyword_extract).
            max_length (Optional[int]): Maximum length of the summary (for summarize).

        Returns:
            List[Any]: List of operation results.
        """
        if not self._check_rate_limit():
            raise ValueError("Rate limit exceeded. Please try again later.")

        # Prepare operations to execute in batch
        operations = []
        for text in texts:
            kwargs = {"text": text}
            if language:
                kwargs["language"] = language
            if model and operation == "classify":
                kwargs["model"] = model
            if top_k and operation == "keyword_extract":
                kwargs["top_k"] = top_k
            if max_length and operation == "summarize":
                kwargs["max_length"] = max_length

            operations.append({"op": operation, "kwargs": kwargs})

        # Execute batch operations
        return await self.run_batch(operations)

    async def health_check(self) -> Dict[str, Any]:
        """
        Perform a health check on the tool.

        Returns:
            Dict[str, Any]: Health check results.
        """
        result = {
            "status": "ok",
            "metrics": {
                "requests": self._metrics["requests"],
                "cache_hits": self._metrics["cache_hits"],
                "avg_processing_time": (
                    sum(self._metrics["processing_time"]) / len(self._metrics["processing_time"])
                    if self._metrics["processing_time"]
                    else 0.0
                ),
            },
            "config": {
                "max_workers": self.config.max_workers,
                "pipeline_cache_size": self.config.pipeline_cache_size,
                "rate_limit_enabled": self.config.rate_limit_enabled,
                "rate_limit_requests": self.config.rate_limit_requests,
                "rate_limit_window": self.config.rate_limit_window,
            },
        }

        # Check if models can be loaded
        try:
            await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, "en")
            result["models"] = {"spacy_en": "ok"}
        except Exception as e:
            result["status"] = "warning"
            result["models"] = {"spacy_en": f"error: {str(e)}"}

        return result

    async def cleanup(self) -> None:
        """
        Clean up resources used by the tool.
        """
        # Clear spaCy models
        self._spacy_nlp.clear()

        # Clear metrics
        self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}

        # Clear rate limiting data
        self._request_timestamps = []

        self.logger.info("ClassifierTool resources cleaned up")
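For orientation, the following is a minimal, hypothetical usage sketch of the tool shown above; it is not taken from the package's own documentation. It assumes the module import path matches the file layout in the listing (aiecs.tools.task_tools.classfire_tool), that the spaCy models named in the Config defaults (for example en_core_web_sm) are installed locally, and that a plain asyncio event loop is an acceptable driver for the async methods.

# Illustrative sketch only; import path and configuration values are assumptions.
import asyncio

from aiecs.tools.task_tools.classfire_tool import ClassifierTool


async def main() -> None:
    # Configuration keys mirror the nested Config schema (rate limiting,
    # spaCy model names, RAKE usage for English, etc.).
    tool = ClassifierTool(config={"rate_limit_requests": 10})

    text = "The new release is excellent and the documentation is great."

    # Lexicon-based sentiment classification: [{"label": ..., "score": ...}]
    print(await tool.classify(text))

    # Keyword/phrase extraction (RAKE for English when enabled in config).
    print(await tool.keyword_extract(text, top_k=5))

    # spaCy-backed tokenization and (token, POS) pairs.
    print(await tool.tokenize(text))
    print(await tool.pos_tag(text))

    # Report model availability and runtime metrics, then release resources.
    print(await tool.health_check())
    await tool.cleanup()


asyncio.run(main())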