aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/task_tools/classfire_tool.py
@@ -0,0 +1,922 @@
+ from aiecs.tools import register_tool
+ from aiecs.tools.tool_executor import (
+     validate_input,
+ )
+ from aiecs.tools.base_tool import BaseTool
+ import os
+ import re
+ import logging
+ import asyncio
+ import time
+ from typing import Dict, Any, List, Optional, Tuple
+ from enum import Enum
+
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
+
+ # Lazy imports for heavy dependencies
+ rake_nltk = None
+ spacy = None
+
+
+ def _init_heavy_dependencies():
+     """Initialize heavy dependencies when actually needed"""
+     global rake_nltk, spacy
+
+     if rake_nltk is None:
+         try:
+             import rake_nltk as _rake_nltk
+
+             rake_nltk = _rake_nltk
+         except ImportError:
+             import logging
+
+             logging.getLogger(__name__).error("rake_nltk not available")
+
+     if spacy is None:
+         try:
+             import spacy as _spacy
+
+             spacy = _spacy
+         except ImportError:
+             import logging
+
+             logging.getLogger(__name__).warning("spacy not available (optional)")
+
+
+ # Enums for configuration options
+
+
+ class Language(str, Enum):
+     ENGLISH = "en"
+     CHINESE = "zh"
+     AUTO = "auto"
+
+
+ class ModelType(str, Enum):
+     SPACY_ENGLISH = "en_core_web_sm"
+     SPACY_CHINESE = "zh_core_web_sm"
+
+
+ @register_tool("classifier")
+ class ClassifierTool(BaseTool):
+     """
+     Text classification, tokenization, POS tagging, NER, lemmatization, dependency parsing,
+     keyword extraction, and summarization tool.
+
+     Operations:
+       - classify: Sentiment or topic classification.
+       - tokenize: Tokenize text.
+       - pos_tag: Part-of-speech tagging.
+       - ner: Named entity recognition.
+       - lemmatize: Lemmatize tokens.
+       - dependency_parse: Dependency parsing.
+       - keyword_extract: Extract key phrases.
+       - summarize: Summarize text.
+       - batch_process: Process multiple texts with any operation.
+
+     Supports English and Chinese text via spaCy pipelines.
+     """
+
+     # Configuration schema
+     class Config(BaseModel):
+         """Configuration for the classifier tool"""
+
+         max_workers: int = Field(
+             default=min(32, (os.cpu_count() or 4) * 2),
+             description="Maximum number of worker threads",
+         )
+         pipeline_cache_ttl: int = Field(
+             default=3600,
+             description="Time-to-live for pipeline cache in seconds",
+         )
+         pipeline_cache_size: int = Field(
+             default=10, description="Maximum number of pipeline cache entries"
+         )
+         max_text_length: int = Field(
+             default=10_000, description="Maximum text length in characters"
+         )
+         spacy_model_en: str = Field(default="en_core_web_sm", description="spaCy model for English")
+         spacy_model_zh: str = Field(default="zh_core_web_sm", description="spaCy model for Chinese")
+         allowed_models: List[str] = Field(
+             default=["en_core_web_sm", "zh_core_web_sm"],
+             description="List of allowed spaCy models",
+         )
+         rate_limit_enabled: bool = Field(default=True, description="Enable rate limiting")
+         rate_limit_requests: int = Field(default=100, description="Maximum requests per window")
+         rate_limit_window: int = Field(default=60, description="Rate limit window in seconds")
+         use_rake_for_english: bool = Field(
+             default=True, description="Use RAKE for English phrase extraction"
+         )
+
+         model_config = ConfigDict(env_prefix="CLASSIFIER_TOOL_")
+
+     # Base schema for text operations
+     class BaseTextSchema(BaseModel):
+         """Base schema for text operations"""
+
+         text: str = Field(description="Text to process")
+
+         @field_validator("text")
+         @classmethod
+         def check_length_and_content(cls, v: str) -> str:
+             if len(v) > 10_000:  # Using a constant here for validation
+                 raise ValueError("Text length exceeds 10,000 characters")
+             # Check for malicious patterns (e.g., SQL injection)
+             if re.search(
+                 r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
+                 v,
+                 re.IGNORECASE,
+             ):
+                 raise ValueError("Text contains potentially malicious content")
+             return v
+
+     # Input schemas for operations
+     class ClassifySchema(BaseTextSchema):
+         """Schema for text classification"""
+
+         model: Optional[str] = Field(default=None, description="Model to use for classification")
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+         @field_validator("model")
+         @classmethod
+         def check_model(cls, v: Optional[str]) -> Optional[str]:
+             allowed_models = ["en_core_web_sm", "zh_core_web_sm"]
+             if v and v not in allowed_models:
+                 raise ValueError(f"Model '{v}' not in allowed spaCy models: {allowed_models}")
+             return v
+
+     class TokenizeSchema(BaseTextSchema):
+         """Schema for text tokenization"""
+
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class PosTagSchema(BaseTextSchema):
+         """Schema for part-of-speech tagging"""
+
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class NERSchema(BaseTextSchema):
+         """Schema for named entity recognition"""
+
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class LemmatizeSchema(BaseTextSchema):
+         """Schema for lemmatization"""
+
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class DependencyParseSchema(BaseTextSchema):
+         """Schema for dependency parsing"""
+
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class KeywordExtractSchema(BaseTextSchema):
+         """Schema for keyword extraction"""
+
+         top_k: int = Field(default=10, description="Number of keywords to extract")
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+         extract_phrases: bool = Field(
+             default=True,
+             description="Whether to extract phrases or just keywords",
+         )
+
+     class SummarizeSchema(BaseTextSchema):
+         """Schema for text summarization"""
+
+         max_length: int = Field(default=150, description="Maximum length of the summary")
+         language: Optional[Language] = Field(default=None, description="Language of the text")
+
+     class BatchProcessSchema(BaseModel):
+         """Schema for batch processing"""
+
+         texts: List[str] = Field(description="List of texts to process")
+         operation: str = Field(description="Operation to perform on each text")
+         language: Optional[Language] = Field(default=None, description="Language of the texts")
+         model: Optional[str] = Field(default=None, description="Model to use for processing")
+         top_k: Optional[int] = Field(
+             default=None,
+             description="Number of keywords to extract (for keyword_extract)",
+         )
+         max_length: Optional[int] = Field(
+             default=None,
+             description="Maximum length of the summary (for summarize)",
+         )
+
+         @field_validator("texts")
+         @classmethod
+         def check_texts(cls, v: List[str]) -> List[str]:
+             for text in v:
+                 if len(text) > 10_000:  # Using a constant here for validation
+                     raise ValueError("Text length exceeds 10,000 characters")
+                 if re.search(
+                     r"(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)",
+                     text,
+                     re.IGNORECASE,
+                 ):
+                     raise ValueError("Text contains potentially malicious content")
+             return v
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """
+         Initialize ClassifierTool with settings and resources.
+
+         Args:
+             config (Dict, optional): Configuration overrides for ClassifierSettings.
+
+         Raises:
+             ValueError: If config contains invalid settings.
+         """
+         super().__init__(config)
+
+         # Parse configuration
+         self.config = self.Config(**(config or {}))
+
+         # Set up logger
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+             self.logger.addHandler(handler)
+             self.logger.setLevel(logging.INFO)
+
+         # Initialize resources
+         self._spacy_nlp = {}  # Language -> spaCy pipeline
+         self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}
+         self._request_timestamps = []
+
+     def _get_sentiment_lexicon(self, language: str) -> Dict[str, float]:
+         """
+         Get sentiment lexicon for the specified language.
+
+         Args:
+             language (str): Language code ('en', 'zh').
+
+         Returns:
+             Dict[str, float]: Sentiment lexicon with word -> score mapping.
+         """
+         if language == "en":
+             # Simple English sentiment lexicon
+             return {
+                 "good": 1.0,
+                 "great": 1.5,
+                 "excellent": 2.0,
+                 "amazing": 2.0,
+                 "wonderful": 1.5,
+                 "fantastic": 2.0,
+                 "awesome": 1.5,
+                 "perfect": 2.0,
+                 "love": 1.5,
+                 "like": 1.0,
+                 "happy": 1.5,
+                 "pleased": 1.0,
+                 "satisfied": 1.0,
+                 "positive": 1.0,
+                 "best": 2.0,
+                 "bad": -1.0,
+                 "terrible": -2.0,
+                 "awful": -2.0,
+                 "horrible": -2.0,
+                 "hate": -2.0,
+                 "dislike": -1.0,
+                 "sad": -1.5,
+                 "angry": -1.5,
+                 "disappointed": -1.5,
+                 "negative": -1.0,
+                 "worst": -2.0,
+                 "poor": -1.0,
+                 "fail": -1.5,
+                 "wrong": -1.0,
+                 "problem": -1.0,
+             }
+         else:  # Chinese
+             return {
+                 "好": 1.0,
+                 "很好": 1.5,
+                 "非常好": 2.0,
+                 "棒": 1.5,
+                 "优秀": 2.0,
+                 "完美": 2.0,
+                 "喜欢": 1.5,
+                 "爱": 2.0,
+                 "满意": 1.0,
+                 "开心": 1.5,
+                 "高兴": 1.5,
+                 "积极": 1.0,
+                 "坏": -1.0,
+                 "很坏": -1.5,
+                 "糟糕": -2.0,
+                 "讨厌": -2.0,
+                 "恨": -2.0,
+                 "失望": -1.5,
+                 "生气": -1.5,
+                 "愤怒": -2.0,
+                 "消极": -1.0,
+                 "问题": -1.0,
+                 "错误": -1.0,
+                 "失败": -1.5,
+             }
+
+     def _get_spacy(self, language: str) -> Any:
+         """
+         Get a spaCy pipeline for the specified language.
+
+         Args:
+             language (str): Language code ('en', 'zh').
+
+         Returns:
+             Any: spaCy NLP object.
+         """
+         global spacy
+         if spacy is None:
+             try:
+                 import spacy as spacy_module
+
+                 spacy = spacy_module
+             except ImportError:
+                 raise ImportError(
+                     "spaCy is required but not installed. Please install it with: pip install spacy"
+                 )
+
+         model = self.config.spacy_model_zh if language == "zh" else self.config.spacy_model_en
+         return spacy.load(model, disable=["textcat"])
+
+     def _detect_language(self, text: str) -> str:
+         """
+         Detect the language of the input text using character analysis.
+
+         Args:
+             text (str): Input text.
+
+         Returns:
+             str: Language code ('en' or 'zh'; defaults to 'en' when detection is inconclusive).
+         """
+         try:
+             # Count Chinese characters (CJK Unified Ideographs)
+             chinese_chars = sum(1 for char in text if "\u4e00" <= char <= "\u9fff")
+             total_chars = len([char for char in text if char.isalpha()])
+
+             if total_chars == 0:
+                 return "en"
+
+             # If more than 30% are Chinese characters, consider it Chinese
+             chinese_ratio = chinese_chars / total_chars
+             return "zh" if chinese_ratio > 0.3 else "en"
+         except Exception:
+             return "en"
+
+     def _check_rate_limit(self) -> bool:
+         """
+         Check if the request is within rate limits.
+
+         Returns:
+             bool: True if within limits, False otherwise.
+         """
+         if not self.config.rate_limit_enabled:
+             return True
+
+         current_time = time.time()
+
+         # Get lock from executor
+         with self._executor.get_lock("rate_limit"):
+             # Remove timestamps outside the window
+             self._request_timestamps = [
+                 ts
+                 for ts in self._request_timestamps
+                 if current_time - ts <= self.config.rate_limit_window
+             ]
+
+             # Check if we're at the limit
+             if len(self._request_timestamps) >= self.config.rate_limit_requests:
+                 return False
+
+             # Add current timestamp
+             self._request_timestamps.append(current_time)
+             return True
+
+     def _extract_english_phrases(self, text: str, top_k: int) -> List[str]:
+         """
+         Extract key phrases from English text using RAKE.
+
+         Args:
+             text (str): Input text.
+             top_k (int): Number of phrases to extract.
+
+         Returns:
+             List[str]: Extracted phrases.
+         """
+         try:
+             # Initialize heavy dependencies if needed
+             _init_heavy_dependencies()
+
+             if rake_nltk is None:
+                 raise ImportError("rake_nltk not available")
+
+             rake = rake_nltk.Rake()
+             rake.extract_keywords_from_text(text)
+             phrases = rake.get_ranked_phrases()[:top_k]
+             return phrases
+         except Exception as e:
+             self.logger.error(f"Error extracting English phrases: {e}")
+             # Fallback to simple keyword extraction
+             nlp = self._get_spacy("en")
+             doc = nlp(text)
+             keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
+             return keywords
+
+     def _extract_chinese_phrases(self, text: str, top_k: int) -> List[str]:
+         """
+         Extract key phrases from Chinese text using spaCy.
+
+         Args:
+             text (str): Input text.
+             top_k (int): Number of phrases to extract.
+
+         Returns:
+             List[str]: Extracted phrases.
+         """
+         try:
+             nlp = self._get_spacy("zh")
+             doc = nlp(text)
+
+             # Extract noun phrases and named entities
+             phrases = []
+
+             # Add noun chunks
+             for chunk in doc.noun_chunks:
+                 if len(chunk.text.strip()) > 1:
+                     phrases.append(chunk.text.strip())
+
+             # Add named entities
+             for ent in doc.ents:
+                 if len(ent.text.strip()) > 1:
+                     phrases.append(ent.text.strip())
+
+             # Add important nouns and proper nouns
+             for token in doc:
+                 if token.pos_ in ("NOUN", "PROPN") and len(token.text.strip()) > 1:
+                     phrases.append(token.text.strip())
+
+             # Remove duplicates and return top_k
+             unique_phrases = list(dict.fromkeys(phrases))  # Preserve order
+             return unique_phrases[:top_k]
+
+         except Exception as e:
+             self.logger.error(f"Error extracting Chinese phrases with spaCy: {e}")
+             # Fallback to simple noun extraction
+             try:
+                 nlp = self._get_spacy("zh")
+                 doc = nlp(text)
+                 nouns = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")]
+                 return nouns[:top_k]
+             except Exception:
+                 return []
+
+     def _get_hf_pipeline(self, task: str, model: str):
+         """
+         Get a Hugging Face transformers pipeline for the specified task and model.
+
+         Args:
+             task (str): The task type (e.g., "summarization").
+             model (str): The model name.
+
+         Returns:
+             Any: Hugging Face pipeline object.
+
+         Raises:
+             ImportError: If transformers library is not available.
+             ValueError: If the pipeline creation fails.
+         """
+         try:
+             from transformers import pipeline
+
+             return pipeline(task, model=model)
+         except ImportError:
+             raise ImportError(
+                 "transformers library is required for summarization but not installed. Please install it with: pip install transformers"
+             )
+         except Exception as e:
+             raise ValueError(f"Error creating pipeline for task '{task}' with model '{model}': {e}")
+
+     async def classify(
+         self,
+         text: str,
+         model: Optional[str] = None,
+         language: Optional[str] = None,
+     ) -> List[Dict[str, Any]]:
+         """
+         Perform sentiment classification on text using spaCy and lexicon-based approach.
+
+         Args:
+             text (str): Text to classify.
+             model (Optional[str]): spaCy model to use (optional, auto-detected).
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: Classification results [{'label': str, 'score': float}].
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         # Get spaCy pipeline and sentiment lexicon
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         sentiment_lexicon = self._get_sentiment_lexicon(language)
+
+         # Process text with spaCy
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         # Calculate sentiment score
+         sentiment_score = 0.0
+         word_count = 0
+
+         for token in doc:
+             if not token.is_stop and not token.is_punct and token.text.lower() in sentiment_lexicon:
+                 sentiment_score += sentiment_lexicon[token.text.lower()]
+                 word_count += 1
+
+         # Normalize score
+         if word_count > 0:
+             sentiment_score = sentiment_score / word_count
+
+         # Determine label and confidence
+         if sentiment_score > 0.1:
+             label = "POSITIVE"
+             confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
+         elif sentiment_score < -0.1:
+             label = "NEGATIVE"
+             confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
+         else:
+             label = "NEUTRAL"
+             confidence = 0.6
+
+         return [{"label": label, "score": confidence}]
+
+     async def tokenize(self, text: str, language: Optional[str] = None) -> List[str]:
+         """
+         Tokenize text into words or tokens using spaCy.
+
+         Args:
+             text (str): Text to tokenize.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[str]: List of tokens.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         return [token.text for token in doc]
+
+     async def pos_tag(self, text: str, language: Optional[str] = None) -> List[Tuple[str, str]]:
+         """
+         Perform part-of-speech tagging using spaCy, returning (token, pos) pairs.
+
+         Args:
+             text (str): Text to tag.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Tuple[str, str]]: List of (token, POS tag) tuples.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         return [(token.text, token.pos_) for token in doc]
+
+     @validate_input(NERSchema)
+     async def ner(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Perform named entity recognition.
+
+         Args:
+             text (str): Text to analyze.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: List of named entities with text, label, start, and end.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         return [
+             {
+                 "text": ent.text,
+                 "label": ent.label_,
+                 "start": ent.start_char,
+                 "end": ent.end_char,
+             }
+             for ent in doc.ents
+         ]
+
+     @validate_input(LemmatizeSchema)
+     async def lemmatize(self, text: str, language: Optional[str] = None) -> List[str]:
+         """
+         Lemmatize tokens in text using spaCy.
+
+         Args:
+             text (str): Text to lemmatize.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[str]: List of lemmatized tokens.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         # For Chinese, lemma might be the same as text, but spaCy handles it
+         # consistently
+         return [token.lemma_ for token in doc]
+
+     @validate_input(DependencyParseSchema)
+     async def dependency_parse(
+         self, text: str, language: Optional[str] = None
+     ) -> List[Dict[str, Any]]:
+         """
+         Perform dependency parsing using spaCy (supports both English and Chinese).
+
+         Args:
+             text (str): Text to parse.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: List of tokens with dependency information.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, language)
+
+         doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+         return [
+             {
+                 "text": token.text,
+                 "head": token.head.text,
+                 "dep": token.dep_,
+                 "pos": token.pos_,
+             }
+             for token in doc
+         ]
+
+     @validate_input(KeywordExtractSchema)
+     async def keyword_extract(
+         self,
+         text: str,
+         top_k: int = 10,
+         language: Optional[str] = None,
+         extract_phrases: bool = True,
+     ) -> List[str]:
+         """
+         Extract keywords or key phrases from text using spaCy.
+
+         Args:
+             text (str): Text to analyze.
+             top_k (int): Number of keywords to extract.
+             language (Optional[str]): Language of the text.
+             extract_phrases (bool): Whether to extract phrases or just keywords.
+
+         Returns:
+             List[str]: List of extracted keywords or phrases.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         if language == "zh":
+             if extract_phrases:
+                 return await asyncio.get_event_loop().run_in_executor(
+                     None, self._extract_chinese_phrases, text, top_k
+                 )
+             else:
+                 # Extract simple keywords using spaCy
+                 nlp = await asyncio.get_event_loop().run_in_executor(
+                     None, self._get_spacy, language
+                 )
+
+                 doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+                 keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
+                 return keywords
+         else:  # English or other languages
+             if extract_phrases and self.config.use_rake_for_english:
+                 return await asyncio.get_event_loop().run_in_executor(
+                     None, self._extract_english_phrases, text, top_k
+                 )
+             else:
+                 nlp = await asyncio.get_event_loop().run_in_executor(
+                     None, self._get_spacy, language
+                 )
+
+                 doc = await asyncio.get_event_loop().run_in_executor(None, nlp, text)
+
+                 keywords = [token.text for token in doc if token.pos_ in ("NOUN", "PROPN")][:top_k]
+                 return keywords
+
+     @validate_input(SummarizeSchema)
+     async def summarize(
+         self, text: str, max_length: int = 150, language: Optional[str] = None
+     ) -> str:
+         """
+         Summarize text.
+
+         Args:
+             text (str): Text to summarize.
+             max_length (int): Maximum length of the summary.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             str: Summarized text.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+         # Use appropriate models for summarization
+         if language == "en":
+             model = "facebook/bart-large-cnn"
+         else:
+             # For Chinese and other languages, use a multilingual model
+             # For now, use t5-base, but consider using a Chinese-specific model
+             # in the future
+             model = "t5-base"
+
+         pipe = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_hf_pipeline, "summarization", model
+         )
+
+         # Different models use different parameter names for length control
+         if model.startswith("t5"):
+             # T5 models use max_new_tokens instead of max_length
+             # For Chinese text, use a more conservative approach
+             if language == "zh":
+                 # Chinese text: use character count and be more conservative
+                 input_chars = len(text)
+                 max_new_tokens = min(max_length, max(input_chars // 4, 5))
+                 min_new_tokens = 2
+             else:
+                 # English text: use word count
+                 input_words = len(text.split())
+                 max_new_tokens = min(max_length, max(input_words // 2, 10))
+                 min_new_tokens = 5
+
+             result = await asyncio.get_event_loop().run_in_executor(
+                 None,
+                 lambda: pipe(
+                     text,
+                     max_new_tokens=max_new_tokens,
+                     min_new_tokens=min_new_tokens,
+                     do_sample=False,
+                 )[0]["summary_text"],
+             )
+         else:
+             # BART and other models use max_length
+             if language == "zh":
+                 # Chinese text: use character count
+                 input_chars = len(text)
+                 max_len = min(max_length, max(input_chars // 4, 10))
+                 min_len = 5
+             else:
+                 # English text: use word count
+                 input_words = len(text.split())
+                 max_len = min(max_length, max(input_words // 2, 20))
+                 min_len = 10
+
+             result = await asyncio.get_event_loop().run_in_executor(
+                 None,
+                 lambda: pipe(
+                     text,
+                     max_length=max_len,
+                     min_length=min_len,
+                     do_sample=False,
+                 )[0]["summary_text"],
+             )
+
+         return result
+
+     @validate_input(BatchProcessSchema)
+     async def batch_process(
+         self,
+         texts: List[str],
+         operation: str,
+         language: Optional[str] = None,
+         model: Optional[str] = None,
+         top_k: Optional[int] = None,
+         max_length: Optional[int] = None,
+     ) -> List[Any]:
+         """
+         Process multiple texts with the specified operation.
+
+         Args:
+             texts (List[str]): List of texts to process.
+             operation (str): Operation to perform on each text.
+             language (Optional[str]): Language of the texts.
+             model (Optional[str]): Model to use for processing.
+             top_k (Optional[int]): Number of keywords to extract (for keyword_extract).
+             max_length (Optional[int]): Maximum length of the summary (for summarize).
+
+         Returns:
+             List[Any]: List of operation results.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         # Prepare operations to execute in batch
+         operations = []
+         for text in texts:
+             kwargs = {"text": text}
+             if language:
+                 kwargs["language"] = language
+             if model and operation == "classify":
+                 kwargs["model"] = model
+             if top_k and operation == "keyword_extract":
+                 kwargs["top_k"] = top_k
+             if max_length and operation == "summarize":
+                 kwargs["max_length"] = max_length
+
+             operations.append({"op": operation, "kwargs": kwargs})
+
+         # Execute batch operations
+         return await self.run_batch(operations)
+
+     async def health_check(self) -> Dict[str, Any]:
+         """
+         Perform a health check on the tool.
+
+         Returns:
+             Dict[str, Any]: Health check results.
+         """
+         result = {
+             "status": "ok",
+             "metrics": {
+                 "requests": self._metrics["requests"],
+                 "cache_hits": self._metrics["cache_hits"],
+                 "avg_processing_time": (
+                     sum(self._metrics["processing_time"]) / len(self._metrics["processing_time"])
+                     if self._metrics["processing_time"]
+                     else 0.0
+                 ),
+             },
+             "config": {
+                 "max_workers": self.config.max_workers,
+                 "pipeline_cache_size": self.config.pipeline_cache_size,
+                 "rate_limit_enabled": self.config.rate_limit_enabled,
+                 "rate_limit_requests": self.config.rate_limit_requests,
+                 "rate_limit_window": self.config.rate_limit_window,
+             },
+         }
+
+         # Check if models can be loaded
+         try:
+             await asyncio.get_event_loop().run_in_executor(None, self._get_spacy, "en")
+             result["models"] = {"spacy_en": "ok"}
+         except Exception as e:
+             result["status"] = "warning"
+             result["models"] = {"spacy_en": f"error: {str(e)}"}
+
+         return result
+
+     async def cleanup(self) -> None:
+         """
+         Clean up resources used by the tool.
+         """
+         # Clear spaCy models
+         self._spacy_nlp.clear()
+
+         # Clear metrics
+         self._metrics = {"requests": 0, "cache_hits": 0, "processing_time": []}
+
+         # Clear rate limiting data
+         self._request_timestamps = []
+
+         self.logger.info("ClassifierTool resources cleaned up")
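
For orientation, the following is a minimal usage sketch of the ClassifierTool added in this release. The script itself is illustrative and not part of the wheel: it assumes aiecs 1.5.1 is installed along with spaCy and the en_core_web_sm model, and it calls only the constructor and async methods shown in the diff above (classify, tokenize, keyword_extract, cleanup). The config key passed to the constructor maps to a field of ClassifierTool.Config.

import asyncio

from aiecs.tools.task_tools.classfire_tool import ClassifierTool


async def main() -> None:
    # Config overrides map to ClassifierTool.Config fields; unset fields keep their defaults.
    tool = ClassifierTool(config={"rate_limit_requests": 50})

    # Language is auto-detected ('en' vs 'zh') when not passed explicitly.
    print(await tool.classify("The release notes look excellent."))
    print(await tool.tokenize("spaCy splits this sentence into tokens."))
    print(await tool.keyword_extract("Knowledge graphs link entities and relations.", top_k=5))

    # Release cached pipelines, metrics, and rate-limit state.
    await tool.cleanup()


if __name__ == "__main__":
    asyncio.run(main())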