aiecs-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic.

Files changed (90)
  1. aiecs/__init__.py +75 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +295 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +341 -0
  7. aiecs/config/__init__.py +15 -0
  8. aiecs/config/config.py +117 -0
  9. aiecs/config/registry.py +19 -0
  10. aiecs/core/__init__.py +46 -0
  11. aiecs/core/interface/__init__.py +34 -0
  12. aiecs/core/interface/execution_interface.py +150 -0
  13. aiecs/core/interface/storage_interface.py +214 -0
  14. aiecs/domain/__init__.py +20 -0
  15. aiecs/domain/context/__init__.py +28 -0
  16. aiecs/domain/context/content_engine.py +982 -0
  17. aiecs/domain/context/conversation_models.py +306 -0
  18. aiecs/domain/execution/__init__.py +12 -0
  19. aiecs/domain/execution/model.py +49 -0
  20. aiecs/domain/task/__init__.py +13 -0
  21. aiecs/domain/task/dsl_processor.py +460 -0
  22. aiecs/domain/task/model.py +50 -0
  23. aiecs/domain/task/task_context.py +257 -0
  24. aiecs/infrastructure/__init__.py +26 -0
  25. aiecs/infrastructure/messaging/__init__.py +13 -0
  26. aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
  27. aiecs/infrastructure/messaging/websocket_manager.py +289 -0
  28. aiecs/infrastructure/monitoring/__init__.py +12 -0
  29. aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
  30. aiecs/infrastructure/monitoring/structured_logger.py +50 -0
  31. aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
  32. aiecs/infrastructure/persistence/__init__.py +12 -0
  33. aiecs/infrastructure/persistence/database_manager.py +286 -0
  34. aiecs/infrastructure/persistence/file_storage.py +671 -0
  35. aiecs/infrastructure/persistence/redis_client.py +162 -0
  36. aiecs/llm/__init__.py +54 -0
  37. aiecs/llm/base_client.py +99 -0
  38. aiecs/llm/client_factory.py +339 -0
  39. aiecs/llm/custom_callbacks.py +228 -0
  40. aiecs/llm/openai_client.py +125 -0
  41. aiecs/llm/vertex_client.py +186 -0
  42. aiecs/llm/xai_client.py +184 -0
  43. aiecs/main.py +351 -0
  44. aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
  45. aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
  46. aiecs/scripts/README_WEASEL_PATCH.md +126 -0
  47. aiecs/scripts/__init__.py +3 -0
  48. aiecs/scripts/dependency_checker.py +825 -0
  49. aiecs/scripts/dependency_fixer.py +348 -0
  50. aiecs/scripts/download_nlp_data.py +348 -0
  51. aiecs/scripts/fix_weasel_validator.py +121 -0
  52. aiecs/scripts/fix_weasel_validator.sh +82 -0
  53. aiecs/scripts/patch_weasel_library.sh +188 -0
  54. aiecs/scripts/quick_dependency_check.py +269 -0
  55. aiecs/scripts/run_weasel_patch.sh +41 -0
  56. aiecs/scripts/setup_nlp_data.sh +217 -0
  57. aiecs/tasks/__init__.py +2 -0
  58. aiecs/tasks/worker.py +111 -0
  59. aiecs/tools/__init__.py +196 -0
  60. aiecs/tools/base_tool.py +202 -0
  61. aiecs/tools/langchain_adapter.py +361 -0
  62. aiecs/tools/task_tools/__init__.py +82 -0
  63. aiecs/tools/task_tools/chart_tool.py +704 -0
  64. aiecs/tools/task_tools/classfire_tool.py +901 -0
  65. aiecs/tools/task_tools/image_tool.py +397 -0
  66. aiecs/tools/task_tools/office_tool.py +600 -0
  67. aiecs/tools/task_tools/pandas_tool.py +565 -0
  68. aiecs/tools/task_tools/report_tool.py +499 -0
  69. aiecs/tools/task_tools/research_tool.py +363 -0
  70. aiecs/tools/task_tools/scraper_tool.py +548 -0
  71. aiecs/tools/task_tools/search_api.py +7 -0
  72. aiecs/tools/task_tools/stats_tool.py +513 -0
  73. aiecs/tools/temp_file_manager.py +126 -0
  74. aiecs/tools/tool_executor/__init__.py +35 -0
  75. aiecs/tools/tool_executor/tool_executor.py +518 -0
  76. aiecs/utils/LLM_output_structor.py +409 -0
  77. aiecs/utils/__init__.py +23 -0
  78. aiecs/utils/base_callback.py +50 -0
  79. aiecs/utils/execution_utils.py +158 -0
  80. aiecs/utils/logging.py +1 -0
  81. aiecs/utils/prompt_loader.py +13 -0
  82. aiecs/utils/token_usage_repository.py +279 -0
  83. aiecs/ws/__init__.py +0 -0
  84. aiecs/ws/socket_server.py +41 -0
  85. aiecs-1.0.0.dist-info/METADATA +610 -0
  86. aiecs-1.0.0.dist-info/RECORD +90 -0
  87. aiecs-1.0.0.dist-info/WHEEL +5 -0
  88. aiecs-1.0.0.dist-info/entry_points.txt +7 -0
  89. aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
  90. aiecs-1.0.0.dist-info/top_level.txt +1 -0
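
A wheel is a standard zip archive, so the file list above can be verified locally against the downloaded artifact using only the Python standard library. The sketch below is illustrative; the wheel filename is assumed to follow the normal naming convention rather than being taken from this page.

import zipfile

# Illustrative: enumerate the files packed into the wheel (a .whl is a zip archive).
# The filename is an assumption; adjust it to wherever the artifact was downloaded.
with zipfile.ZipFile("aiecs-1.0.0-py3-none-any.whl") as whl:
    for info in whl.infolist():
        print(f"{info.file_size:>9}  {info.filename}")

The expanded diff below corresponds to item 64 in the list, aiecs/tools/task_tools/classfire_tool.py (+901 lines).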
aiecs/tools/task_tools/classfire_tool.py
@@ -0,0 +1,901 @@
+ import os
+ import re
+ import logging
+ import asyncio
+ import time
+ from typing import Dict, Any, List, Optional, Union, Tuple
+ from enum import Enum
+
+ from pydantic import BaseModel, Field, field_validator, ValidationError, ConfigDict
+
+ # Lazy imports for heavy dependencies
+ rake_nltk = None
+ spacy = None
+
+ def _init_heavy_dependencies():
+     """Initialize heavy dependencies when actually needed"""
+     global rake_nltk, spacy
+
+     if rake_nltk is None:
+         try:
+             import rake_nltk as _rake_nltk
+             rake_nltk = _rake_nltk
+         except ImportError:
+             import logging
+             logging.getLogger(__name__).error("rake_nltk not available")
+
+     if spacy is None:
+         try:
+             import spacy as _spacy
+             spacy = _spacy
+         except ImportError:
+             import logging
+             logging.getLogger(__name__).warning("spacy not available (optional)")
+
+ from aiecs.tools import register_tool
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools.tool_executor import (
+     validate_input,
+ )
+
+ # Enums for configuration options
+ class Language(str, Enum):
+     ENGLISH = "en"
+     CHINESE = "zh"
+     AUTO = "auto"
+
+ class ModelType(str, Enum):
+     SPACY_ENGLISH = "en_core_web_sm"
+     SPACY_CHINESE = "zh_core_web_sm"
+
+ @register_tool("classifier")
+ class ClassifierTool(BaseTool):
+     """
+     Text classification, tokenization, POS tagging, NER, lemmatization, dependency parsing,
+     keyword extraction, and summarization tool.
+
+     Operations:
+     - classify: Sentiment or topic classification.
+     - tokenize: Tokenize text.
+     - pos_tag: Part-of-speech tagging.
+     - ner: Named entity recognition.
+     - lemmatize: Lemmatize tokens.
+     - dependency_parse: Dependency parsing.
+     - keyword_extract: Extract key phrases.
+     - summarize: Summarize text.
+     - batch_process: Process multiple texts with any operation.
+
+     Supports English (spaCy) and Chinese (Jieba, spaCy).
+     """
+
+     # Configuration schema
+     class Config(BaseModel):
+         """Configuration for the classifier tool"""
+         max_workers: int = Field(
+             default=min(32, (os.cpu_count() or 4) * 2),
+             description="Maximum number of worker threads"
+         )
+         pipeline_cache_ttl: int = Field(
+             default=3600,
+             description="Time-to-live for pipeline cache in seconds"
+         )
+         pipeline_cache_size: int = Field(
+             default=10,
+             description="Maximum number of pipeline cache entries"
+         )
+         max_text_length: int = Field(
+             default=10_000,
+             description="Maximum text length in characters"
+         )
+         spacy_model_en: str = Field(
+             default="en_core_web_sm",
+             description="spaCy model for English"
+         )
+         spacy_model_zh: str = Field(
+             default="zh_core_web_sm",
+             description="spaCy model for Chinese"
+         )
+         allowed_models: List[str] = Field(
+             default=[
+                 "en_core_web_sm",
+                 "zh_core_web_sm"
+             ],
+             description="List of allowed spaCy models"
+         )
+         rate_limit_enabled: bool = Field(
+             default=True,
+             description="Enable rate limiting"
+         )
+         rate_limit_requests: int = Field(
+             default=100,
+             description="Maximum requests per window"
+         )
+         rate_limit_window: int = Field(
+             default=60,
+             description="Rate limit window in seconds"
+         )
+         use_rake_for_english: bool = Field(
+             default=True,
+             description="Use RAKE for English phrase extraction"
+         )
+
+         model_config = ConfigDict(env_prefix="CLASSIFIER_TOOL_")
+
+     # Base schema for text operations
+     class BaseTextSchema(BaseModel):
+         """Base schema for text operations"""
+         text: str = Field(
+             description="Text to process"
+         )
+
+         @field_validator("text")
+         @classmethod
+         def check_length_and_content(cls, v: str) -> str:
+             if len(v) > 10_000:  # Using a constant here for validation
+                 raise ValueError(f"Text length exceeds 10,000 characters")
+             # Check for malicious patterns (e.g., SQL injection)
+             if re.search(r'(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)', v, re.IGNORECASE):
+                 raise ValueError("Text contains potentially malicious content")
+             return v
+
+     # Input schemas for operations
+     class ClassifySchema(BaseTextSchema):
+         """Schema for text classification"""
+         model: Optional[str] = Field(
+             default=None,
+             description="Model to use for classification"
+         )
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+         @field_validator("model")
+         @classmethod
+         def check_model(cls, v: Optional[str]) -> Optional[str]:
+             allowed_models = [
+                 "en_core_web_sm",
+                 "zh_core_web_sm"
+             ]
+             if v and v not in allowed_models:
+                 raise ValueError(f"Model '{v}' not in allowed spaCy models: {allowed_models}")
+             return v
+
+     class TokenizeSchema(BaseTextSchema):
+         """Schema for text tokenization"""
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class PosTagSchema(BaseTextSchema):
+         """Schema for part-of-speech tagging"""
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class NERSchema(BaseTextSchema):
+         """Schema for named entity recognition"""
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class LemmatizeSchema(BaseTextSchema):
+         """Schema for lemmatization"""
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class DependencyParseSchema(BaseTextSchema):
+         """Schema for dependency parsing"""
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class KeywordExtractSchema(BaseTextSchema):
+         """Schema for keyword extraction"""
+         top_k: int = Field(
+             default=10,
+             description="Number of keywords to extract"
+         )
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+         extract_phrases: bool = Field(
+             default=True,
+             description="Whether to extract phrases or just keywords"
+         )
+
+     class SummarizeSchema(BaseTextSchema):
+         """Schema for text summarization"""
+         max_length: int = Field(
+             default=150,
+             description="Maximum length of the summary"
+         )
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the text"
+         )
+
+     class BatchProcessSchema(BaseModel):
+         """Schema for batch processing"""
+         texts: List[str] = Field(
+             description="List of texts to process"
+         )
+         operation: str = Field(
+             description="Operation to perform on each text"
+         )
+         language: Optional[Language] = Field(
+             default=None,
+             description="Language of the texts"
+         )
+         model: Optional[str] = Field(
+             default=None,
+             description="Model to use for processing"
+         )
+         top_k: Optional[int] = Field(
+             default=None,
+             description="Number of keywords to extract (for keyword_extract)"
+         )
+         max_length: Optional[int] = Field(
+             default=None,
+             description="Maximum length of the summary (for summarize)"
+         )
+
+         @field_validator("texts")
+         @classmethod
+         def check_texts(cls, v: List[str]) -> List[str]:
+             for text in v:
+                 if len(text) > 10_000:  # Using a constant here for validation
+                     raise ValueError(f"Text length exceeds 10,000 characters")
+                 if re.search(r'(\bSELECT\b|\bINSERT\b|\bDELETE\b|--|;|/\*)', text, re.IGNORECASE):
+                     raise ValueError("Text contains potentially malicious content")
+             return v
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """
+         Initialize ClassifierTool with settings and resources.
+
+         Args:
+             config (Dict, optional): Configuration overrides for ClassifierSettings.
+
+         Raises:
+             ValueError: If config contains invalid settings.
+         """
+         super().__init__(config)
+
+         # Parse configuration
+         self.config = self.Config(**(config or {}))
+
+         # Set up logger
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+             self.logger.addHandler(handler)
+             self.logger.setLevel(logging.INFO)
+
+         # Initialize resources
+         self._spacy_nlp = {}  # Language -> spaCy pipeline
+         self._metrics = {'requests': 0, 'cache_hits': 0, 'processing_time': []}
+         self._request_timestamps = []
+
+     def _get_sentiment_lexicon(self, language: str) -> Dict[str, float]:
+         """
+         Get sentiment lexicon for the specified language.
+
+         Args:
+             language (str): Language code ('en', 'zh').
+
+         Returns:
+             Dict[str, float]: Sentiment lexicon with word -> score mapping.
+         """
+         if language == 'en':
+             # Simple English sentiment lexicon
+             return {
+                 'good': 1.0, 'great': 1.5, 'excellent': 2.0, 'amazing': 2.0, 'wonderful': 1.5,
+                 'fantastic': 2.0, 'awesome': 1.5, 'perfect': 2.0, 'love': 1.5, 'like': 1.0,
+                 'happy': 1.5, 'pleased': 1.0, 'satisfied': 1.0, 'positive': 1.0, 'best': 2.0,
+                 'bad': -1.0, 'terrible': -2.0, 'awful': -2.0, 'horrible': -2.0, 'hate': -2.0,
+                 'dislike': -1.0, 'sad': -1.5, 'angry': -1.5, 'disappointed': -1.5, 'negative': -1.0,
+                 'worst': -2.0, 'poor': -1.0, 'fail': -1.5, 'wrong': -1.0, 'problem': -1.0
+             }
+         else:  # Chinese
+             return {
+                 '好': 1.0, '很好': 1.5, '非常好': 2.0, '棒': 1.5, '优秀': 2.0, '完美': 2.0,
+                 '喜欢': 1.5, '爱': 2.0, '满意': 1.0, '开心': 1.5, '高兴': 1.5, '积极': 1.0,
+                 '坏': -1.0, '很坏': -1.5, '糟糕': -2.0, '讨厌': -2.0, '恨': -2.0, '失望': -1.5,
+                 '生气': -1.5, '愤怒': -2.0, '消极': -1.0, '问题': -1.0, '错误': -1.0, '失败': -1.5
+             }
+
+     def _get_spacy(self, language: str) -> Any:
+         """
+         Get a spaCy pipeline for the specified language.
+
+         Args:
+             language (str): Language code ('en', 'zh').
+
+         Returns:
+             Any: spaCy NLP object.
+         """
+         global spacy
+         if spacy is None:
+             try:
+                 import spacy as spacy_module
+                 spacy = spacy_module
+             except ImportError:
+                 raise ImportError("spaCy is required but not installed. Please install it with: pip install spacy")
+
+         model = self.config.spacy_model_zh if language == 'zh' else self.config.spacy_model_en
+         return spacy.load(model, disable=["textcat"])
+
+     def _detect_language(self, text: str) -> str:
+         """
+         Detect the language of the input text using character analysis.
+
+         Args:
+             text (str): Input text.
+
+         Returns:
+             str: Language code ('en' or 'zh'; 'en' for unknown).
+         """
+         try:
+             # Count Chinese characters (CJK Unified Ideographs)
+             chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff')
+             total_chars = len([char for char in text if char.isalpha()])
+
+             if total_chars == 0:
+                 return 'en'
+
+             # If more than 30% are Chinese characters, consider it Chinese
+             chinese_ratio = chinese_chars / total_chars
+             return 'zh' if chinese_ratio > 0.3 else 'en'
+         except Exception:
+             return 'en'
+
+     def _check_rate_limit(self) -> bool:
+         """
+         Check if the request is within rate limits.
+
+         Returns:
+             bool: True if within limits, False otherwise.
+         """
+         if not self.config.rate_limit_enabled:
+             return True
+
+         current_time = time.time()
+
+         # Get lock from executor
+         with self._executor.get_lock("rate_limit"):
+             # Remove timestamps outside the window
+             self._request_timestamps = [ts for ts in self._request_timestamps
+                                         if current_time - ts <= self.config.rate_limit_window]
+
+             # Check if we're at the limit
+             if len(self._request_timestamps) >= self.config.rate_limit_requests:
+                 return False
+
+             # Add current timestamp
+             self._request_timestamps.append(current_time)
+             return True
+
+     def _extract_english_phrases(self, text: str, top_k: int) -> List[str]:
+         """
+         Extract key phrases from English text using RAKE.
+
+         Args:
+             text (str): Input text.
+             top_k (int): Number of phrases to extract.
+
+         Returns:
+             List[str]: Extracted phrases.
+         """
+         try:
+             # Initialize heavy dependencies if needed
+             _init_heavy_dependencies()
+
+             if rake_nltk is None:
+                 raise ImportError("rake_nltk not available")
+
+             rake = rake_nltk.Rake()
+             rake.extract_keywords_from_text(text)
+             phrases = rake.get_ranked_phrases()[:top_k]
+             return phrases
+         except Exception as e:
+             self.logger.error(f"Error extracting English phrases: {e}")
+             # Fallback to simple keyword extraction
+             nlp = self._get_spacy('en')
+             doc = nlp(text)
+             keywords = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')][:top_k]
+             return keywords
+
+     def _extract_chinese_phrases(self, text: str, top_k: int) -> List[str]:
+         """
+         Extract key phrases from Chinese text using spaCy.
+
+         Args:
+             text (str): Input text.
+             top_k (int): Number of phrases to extract.
+
+         Returns:
+             List[str]: Extracted phrases.
+         """
+         try:
+             nlp = self._get_spacy('zh')
+             doc = nlp(text)
+
+             # Extract noun phrases and named entities
+             phrases = []
+
+             # Add noun chunks
+             for chunk in doc.noun_chunks:
+                 if len(chunk.text.strip()) > 1:
+                     phrases.append(chunk.text.strip())
+
+             # Add named entities
+             for ent in doc.ents:
+                 if len(ent.text.strip()) > 1:
+                     phrases.append(ent.text.strip())
+
+             # Add important nouns and proper nouns
+             for token in doc:
+                 if token.pos_ in ('NOUN', 'PROPN') and len(token.text.strip()) > 1:
+                     phrases.append(token.text.strip())
+
+             # Remove duplicates and return top_k
+             unique_phrases = list(dict.fromkeys(phrases))  # Preserve order
+             return unique_phrases[:top_k]
+
+         except Exception as e:
+             self.logger.error(f"Error extracting Chinese phrases with spaCy: {e}")
+             # Fallback to simple noun extraction
+             try:
+                 nlp = self._get_spacy('zh')
+                 doc = nlp(text)
+                 nouns = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')]
+                 return nouns[:top_k]
+             except Exception:
+                 return []
+
+     def _get_hf_pipeline(self, task: str, model: str):
+         """
+         Get a Hugging Face transformers pipeline for the specified task and model.
+
+         Args:
+             task (str): The task type (e.g., "summarization").
+             model (str): The model name.
+
+         Returns:
+             Any: Hugging Face pipeline object.
+
+         Raises:
+             ImportError: If transformers library is not available.
+             ValueError: If the pipeline creation fails.
+         """
+         try:
+             from transformers import pipeline
+             return pipeline(task, model=model)
+         except ImportError:
+             raise ImportError("transformers library is required for summarization but not installed. Please install it with: pip install transformers")
+         except Exception as e:
+             raise ValueError(f"Error creating pipeline for task '{task}' with model '{model}': {e}")
+
+     async def classify(self, text: str, model: Optional[str] = None, language: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Perform sentiment classification on text using spaCy and lexicon-based approach.
+
+         Args:
+             text (str): Text to classify.
+             model (Optional[str]): spaCy model to use (optional, auto-detected).
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: Classification results [{'label': str, 'score': float}].
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         # Get spaCy pipeline and sentiment lexicon
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         sentiment_lexicon = self._get_sentiment_lexicon(language)
+
+         # Process text with spaCy
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         # Calculate sentiment score
+         sentiment_score = 0.0
+         word_count = 0
+
+         for token in doc:
+             if not token.is_stop and not token.is_punct and token.text.lower() in sentiment_lexicon:
+                 sentiment_score += sentiment_lexicon[token.text.lower()]
+                 word_count += 1
+
+         # Normalize score
+         if word_count > 0:
+             sentiment_score = sentiment_score / word_count
+
+         # Determine label and confidence
+         if sentiment_score > 0.1:
+             label = "POSITIVE"
+             confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
+         elif sentiment_score < -0.1:
+             label = "NEGATIVE"
+             confidence = min(0.9, 0.5 + abs(sentiment_score) * 0.4)
+         else:
+             label = "NEUTRAL"
+             confidence = 0.6
+
+         return [{"label": label, "score": confidence}]
+
+     async def tokenize(self, text: str, language: Optional[str] = None) -> List[str]:
+         """
+         Tokenize text into words or tokens using spaCy.
+
+         Args:
+             text (str): Text to tokenize.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[str]: List of tokens.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         return [token.text for token in doc]
+
+     async def pos_tag(self, text: str, language: Optional[str] = None) -> List[Tuple[str, str]]:
+         """
+         Perform part-of-speech tagging using spaCy, returning (token, pos) pairs.
+
+         Args:
+             text (str): Text to tag.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Tuple[str, str]]: List of (token, POS tag) tuples.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         return [(token.text, token.pos_) for token in doc]
+
+     @validate_input(NERSchema)
+     async def ner(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Perform named entity recognition.
+
+         Args:
+             text (str): Text to analyze.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: List of named entities with text, label, start, and end.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         return [
+             {"text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char}
+             for ent in doc.ents
+         ]
+
+     @validate_input(LemmatizeSchema)
+     async def lemmatize(self, text: str, language: Optional[str] = None) -> List[str]:
+         """
+         Lemmatize tokens in text using spaCy.
+
+         Args:
+             text (str): Text to lemmatize.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[str]: List of lemmatized tokens.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         # For Chinese, lemma might be the same as text, but spaCy handles it consistently
+         return [token.lemma_ for token in doc]
+
+     @validate_input(DependencyParseSchema)
+     async def dependency_parse(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]:
+         """
+         Perform dependency parsing using spaCy (supports both English and Chinese).
+
+         Args:
+             text (str): Text to parse.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             List[Dict[str, Any]]: List of tokens with dependency information.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         nlp = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_spacy, language
+         )
+
+         doc = await asyncio.get_event_loop().run_in_executor(
+             None, nlp, text
+         )
+
+         return [
+             {
+                 "text": token.text,
+                 "head": token.head.text,
+                 "dep": token.dep_,
+                 "pos": token.pos_
+             }
+             for token in doc
+         ]
+
+     @validate_input(KeywordExtractSchema)
+     async def keyword_extract(self, text: str, top_k: int = 10, language: Optional[str] = None, extract_phrases: bool = True) -> List[str]:
+         """
+         Extract keywords or key phrases from text using spaCy.
+
+         Args:
+             text (str): Text to analyze.
+             top_k (int): Number of keywords to extract.
+             language (Optional[str]): Language of the text.
+             extract_phrases (bool): Whether to extract phrases or just keywords.
+
+         Returns:
+             List[str]: List of extracted keywords or phrases.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+
+         if language == 'zh':
+             if extract_phrases:
+                 return await asyncio.get_event_loop().run_in_executor(
+                     None, self._extract_chinese_phrases, text, top_k
+                 )
+             else:
+                 # Extract simple keywords using spaCy
+                 nlp = await asyncio.get_event_loop().run_in_executor(
+                     None, self._get_spacy, language
+                 )
+
+                 doc = await asyncio.get_event_loop().run_in_executor(
+                     None, nlp, text
+                 )
+
+                 keywords = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')][:top_k]
+                 return keywords
+         else:  # English or other languages
+             if extract_phrases and self.config.use_rake_for_english:
+                 return await asyncio.get_event_loop().run_in_executor(
+                     None, self._extract_english_phrases, text, top_k
+                 )
+             else:
+                 nlp = await asyncio.get_event_loop().run_in_executor(
+                     None, self._get_spacy, language
+                 )
+
+                 doc = await asyncio.get_event_loop().run_in_executor(
+                     None, nlp, text
+                 )
+
+                 keywords = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')][:top_k]
+                 return keywords
+
+     @validate_input(SummarizeSchema)
+     async def summarize(self, text: str, max_length: int = 150, language: Optional[str] = None) -> str:
+         """
+         Summarize text.
+
+         Args:
+             text (str): Text to summarize.
+             max_length (int): Maximum length of the summary.
+             language (Optional[str]): Language of the text.
+
+         Returns:
+             str: Summarized text.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         language = language or self._detect_language(text)
+         # Use appropriate models for summarization
+         if language == 'en':
+             model = "facebook/bart-large-cnn"
+         else:
+             # For Chinese and other languages, use a multilingual model
+             # For now, use t5-base, but consider using a Chinese-specific model in the future
+             model = "t5-base"
+
+         pipe = await asyncio.get_event_loop().run_in_executor(
+             None, self._get_hf_pipeline, "summarization", model
+         )
+
+         # Different models use different parameter names for length control
+         if model.startswith("t5"):
+             # T5 models use max_new_tokens instead of max_length
+             # For Chinese text, use a more conservative approach
+             if language == 'zh':
+                 # Chinese text: use character count and be more conservative
+                 input_chars = len(text)
+                 max_new_tokens = min(max_length, max(input_chars // 4, 5))
+                 min_new_tokens = 2
+             else:
+                 # English text: use word count
+                 input_words = len(text.split())
+                 max_new_tokens = min(max_length, max(input_words // 2, 10))
+                 min_new_tokens = 5
+
+             result = await asyncio.get_event_loop().run_in_executor(
+                 None, lambda: pipe(text, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, do_sample=False)[0]['summary_text']
+             )
+         else:
+             # BART and other models use max_length
+             if language == 'zh':
+                 # Chinese text: use character count
+                 input_chars = len(text)
+                 max_len = min(max_length, max(input_chars // 4, 10))
+                 min_len = 5
+             else:
+                 # English text: use word count
+                 input_words = len(text.split())
+                 max_len = min(max_length, max(input_words // 2, 20))
+                 min_len = 10
+
+             result = await asyncio.get_event_loop().run_in_executor(
+                 None, lambda: pipe(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
+             )
+
+         return result
+
+     @validate_input(BatchProcessSchema)
+     async def batch_process(self, texts: List[str], operation: str, language: Optional[str] = None,
+                             model: Optional[str] = None, top_k: Optional[int] = None,
+                             max_length: Optional[int] = None) -> List[Any]:
+         """
+         Process multiple texts with the specified operation.
+
+         Args:
+             texts (List[str]): List of texts to process.
+             operation (str): Operation to perform on each text.
+             language (Optional[str]): Language of the texts.
+             model (Optional[str]): Model to use for processing.
+             top_k (Optional[int]): Number of keywords to extract (for keyword_extract).
+             max_length (Optional[int]): Maximum length of the summary (for summarize).
+
+         Returns:
+             List[Any]: List of operation results.
+         """
+         if not self._check_rate_limit():
+             raise ValueError("Rate limit exceeded. Please try again later.")
+
+         # Prepare operations to execute in batch
+         operations = []
+         for text in texts:
+             kwargs = {"text": text}
+             if language:
+                 kwargs["language"] = language
+             if model and operation == "classify":
+                 kwargs["model"] = model
+             if top_k and operation == "keyword_extract":
+                 kwargs["top_k"] = top_k
+             if max_length and operation == "summarize":
+                 kwargs["max_length"] = max_length
+
+             operations.append({"op": operation, "kwargs": kwargs})
+
+         # Execute batch operations
+         return await self.run_batch(operations)
+
+     async def health_check(self) -> Dict[str, Any]:
+         """
+         Perform a health check on the tool.
+
+         Returns:
+             Dict[str, Any]: Health check results.
+         """
+         result = {
+             "status": "ok",
+             "metrics": {
+                 "requests": self._metrics["requests"],
+                 "cache_hits": self._metrics["cache_hits"],
+                 "avg_processing_time": sum(self._metrics["processing_time"]) / len(self._metrics["processing_time"])
+                 if self._metrics["processing_time"] else 0.0
+             },
+             "config": {
+                 "max_workers": self.config.max_workers,
+                 "pipeline_cache_size": self.config.pipeline_cache_size,
+                 "rate_limit_enabled": self.config.rate_limit_enabled,
+                 "rate_limit_requests": self.config.rate_limit_requests,
+                 "rate_limit_window": self.config.rate_limit_window
+             }
+         }
+
+         # Check if models can be loaded
+         try:
+             await asyncio.get_event_loop().run_in_executor(
+                 None, self._get_spacy, "en"
+             )
+             result["models"] = {"spacy_en": "ok"}
+         except Exception as e:
+             result["status"] = "warning"
+             result["models"] = {"spacy_en": f"error: {str(e)}"}
+
+         return result
+
+     async def cleanup(self) -> None:
+         """
+         Clean up resources used by the tool.
+         """
+         # Clear spaCy models
+         self._spacy_nlp.clear()
+
+         # Clear metrics
+         self._metrics = {'requests': 0, 'cache_hits': 0, 'processing_time': []}
+
+         # Clear rate limiting data
+         self._request_timestamps = []
+
+         self.logger.info("ClassifierTool resources cleaned up")
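
For orientation, the sketch below shows how the ClassifierTool defined above might be exercised once the package and its optional NLP dependencies are installed. It is not part of the published wheel: the import path is read from the file list, the method calls follow the async signatures in the diff, and the sample text, config values, and assumed environment (spaCy with en_core_web_sm, plus rake_nltk and its NLTK data for English phrase extraction) are illustrative assumptions.

import asyncio

from aiecs.tools.task_tools.classfire_tool import ClassifierTool

async def main() -> None:
    # Config keys mirror the Config schema in the diff; the values here are arbitrary.
    tool = ClassifierTool(config={"rate_limit_requests": 50, "rate_limit_window": 60})

    text = "The release is excellent overall, but the installation step has problems."

    # Lexicon-based sentiment classification -> [{"label": ..., "score": ...}]
    print(await tool.classify(text=text))

    # spaCy-backed token-level operations
    print(await tool.tokenize(text=text))
    print(await tool.pos_tag(text=text))
    print(await tool.ner(text=text))

    # RAKE-based phrase extraction for English (falls back to spaCy nouns on error)
    print(await tool.keyword_extract(text=text, top_k=5))

    # Report metrics/config and confirm the English spaCy model loads
    print(await tool.health_check())
    await tool.cleanup()

if __name__ == "__main__":
    asyncio.run(main())

Keyword arguments are used throughout because several methods are wrapped in validate_input schemas from aiecs.tools.tool_executor, and how that decorator treats positional arguments is not visible in this diff.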