aiecs 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiecs might be problematic. Click here for more details.

Files changed (90)
  1. aiecs/__init__.py +75 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +295 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +341 -0
  7. aiecs/config/__init__.py +15 -0
  8. aiecs/config/config.py +117 -0
  9. aiecs/config/registry.py +19 -0
  10. aiecs/core/__init__.py +46 -0
  11. aiecs/core/interface/__init__.py +34 -0
  12. aiecs/core/interface/execution_interface.py +150 -0
  13. aiecs/core/interface/storage_interface.py +214 -0
  14. aiecs/domain/__init__.py +20 -0
  15. aiecs/domain/context/__init__.py +28 -0
  16. aiecs/domain/context/content_engine.py +982 -0
  17. aiecs/domain/context/conversation_models.py +306 -0
  18. aiecs/domain/execution/__init__.py +12 -0
  19. aiecs/domain/execution/model.py +49 -0
  20. aiecs/domain/task/__init__.py +13 -0
  21. aiecs/domain/task/dsl_processor.py +460 -0
  22. aiecs/domain/task/model.py +50 -0
  23. aiecs/domain/task/task_context.py +257 -0
  24. aiecs/infrastructure/__init__.py +26 -0
  25. aiecs/infrastructure/messaging/__init__.py +13 -0
  26. aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
  27. aiecs/infrastructure/messaging/websocket_manager.py +289 -0
  28. aiecs/infrastructure/monitoring/__init__.py +12 -0
  29. aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
  30. aiecs/infrastructure/monitoring/structured_logger.py +50 -0
  31. aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
  32. aiecs/infrastructure/persistence/__init__.py +12 -0
  33. aiecs/infrastructure/persistence/database_manager.py +286 -0
  34. aiecs/infrastructure/persistence/file_storage.py +671 -0
  35. aiecs/infrastructure/persistence/redis_client.py +162 -0
  36. aiecs/llm/__init__.py +54 -0
  37. aiecs/llm/base_client.py +99 -0
  38. aiecs/llm/client_factory.py +339 -0
  39. aiecs/llm/custom_callbacks.py +228 -0
  40. aiecs/llm/openai_client.py +125 -0
  41. aiecs/llm/vertex_client.py +186 -0
  42. aiecs/llm/xai_client.py +184 -0
  43. aiecs/main.py +351 -0
  44. aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
  45. aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
  46. aiecs/scripts/README_WEASEL_PATCH.md +126 -0
  47. aiecs/scripts/__init__.py +3 -0
  48. aiecs/scripts/dependency_checker.py +825 -0
  49. aiecs/scripts/dependency_fixer.py +348 -0
  50. aiecs/scripts/download_nlp_data.py +348 -0
  51. aiecs/scripts/fix_weasel_validator.py +121 -0
  52. aiecs/scripts/fix_weasel_validator.sh +82 -0
  53. aiecs/scripts/patch_weasel_library.sh +188 -0
  54. aiecs/scripts/quick_dependency_check.py +269 -0
  55. aiecs/scripts/run_weasel_patch.sh +41 -0
  56. aiecs/scripts/setup_nlp_data.sh +217 -0
  57. aiecs/tasks/__init__.py +2 -0
  58. aiecs/tasks/worker.py +111 -0
  59. aiecs/tools/__init__.py +196 -0
  60. aiecs/tools/base_tool.py +202 -0
  61. aiecs/tools/langchain_adapter.py +361 -0
  62. aiecs/tools/task_tools/__init__.py +82 -0
  63. aiecs/tools/task_tools/chart_tool.py +704 -0
  64. aiecs/tools/task_tools/classfire_tool.py +901 -0
  65. aiecs/tools/task_tools/image_tool.py +397 -0
  66. aiecs/tools/task_tools/office_tool.py +600 -0
  67. aiecs/tools/task_tools/pandas_tool.py +565 -0
  68. aiecs/tools/task_tools/report_tool.py +499 -0
  69. aiecs/tools/task_tools/research_tool.py +363 -0
  70. aiecs/tools/task_tools/scraper_tool.py +548 -0
  71. aiecs/tools/task_tools/search_api.py +7 -0
  72. aiecs/tools/task_tools/stats_tool.py +513 -0
  73. aiecs/tools/temp_file_manager.py +126 -0
  74. aiecs/tools/tool_executor/__init__.py +35 -0
  75. aiecs/tools/tool_executor/tool_executor.py +518 -0
  76. aiecs/utils/LLM_output_structor.py +409 -0
  77. aiecs/utils/__init__.py +23 -0
  78. aiecs/utils/base_callback.py +50 -0
  79. aiecs/utils/execution_utils.py +158 -0
  80. aiecs/utils/logging.py +1 -0
  81. aiecs/utils/prompt_loader.py +13 -0
  82. aiecs/utils/token_usage_repository.py +279 -0
  83. aiecs/ws/__init__.py +0 -0
  84. aiecs/ws/socket_server.py +41 -0
  85. aiecs-1.0.0.dist-info/METADATA +610 -0
  86. aiecs-1.0.0.dist-info/RECORD +90 -0
  87. aiecs-1.0.0.dist-info/WHEEL +5 -0
  88. aiecs-1.0.0.dist-info/entry_points.txt +7 -0
  89. aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
  90. aiecs-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,348 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dependency fixer for AIECS tools.
4
+
5
+ This script automatically installs missing dependencies based on the
6
+ dependency checker results.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ import subprocess
12
+ import platform
13
+ import logging
14
+ from typing import Dict, List, Tuple, Optional
15
+ from pathlib import Path
16
+
17
+
18
class DependencyFixer:
    """Automatic dependency fixer for AIECS.

    Installs missing system packages (apt-get on Linux, Homebrew on macOS),
    Python packages (pip) and model/data assets (spaCy models, NLTK data,
    Playwright browsers).  Every attempted fix is recorded in
    ``fixes_applied`` or ``fixes_failed`` so that ``generate_fix_report``
    can summarise the run.
    """

    # Dependency key -> platform (``platform.system().lower()``) -> packages.
    # A missing platform entry means no automatic fix exists there.
    _SYSTEM_PACKAGES: Dict[str, Dict[str, List[str]]] = {
        "java": {
            "linux": ["openjdk-11-jdk"],
            "darwin": ["openjdk@11"],
        },
        "tesseract": {
            "linux": ["tesseract-ocr", "tesseract-ocr-eng"],
            "darwin": ["tesseract"],
        },
        "tesseract_lang_packs": {
            "linux": [
                "tesseract-ocr-chi-sim", "tesseract-ocr-chi-tra",
                "tesseract-ocr-fra", "tesseract-ocr-deu",
                "tesseract-ocr-jpn", "tesseract-ocr-kor",
                "tesseract-ocr-rus", "tesseract-ocr-spa",
            ],
        },
        "pillow_system_deps": {
            "linux": [
                "libjpeg-dev", "zlib1g-dev", "libpng-dev",
                "libtiff-dev", "libwebp-dev", "libopenjp2-7-dev",
            ],
            "darwin": ["libjpeg", "zlib", "libpng", "libtiff", "webp", "openjpeg"],
        },
        "pyreadstat_deps": {
            "linux": ["libreadstat-dev"],
            "darwin": ["readstat"],
        },
        "weasyprint_deps": {
            "linux": [
                "libcairo2-dev", "libpango1.0-dev", "libgdk-pixbuf2.0-dev",
                "libffi-dev", "shared-mime-info",
            ],
            "darwin": ["cairo", "pango", "gdk-pixbuf", "libffi"],
        },
        "matplotlib_deps": {
            "linux": [
                "libfreetype6-dev", "libpng-dev", "libjpeg-dev",
                "libtiff-dev", "libwebp-dev",
            ],
            "darwin": ["freetype", "libpng", "libjpeg", "libtiff", "webp"],
        },
    }

    # Executed with ``python -c``; the resource name travels as argv[1] so it
    # is never interpolated into source code.  The exit status mirrors the
    # boolean returned by nltk.download so failures are detectable.
    _NLTK_DOWNLOAD_SNIPPET = (
        "import sys, nltk; sys.exit(0 if nltk.download(sys.argv[1]) else 1)"
    )

    def __init__(self, interactive: bool = True):
        """Create a fixer.

        Args:
            interactive: When True, ask for confirmation on stdin before each
                group of installs; when False, every fix is auto-approved.
        """
        self.logger = self._setup_logging()
        self.system = platform.system().lower()
        self.interactive = interactive
        self.fixes_applied: List[str] = []
        self.fixes_failed: List[str] = []

    def _setup_logging(self) -> logging.Logger:
        """Configure root logging (stdout + ``dependency_fix.log``) and return this module's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler('dependency_fix.log')
            ]
        )
        return logging.getLogger(__name__)

    @staticmethod
    def _unique(items: List[str]) -> List[str]:
        """Return ``items`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    @staticmethod
    def _strip_prefix(name: str, prefix: str) -> str:
        """Remove ``prefix`` from the start of ``name`` only (not inner occurrences)."""
        return name[len(prefix):] if name.startswith(prefix) else name

    @staticmethod
    def _missing(deps) -> list:
        """Return the entries of ``deps`` whose ``status.value`` equals ``"missing"``."""
        return [dep for dep in deps if dep.status.value == "missing"]

    def _run_command(self, cmd: List[str], description: str) -> bool:
        """Run ``cmd`` (no shell) and return True on a zero exit status.

        Output is captured and logged.  A non-zero exit, a timeout (300 s) or
        any other exception is logged and reported as False; nothing is raised.
        """
        try:
            self.logger.info(f"Running: {description}")
            self.logger.info(f"Command: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
                timeout=300  # 5 minutes timeout
            )

            self.logger.info(f"Success: {description}")
            if result.stdout:
                self.logger.info(f"Output: {result.stdout}")

            return True

        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed: {description}")
            self.logger.error(f"Error: {e.stderr}")
            return False
        except subprocess.TimeoutExpired:
            self.logger.error(f"Timeout: {description}")
            return False
        except Exception as e:
            self.logger.error(f"Error: {description} - {e}")
            return False

    def _ask_confirmation(self, message: str) -> bool:
        """Return True if the fix may proceed.

        Non-interactive mode always approves.  Otherwise the user is prompted
        until the answer is one of y/yes/n/no (case-insensitive).
        """
        if not self.interactive:
            return True

        while True:
            response = input(f"{message} (y/n): ").lower().strip()
            if response in ['y', 'yes']:
                return True
            elif response in ['n', 'no']:
                return False
            else:
                print("Please enter 'y' or 'n'")

    def _resolve_system_packages(self, missing_deps: List[str]) -> List[str]:
        """Translate dependency keys into de-duplicated package names for this platform."""
        resolved: List[str] = []
        for dep in missing_deps:
            packages = self._SYSTEM_PACKAGES.get(dep, {}).get(self.system)
            if not packages:
                # Previously these were skipped silently; make it visible.
                self.logger.warning(
                    f"No automatic fix available for system dependency '{dep}' on '{self.system}'"
                )
                continue
            resolved.extend(packages)
        # Several dependencies share packages (e.g. libpng-dev).
        return self._unique(resolved)

    def _install_apt_packages(self, packages: List[str]) -> bool:
        """Install ``packages`` with apt-get; return False if update or install fails."""
        if not packages:
            return True

        joined = ', '.join(packages)
        if not self._ask_confirmation(f"Install system packages: {joined}?"):
            return True

        record = f"System packages: {joined}"
        if not self._run_command(["sudo", "apt-get", "update"], "Update package list"):
            # A failed index update means the install was never attempted;
            # record it instead of silently reporting success.
            self.fixes_failed.append(record)
            return False

        cmd = ["sudo", "apt-get", "install", "-y"] + packages
        if self._run_command(cmd, f"Install packages: {joined}"):
            self.fixes_applied.append(record)
            return True

        self.fixes_failed.append(record)
        return False

    def _install_brew_packages(self, packages: List[str]) -> bool:
        """Install ``packages`` one by one with Homebrew; return False if any fails."""
        if not packages:
            return True

        if not self._ask_confirmation(f"Install Homebrew packages: {', '.join(packages)}?"):
            return True

        all_ok = True
        for package in packages:
            # Keep going after a failure so every package is attempted and
            # shows up in the report.
            if self._run_command(["brew", "install", package], f"Install {package}"):
                self.fixes_applied.append(f"Homebrew package: {package}")
            else:
                self.fixes_failed.append(f"Homebrew package: {package}")
                all_ok = False
        return all_ok

    def fix_system_dependencies(self, missing_deps: List[str]) -> bool:
        """Install system packages for the given dependency keys.

        Args:
            missing_deps: Dependency keys such as ``"java"`` or ``"tesseract"``.
                Keys without a known package on the current platform are
                logged and skipped.

        Returns:
            False if an attempted installation failed; True otherwise
            (including when nothing had to be done or the user declined).
        """
        if not missing_deps:
            return True

        self.logger.info("Fixing system dependencies...")

        packages = self._resolve_system_packages(missing_deps)
        if self.system == "linux":
            return self._install_apt_packages(packages)
        if self.system == "darwin":
            return self._install_brew_packages(packages)
        return True

    def fix_python_dependencies(self, missing_packages: List[str]) -> bool:
        """Install missing Python packages with ``pip`` in the current interpreter.

        Returns:
            False if pip failed; True otherwise (including when the user declined).
        """
        if not missing_packages:
            return True

        self.logger.info("Fixing Python dependencies...")

        packages = self._unique(missing_packages)
        joined = ', '.join(packages)
        if not self._ask_confirmation(f"Install Python packages: {joined}?"):
            return True

        cmd = [sys.executable, "-m", "pip", "install"] + packages
        if self._run_command(cmd, f"Install Python packages: {joined}"):
            self.fixes_applied.append(f"Python packages: {joined}")
            return True

        self.fixes_failed.append(f"Python packages: {joined}")
        return False

    def fix_model_dependencies(self, missing_models: List[str]) -> bool:
        """Download missing model/data assets.

        Entries starting with ``spacy_`` are spaCy models, entries starting
        with ``nltk_`` are NLTK resources, and the literal
        ``"playwright_browsers"`` installs Playwright browsers.  Other entries
        are ignored.

        Returns:
            False if any attempted download failed; True otherwise.
        """
        if not missing_models:
            return True

        self.logger.info("Fixing model dependencies...")
        all_ok = True

        # spaCy models
        spacy_models = self._unique([m for m in missing_models if m.startswith("spacy_")])
        if spacy_models and self._ask_confirmation(
                f"Download spaCy models: {', '.join(spacy_models)}?"):
            for model in spacy_models:
                model_name = self._strip_prefix(model, "spacy_")
                cmd = [sys.executable, "-m", "spacy", "download", model_name]
                if self._run_command(cmd, f"Download spaCy model: {model_name}"):
                    self.fixes_applied.append(f"spaCy model: {model_name}")
                else:
                    self.fixes_failed.append(f"spaCy model: {model_name}")
                    all_ok = False

        # NLTK data
        nltk_data = self._unique([m for m in missing_models if m.startswith("nltk_")])
        if nltk_data and self._ask_confirmation(
                f"Download NLTK data: {', '.join(nltk_data)}?"):
            for data in nltk_data:
                data_name = self._strip_prefix(data, "nltk_")
                cmd = [sys.executable, "-c", self._NLTK_DOWNLOAD_SNIPPET, data_name]
                if self._run_command(cmd, f"Download NLTK data: {data_name}"):
                    self.fixes_applied.append(f"NLTK data: {data_name}")
                else:
                    self.fixes_failed.append(f"NLTK data: {data_name}")
                    all_ok = False

        # Playwright browsers
        if "playwright_browsers" in missing_models and self._ask_confirmation(
                "Install Playwright browsers?"):
            cmd = [sys.executable, "-m", "playwright", "install"]
            if self._run_command(cmd, "Install Playwright browsers"):
                self.fixes_applied.append("Playwright browsers")
            else:
                self.fixes_failed.append("Playwright browsers")
                all_ok = False

        return all_ok

    def fix_dependencies_from_checker(self, checker_results: Dict) -> bool:
        """Fix everything the dependency checker reported as missing.

        Args:
            checker_results: Mapping of tool name to a dict with optional
                ``"system_deps"``, ``"python_deps"`` and ``"model_deps"``
                lists.  Each entry must expose ``name`` and ``status.value``;
                non-dict values are ignored.
                NOTE(review): assumes normalised names (lower-case, spaces
                replaced by ``_``) match the keys this class understands -
                confirm against DependencyChecker.

        Returns:
            True only if every category was fixed without a failure.
        """
        self.logger.info("Starting dependency fixing process...")

        missing_system: List[str] = []
        missing_python: List[str] = []
        missing_models: List[str] = []

        for tool_deps in checker_results.values():
            if not isinstance(tool_deps, dict):
                continue
            for dep_type, deps in tool_deps.items():
                if dep_type == "system_deps":
                    missing_system.extend(
                        dep.name.lower().replace(" ", "_") for dep in self._missing(deps)
                    )
                elif dep_type == "python_deps":
                    missing_python.extend(dep.name for dep in self._missing(deps))
                elif dep_type == "model_deps":
                    missing_models.extend(
                        dep.name.lower().replace(" ", "_") for dep in self._missing(deps)
                    )

        # The same dependency is often missing for several tools.
        missing_system = self._unique(missing_system)
        missing_python = self._unique(missing_python)
        missing_models = self._unique(missing_models)

        success = True
        if missing_system and not self.fix_system_dependencies(missing_system):
            success = False
        if missing_python and not self.fix_python_dependencies(missing_python):
            success = False
        if missing_models and not self.fix_model_dependencies(missing_models):
            success = False

        return success

    def generate_fix_report(self) -> str:
        """Return a human-readable summary of applied and failed fixes."""
        report = []
        report.append("=" * 60)
        report.append("AIECS DEPENDENCY FIX REPORT")
        report.append("=" * 60)

        if self.fixes_applied:
            report.append("\n✅ Successfully Applied Fixes:")
            report.extend(f" • {fix}" for fix in self.fixes_applied)

        if self.fixes_failed:
            report.append("\n❌ Failed Fixes:")
            report.extend(f" • {fix}" for fix in self.fixes_failed)

        if not self.fixes_applied and not self.fixes_failed:
            report.append("\nℹ️ No fixes were applied.")

        report.append(f"\nTotal fixes applied: {len(self.fixes_applied)}")
        report.append(f"Total fixes failed: {len(self.fixes_failed)}")

        if self.fixes_failed:
            report.append("\n⚠️ Some fixes failed. Please check the logs and try manual installation.")
        elif self.fixes_applied:
            # Only claim success when something was actually applied.
            report.append("\n🎉 All fixes applied successfully!")

        return "\n".join(report)
303
+
304
+
305
def main():
    """Command-line entry point: check AIECS dependencies and optionally fix them.

    Returns:
        0 on success (or after a ``--check-only`` report), 1 if a fix failed
        or any exception was raised.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Fix AIECS dependencies")
    cli.add_argument(
        "--non-interactive",
        action="store_true",
        help="Run in non-interactive mode (auto-approve all fixes)",
    )
    cli.add_argument(
        "--check-only",
        action="store_true",
        help="Only check dependencies, don't fix them",
    )
    options = cli.parse_args()

    try:
        # The checker always runs first; its results drive the fixer.
        from aiecs.scripts.dependency_checker import DependencyChecker

        checker = DependencyChecker()
        check_results = checker.check_all_dependencies()

        if options.check_only:
            print(checker.generate_report(check_results))
            return 0

        fixer = DependencyFixer(interactive=not options.non_interactive)
        all_fixed = fixer.fix_dependencies_from_checker(check_results)

        # Show the summary whether or not every fix succeeded.
        print(fixer.generate_fix_report())

        if all_fixed:
            return 0
        return 1

    except Exception as exc:
        print(f"Error: {exc}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
345
+
346
+
347
+
348
+
@@ -0,0 +1,348 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated script to download required NLP data for AIECS ClassifierTool.
4
+
5
+ This script downloads:
6
+ 1. NLTK stopwords data package for keyword extraction
7
+ 2. spaCy English model (en_core_web_sm) for text processing
8
+ 3. spaCy Chinese model (zh_core_web_sm) for Chinese text processing
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import subprocess
14
+ import logging
15
+ from pathlib import Path
16
+ from typing import List, Tuple, Optional
17
+
18
+
19
def setup_logging():
    """Configure root logging and return this module's logger.

    Records at INFO and above go to stdout and to ``nlp_data_download.log``
    in the current working directory.
    """
    console_handler = logging.StreamHandler(sys.stdout)
    logfile_handler = logging.FileHandler('nlp_data_download.log')

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[console_handler, logfile_handler],
    )

    module_logger = logging.getLogger(__name__)
    return module_logger
30
+
31
+
32
def run_command(cmd: List[str], logger: logging.Logger,
                timeout: Optional[float] = None) -> Tuple[bool, str]:
    """
    Run a command (without a shell) and return success status and output.

    Args:
        cmd: List of command arguments; the first item is the executable
        logger: Logger instance
        timeout: Optional limit in seconds; ``None`` (default) waits forever

    Returns:
        Tuple of (success, output). On success ``output`` is the captured
        stdout; on failure it is a description of what went wrong. No
        exception is raised for a failing, missing, unstartable or
        timed-out command.
    """
    if not cmd:
        # subprocess.run([]) would fail with an unrelated low-level error.
        error_msg = "No command given"
        logger.error(error_msg)
        return False, error_msg

    try:
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout
        )
    except subprocess.CalledProcessError as e:
        error_msg = f"Command failed with exit code {e.returncode}: {e.stderr}"
    except subprocess.TimeoutExpired:
        error_msg = f"Command timed out after {timeout} seconds: {cmd[0]}"
    except FileNotFoundError:
        error_msg = f"Command not found: {cmd[0]}"
    except OSError as e:
        # e.g. PermissionError when the file exists but is not executable.
        error_msg = f"Command could not be started: {cmd[0]} ({e})"
    else:
        logger.info(f"Command succeeded: {result.stdout}")
        return True, result.stdout

    logger.error(error_msg)
    return False, error_msg
61
+
62
+
63
def check_python_package(package_name: str, logger: logging.Logger) -> bool:
    """
    Report whether a Python package can be imported.

    The package is actually imported as part of the check.

    Args:
        package_name: Importable name of the package to check
        logger: Logger instance

    Returns:
        True if the import succeeds, False if it raises ImportError
    """
    try:
        __import__(package_name)
    except ImportError:
        logger.warning(f"Package {package_name} is not installed")
        return False
    else:
        logger.info(f"Package {package_name} is already installed")
        return True
81
+
82
+
83
def download_nltk_data(logger: logging.Logger) -> bool:
    """
    Download required NLTK data packages.

    Downloads ``stopwords``, ``punkt``, ``wordnet`` and
    ``averaged_perceptron_tagger``, stopping at the first failure.

    Args:
        logger: Logger instance

    Returns:
        True if every package was downloaded, False if NLTK is not
        installed or any download failed
    """
    logger.info("Starting NLTK data download...")

    if not check_python_package('nltk', logger):
        logger.error("NLTK is not installed. Please install it first with: pip install nltk")
        return False

    try:
        import nltk

        # Download required NLTK data
        packages_to_download = [
            'stopwords',
            'punkt',
            'wordnet',
            'averaged_perceptron_tagger'
        ]

        for package in packages_to_download:
            logger.info(f"Downloading NLTK package: {package}")
            try:
                downloaded = nltk.download(package, quiet=True)
            except Exception as e:
                logger.error(f"Failed to download NLTK package {package}: {e}")
                return False

            # nltk.download signals most failures (network error, unknown
            # resource) by returning False rather than raising.
            if not downloaded:
                logger.error(
                    f"Failed to download NLTK package {package}: downloader reported failure"
                )
                return False

            logger.info(f"Successfully downloaded NLTK package: {package}")

        logger.info("All NLTK data packages downloaded successfully")
        return True

    except Exception as e:
        logger.error(f"Error downloading NLTK data: {e}")
        return False
125
+
126
+
127
def download_spacy_model(model_name: str, logger: logging.Logger) -> bool:
    """
    Download a spaCy model unless it is already loadable.

    Args:
        model_name: Name of the spaCy model to download
        logger: Logger instance

    Returns:
        True if the model was already installed, or was downloaded and can
        be loaded; False otherwise
    """
    logger.info(f"Starting spaCy model download: {model_name}")

    if not check_python_package('spacy', logger):
        logger.error("spaCy is not installed. Please install it first with: pip install spacy")
        return False

    import importlib
    import spacy

    # Check if model is already installed
    try:
        spacy.load(model_name)
        logger.info(f"spaCy model {model_name} is already installed")
        return True
    except OSError:
        # Model not installed, proceed with download
        pass
    except Exception as e:
        logger.error(f"Error checking spaCy model {model_name}: {e}")
        return False

    # Download the model in a child interpreter
    cmd = [sys.executable, "-m", "spacy", "download", model_name]
    success, output = run_command(cmd, logger)

    if not success:
        logger.error(f"Failed to download spaCy model {model_name}: {output}")
        return False

    logger.info(f"Successfully downloaded spaCy model: {model_name}")

    # The model was installed by another process after this interpreter
    # already failed to find it; drop the import system's cached directory
    # listings so the new package is visible here.
    importlib.invalidate_caches()

    # Verify the model can be loaded
    try:
        spacy.load(model_name)
        logger.info(f"Verified spaCy model {model_name} can be loaded")
        return True
    except Exception as e:
        logger.error(f"Downloaded model {model_name} cannot be loaded: {e}")
        return False
176
+
177
+
178
def download_spacy_pkuseg_model(logger: logging.Logger) -> bool:
    """
    Install the spacy_pkuseg package used for Chinese text segmentation.

    Args:
        logger: Logger instance

    Returns:
        True if spacy_pkuseg was already importable, or was installed and
        passed a small segmentation smoke test; False otherwise
    """
    logger.info("Starting spaCy PKUSeg model installation...")

    if not check_python_package('spacy', logger):
        logger.error("spaCy is not installed. Please install it first with: pip install spacy")
        return False

    # Check if spacy_pkuseg is already installed
    if check_python_package('spacy_pkuseg', logger):
        logger.info("spacy_pkuseg is already installed")
        return True

    # Install spacy_pkuseg package into the running interpreter's environment
    cmd = [sys.executable, "-m", "pip", "install", "spacy_pkuseg"]
    success, output = run_command(cmd, logger)

    if not success:
        logger.error(f"Failed to install spacy_pkuseg: {output}")
        return False

    logger.info("Successfully installed spacy_pkuseg")

    # The import attempt above failed and the package was then installed by
    # a child process; clear the import system's caches so the retry below
    # sees the new files.
    import importlib
    importlib.invalidate_caches()

    # Verify the package can be imported and used
    try:
        import spacy_pkuseg
        logger.info("Verified spacy_pkuseg can be imported")

        # Test basic functionality
        seg = spacy_pkuseg.pkuseg()
        test_result = seg.cut("这是一个测试句子")
        logger.info(f"spacy_pkuseg test successful: {list(test_result)}")
        return True
    except Exception as e:
        logger.error(f"Installed spacy_pkuseg cannot be used: {e}")
        return False
222
+
223
+
224
def download_rake_nltk_data(logger: logging.Logger) -> bool:
    """
    Smoke-test RAKE-NLTK when it is installed.

    RAKE-NLTK is treated as optional: a missing package or a failing test
    only produces a warning.

    Args:
        logger: Logger instance

    Returns:
        Always True
    """
    logger.info("Checking RAKE-NLTK data...")

    rake_available = check_python_package('rake_nltk', logger)
    if not rake_available:
        logger.warning("RAKE-NLTK is not installed. This is optional for English keyword extraction.")
        return True  # optional component

    try:
        from rake_nltk import Rake

        # Run one extraction end to end; the ranked phrases are not needed.
        extractor = Rake()
        extractor.extract_keywords_from_text("This is a test sentence for RAKE.")
        extractor.get_ranked_phrases()
        logger.info("RAKE-NLTK is working correctly")
    except Exception as exc:
        logger.warning(f"RAKE-NLTK test failed: {exc}. This is not critical.")

    return True  # optional component
251
+
252
+
253
def verify_installation(logger: logging.Logger) -> bool:
    """
    Verify the NLP components by exercising each one once.

    NLTK stopwords and the spaCy English model are required; the spaCy
    Chinese model and spacy_pkuseg are optional and only produce warnings.

    Args:
        logger: Logger instance

    Returns:
        True if both required components work, False otherwise
    """
    logger.info("Verifying NLP data installation...")

    def _probe_nltk() -> str:
        import nltk
        from nltk.corpus import stopwords
        words = stopwords.words('english')
        return f"NLTK verification successful. Loaded {len(words)} English stopwords"

    def _probe_spacy_english() -> str:
        import spacy
        pipeline = spacy.load('en_core_web_sm')
        parsed = pipeline("This is a test sentence.")
        return f"spaCy English model verification successful. Processed {len(parsed)} tokens"

    def _probe_spacy_chinese() -> str:
        import spacy
        pipeline = spacy.load('zh_core_web_sm')
        parsed = pipeline("这是一个测试句子。")
        return f"spaCy Chinese model verification successful. Processed {len(parsed)} tokens"

    def _probe_pkuseg() -> str:
        import spacy_pkuseg
        segmenter = spacy_pkuseg.pkuseg()
        pieces = list(segmenter.cut("这是一个测试句子"))
        return f"spaCy PKUSeg model verification successful. Segmented: {pieces}"

    # (label used in failure messages, probe, whether a failure is fatal)
    probes = (
        ("NLTK", _probe_nltk, True),
        ("spaCy English model", _probe_spacy_english, True),
        ("spaCy Chinese model", _probe_spacy_chinese, False),
        ("spaCy PKUSeg model", _probe_pkuseg, False),
    )

    all_required_ok = True
    for label, probe, required in probes:
        try:
            logger.info(probe())
        except Exception as exc:
            if required:
                logger.error(f"{label} verification failed: {exc}")
                all_required_ok = False
            else:
                logger.warning(f"{label} verification failed: {exc}. This is optional.")

    return all_required_ok
306
+
307
+
308
def main():
    """Download all NLP data needed by the AIECS ClassifierTool.

    Returns:
        0 when the required downloads (NLTK data, spaCy English model)
        succeeded and verification passed, 1 otherwise.
    """
    logger = setup_logging()
    logger.info("Starting AIECS NLP data download process...")

    # Required components
    nltk_ok = download_nltk_data(logger)
    english_ok = download_spacy_model('en_core_web_sm', logger)

    # Optional components: failures are reported but never fatal
    chinese_ok = download_spacy_model('zh_core_web_sm', logger)
    if not chinese_ok:
        logger.warning("Chinese model download failed, but this is optional")

    pkuseg_ok = download_spacy_pkuseg_model(logger)
    if not pkuseg_ok:
        logger.warning("spaCy PKUSeg model download failed, but this is optional")

    download_rake_nltk_data(logger)

    # Verification only runs when the required downloads succeeded.
    required_ok = nltk_ok and english_ok
    if not (required_ok and verify_installation(logger)):
        logger.error("❌ Some NLP data downloads failed. Please check the logs above.")
        logger.error("You may need to install missing packages or run this script again.")
        return 1

    logger.info("✅ All NLP data downloaded and verified successfully!")
    logger.info("AIECS ClassifierTool is ready to use.")
    return 0


if __name__ == "__main__":
    sys.exit(main())