aiecs-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of aiecs might be problematic.
- aiecs/__init__.py +75 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +295 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +341 -0
- aiecs/config/__init__.py +15 -0
- aiecs/config/config.py +117 -0
- aiecs/config/registry.py +19 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +150 -0
- aiecs/core/interface/storage_interface.py +214 -0
- aiecs/domain/__init__.py +20 -0
- aiecs/domain/context/__init__.py +28 -0
- aiecs/domain/context/content_engine.py +982 -0
- aiecs/domain/context/conversation_models.py +306 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +49 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +460 -0
- aiecs/domain/task/model.py +50 -0
- aiecs/domain/task/task_context.py +257 -0
- aiecs/infrastructure/__init__.py +26 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
- aiecs/infrastructure/messaging/websocket_manager.py +289 -0
- aiecs/infrastructure/monitoring/__init__.py +12 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
- aiecs/infrastructure/monitoring/structured_logger.py +50 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
- aiecs/infrastructure/persistence/__init__.py +12 -0
- aiecs/infrastructure/persistence/database_manager.py +286 -0
- aiecs/infrastructure/persistence/file_storage.py +671 -0
- aiecs/infrastructure/persistence/redis_client.py +162 -0
- aiecs/llm/__init__.py +54 -0
- aiecs/llm/base_client.py +99 -0
- aiecs/llm/client_factory.py +339 -0
- aiecs/llm/custom_callbacks.py +228 -0
- aiecs/llm/openai_client.py +125 -0
- aiecs/llm/vertex_client.py +186 -0
- aiecs/llm/xai_client.py +184 -0
- aiecs/main.py +351 -0
- aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
- aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
- aiecs/scripts/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/dependency_checker.py +825 -0
- aiecs/scripts/dependency_fixer.py +348 -0
- aiecs/scripts/download_nlp_data.py +348 -0
- aiecs/scripts/fix_weasel_validator.py +121 -0
- aiecs/scripts/fix_weasel_validator.sh +82 -0
- aiecs/scripts/patch_weasel_library.sh +188 -0
- aiecs/scripts/quick_dependency_check.py +269 -0
- aiecs/scripts/run_weasel_patch.sh +41 -0
- aiecs/scripts/setup_nlp_data.sh +217 -0
- aiecs/tasks/__init__.py +2 -0
- aiecs/tasks/worker.py +111 -0
- aiecs/tools/__init__.py +196 -0
- aiecs/tools/base_tool.py +202 -0
- aiecs/tools/langchain_adapter.py +361 -0
- aiecs/tools/task_tools/__init__.py +82 -0
- aiecs/tools/task_tools/chart_tool.py +704 -0
- aiecs/tools/task_tools/classfire_tool.py +901 -0
- aiecs/tools/task_tools/image_tool.py +397 -0
- aiecs/tools/task_tools/office_tool.py +600 -0
- aiecs/tools/task_tools/pandas_tool.py +565 -0
- aiecs/tools/task_tools/report_tool.py +499 -0
- aiecs/tools/task_tools/research_tool.py +363 -0
- aiecs/tools/task_tools/scraper_tool.py +548 -0
- aiecs/tools/task_tools/search_api.py +7 -0
- aiecs/tools/task_tools/stats_tool.py +513 -0
- aiecs/tools/temp_file_manager.py +126 -0
- aiecs/tools/tool_executor/__init__.py +35 -0
- aiecs/tools/tool_executor/tool_executor.py +518 -0
- aiecs/utils/LLM_output_structor.py +409 -0
- aiecs/utils/__init__.py +23 -0
- aiecs/utils/base_callback.py +50 -0
- aiecs/utils/execution_utils.py +158 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +13 -0
- aiecs/utils/token_usage_repository.py +279 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +41 -0
- aiecs-1.0.0.dist-info/METADATA +610 -0
- aiecs-1.0.0.dist-info/RECORD +90 -0
- aiecs-1.0.0.dist-info/WHEEL +5 -0
- aiecs-1.0.0.dist-info/entry_points.txt +7 -0
- aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
- aiecs-1.0.0.dist-info/top_level.txt +1 -0
aiecs/scripts/dependency_fixer.py
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
Dependency fixer for AIECS tools.

This script automatically installs missing dependencies based on the
dependency checker results.
"""

import os
import sys
import subprocess
import platform
import logging
from typing import Dict, List, Tuple, Optional
from pathlib import Path


class DependencyFixer:
    """Automatic dependency fixer for AIECS."""

    def __init__(self, interactive: bool = True):
        self.logger = self._setup_logging()
        self.system = platform.system().lower()
        self.interactive = interactive
        self.fixes_applied = []
        self.fixes_failed = []

    def _setup_logging(self) -> logging.Logger:
        """Setup logging configuration."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler('dependency_fix.log')
            ]
        )
        return logging.getLogger(__name__)

    def _run_command(self, cmd: List[str], description: str) -> bool:
        """Run a command and return success status."""
        try:
            self.logger.info(f"Running: {description}")
            self.logger.info(f"Command: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
                timeout=300  # 5 minutes timeout
            )

            self.logger.info(f"Success: {description}")
            if result.stdout:
                self.logger.info(f"Output: {result.stdout}")

            return True

        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed: {description}")
            self.logger.error(f"Error: {e.stderr}")
            return False
        except subprocess.TimeoutExpired:
            self.logger.error(f"Timeout: {description}")
            return False
        except Exception as e:
            self.logger.error(f"Error: {description} - {e}")
            return False

    def _ask_confirmation(self, message: str) -> bool:
        """Ask for user confirmation if in interactive mode."""
        if not self.interactive:
            return True

        while True:
            response = input(f"{message} (y/n): ").lower().strip()
            if response in ['y', 'yes']:
                return True
            elif response in ['n', 'no']:
                return False
            else:
                print("Please enter 'y' or 'n'")

    def fix_system_dependencies(self, missing_deps: List[str]) -> bool:
        """Fix missing system dependencies."""
        if not missing_deps:
            return True

        self.logger.info("Fixing system dependencies...")

        # Group dependencies by package manager
        apt_packages = []
        brew_packages = []
        pip_packages = []

        for dep in missing_deps:
            if dep == "java":
                if self.system == "linux":
                    apt_packages.append("openjdk-11-jdk")
                elif self.system == "darwin":
                    brew_packages.append("openjdk@11")
            elif dep == "tesseract":
                if self.system == "linux":
                    apt_packages.extend(["tesseract-ocr", "tesseract-ocr-eng"])
                elif self.system == "darwin":
                    brew_packages.append("tesseract")
            elif dep == "tesseract_lang_packs":
                if self.system == "linux":
                    apt_packages.extend([
                        "tesseract-ocr-chi-sim", "tesseract-ocr-chi-tra",
                        "tesseract-ocr-fra", "tesseract-ocr-deu",
                        "tesseract-ocr-jpn", "tesseract-ocr-kor",
                        "tesseract-ocr-rus", "tesseract-ocr-spa"
                    ])
            elif dep == "pillow_system_deps":
                if self.system == "linux":
                    apt_packages.extend([
                        "libjpeg-dev", "zlib1g-dev", "libpng-dev",
                        "libtiff-dev", "libwebp-dev", "libopenjp2-7-dev"
                    ])
                elif self.system == "darwin":
                    brew_packages.extend(["libjpeg", "zlib", "libpng", "libtiff", "webp", "openjpeg"])
            elif dep == "pyreadstat_deps":
                if self.system == "linux":
                    apt_packages.append("libreadstat-dev")
                elif self.system == "darwin":
                    brew_packages.append("readstat")
            elif dep == "weasyprint_deps":
                if self.system == "linux":
                    apt_packages.extend([
                        "libcairo2-dev", "libpango1.0-dev", "libgdk-pixbuf2.0-dev",
                        "libffi-dev", "shared-mime-info"
                    ])
                elif self.system == "darwin":
                    brew_packages.extend(["cairo", "pango", "gdk-pixbuf", "libffi"])
            elif dep == "matplotlib_deps":
                if self.system == "linux":
                    apt_packages.extend([
                        "libfreetype6-dev", "libpng-dev", "libjpeg-dev",
                        "libtiff-dev", "libwebp-dev"
                    ])
                elif self.system == "darwin":
                    brew_packages.extend(["freetype", "libpng", "libjpeg", "libtiff", "webp"])

        # Install apt packages
        if apt_packages and self.system == "linux":
            if self._ask_confirmation(f"Install system packages: {', '.join(apt_packages)}?"):
                cmd = ["sudo", "apt-get", "update"]
                if self._run_command(cmd, "Update package list"):
                    cmd = ["sudo", "apt-get", "install", "-y"] + apt_packages
                    if self._run_command(cmd, f"Install packages: {', '.join(apt_packages)}"):
                        self.fixes_applied.append(f"System packages: {', '.join(apt_packages)}")
                    else:
                        self.fixes_failed.append(f"System packages: {', '.join(apt_packages)}")
                        return False

        # Install brew packages
        if brew_packages and self.system == "darwin":
            if self._ask_confirmation(f"Install Homebrew packages: {', '.join(brew_packages)}?"):
                for package in brew_packages:
                    cmd = ["brew", "install", package]
                    if self._run_command(cmd, f"Install {package}"):
                        self.fixes_applied.append(f"Homebrew package: {package}")
                    else:
                        self.fixes_failed.append(f"Homebrew package: {package}")
                        return False

        return True

    def fix_python_dependencies(self, missing_packages: List[str]) -> bool:
        """Fix missing Python packages."""
        if not missing_packages:
            return True

        self.logger.info("Fixing Python dependencies...")

        if self._ask_confirmation(f"Install Python packages: {', '.join(missing_packages)}?"):
            cmd = [sys.executable, "-m", "pip", "install"] + missing_packages
            if self._run_command(cmd, f"Install Python packages: {', '.join(missing_packages)}"):
                self.fixes_applied.append(f"Python packages: {', '.join(missing_packages)}")
                return True
            else:
                self.fixes_failed.append(f"Python packages: {', '.join(missing_packages)}")
                return False

        return True

    def fix_model_dependencies(self, missing_models: List[str]) -> bool:
        """Fix missing model dependencies."""
        if not missing_models:
            return True

        self.logger.info("Fixing model dependencies...")

        # spaCy models
        spacy_models = [m for m in missing_models if m.startswith("spacy_")]
        if spacy_models:
            if self._ask_confirmation(f"Download spaCy models: {', '.join(spacy_models)}?"):
                for model in spacy_models:
                    model_name = model.replace("spacy_", "")
                    cmd = [sys.executable, "-m", "spacy", "download", model_name]
                    if self._run_command(cmd, f"Download spaCy model: {model_name}"):
                        self.fixes_applied.append(f"spaCy model: {model_name}")
                    else:
                        self.fixes_failed.append(f"spaCy model: {model_name}")

        # NLTK data
        nltk_data = [m for m in missing_models if m.startswith("nltk_")]
        if nltk_data:
            if self._ask_confirmation(f"Download NLTK data: {', '.join(nltk_data)}?"):
                for data in nltk_data:
                    data_name = data.replace("nltk_", "")
                    cmd = [sys.executable, "-c", f"import nltk; nltk.download('{data_name}')"]
                    if self._run_command(cmd, f"Download NLTK data: {data_name}"):
                        self.fixes_applied.append(f"NLTK data: {data_name}")
                    else:
                        self.fixes_failed.append(f"NLTK data: {data_name}")

        # Playwright browsers
        if "playwright_browsers" in missing_models:
            if self._ask_confirmation("Install Playwright browsers?"):
                cmd = [sys.executable, "-m", "playwright", "install"]
                if self._run_command(cmd, "Install Playwright browsers"):
                    self.fixes_applied.append("Playwright browsers")
                else:
                    self.fixes_failed.append("Playwright browsers")

        return True

    def fix_dependencies_from_checker(self, checker_results: Dict) -> bool:
        """Fix dependencies based on checker results."""
        self.logger.info("Starting dependency fixing process...")

        # Extract missing dependencies
        missing_system = []
        missing_python = []
        missing_models = []

        for tool_name, tool_deps in checker_results.items():
            if isinstance(tool_deps, dict):
                # Handle tool-specific dependencies
                for dep_type, deps in tool_deps.items():
                    if dep_type == "system_deps":
                        for dep in deps:
                            if dep.status.value == "missing":
                                missing_system.append(dep.name.lower().replace(" ", "_"))
                    elif dep_type == "python_deps":
                        for dep in deps:
                            if dep.status.value == "missing":
                                missing_python.append(dep.name)
                    elif dep_type == "model_deps":
                        for dep in deps:
                            if dep.status.value == "missing":
                                missing_models.append(f"{dep.name.lower().replace(' ', '_')}")

        # Apply fixes
        success = True

        if missing_system:
            if not self.fix_system_dependencies(missing_system):
                success = False

        if missing_python:
            if not self.fix_python_dependencies(missing_python):
                success = False

        if missing_models:
            if not self.fix_model_dependencies(missing_models):
                success = False

        return success

    def generate_fix_report(self) -> str:
        """Generate a report of fixes applied."""
        report = []
        report.append("=" * 60)
        report.append("AIECS DEPENDENCY FIX REPORT")
        report.append("=" * 60)

        if self.fixes_applied:
            report.append("\n✅ Successfully Applied Fixes:")
            for fix in self.fixes_applied:
                report.append(f" • {fix}")

        if self.fixes_failed:
            report.append("\n❌ Failed Fixes:")
            for fix in self.fixes_failed:
                report.append(f" • {fix}")

        if not self.fixes_applied and not self.fixes_failed:
            report.append("\nℹ️ No fixes were applied.")

        report.append(f"\nTotal fixes applied: {len(self.fixes_applied)}")
        report.append(f"Total fixes failed: {len(self.fixes_failed)}")

        if self.fixes_failed:
            report.append("\n⚠️ Some fixes failed. Please check the logs and try manual installation.")
        else:
            report.append("\n🎉 All fixes applied successfully!")

        return "\n".join(report)


def main():
    """Main function."""
    import argparse

    parser = argparse.ArgumentParser(description="Fix AIECS dependencies")
    parser.add_argument("--non-interactive", action="store_true",
                        help="Run in non-interactive mode (auto-approve all fixes)")
    parser.add_argument("--check-only", action="store_true",
                        help="Only check dependencies, don't fix them")

    args = parser.parse_args()

    # Import and run dependency checker first
    try:
        from aiecs.scripts.dependency_checker import DependencyChecker
        checker = DependencyChecker()
        tools = checker.check_all_dependencies()

        if args.check_only:
            report = checker.generate_report(tools)
            print(report)
            return 0

        # Run fixer
        fixer = DependencyFixer(interactive=not args.non_interactive)
        success = fixer.fix_dependencies_from_checker(tools)

        # Generate and display report
        report = fixer.generate_fix_report()
        print(report)

        return 0 if success else 1

    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
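For context, the fixer can also be driven directly from Python rather than through main(). The following is a minimal, hypothetical sketch based only on the class defined above; the module path assumes the aiecs package is installed, and the package list passed to fix_python_dependencies is purely illustrative.

# Minimal usage sketch (hypothetical): drive DependencyFixer without the CLI.
# Assumes aiecs is installed so aiecs.scripts.dependency_fixer is importable.
from aiecs.scripts.dependency_fixer import DependencyFixer

fixer = DependencyFixer(interactive=False)        # auto-approve fixes, like --non-interactive
fixer.fix_python_dependencies(["nltk", "spacy"])  # illustrative package list
print(fixer.generate_fix_report())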
aiecs/scripts/download_nlp_data.py
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
Automated script to download required NLP data for AIECS ClassifierTool.

This script downloads:
1. NLTK stopwords data package for keyword extraction
2. spaCy English model (en_core_web_sm) for text processing
3. spaCy Chinese model (zh_core_web_sm) for Chinese text processing
"""

import os
import sys
import subprocess
import logging
from pathlib import Path
from typing import List, Tuple, Optional


def setup_logging():
    """Setup logging configuration."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler('nlp_data_download.log')
        ]
    )
    return logging.getLogger(__name__)


def run_command(cmd: List[str], logger: logging.Logger) -> Tuple[bool, str]:
    """
    Run a shell command and return success status and output.

    Args:
        cmd: List of command arguments
        logger: Logger instance

    Returns:
        Tuple of (success, output)
    """
    try:
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True
        )
        logger.info(f"Command succeeded: {result.stdout}")
        return True, result.stdout
    except subprocess.CalledProcessError as e:
        error_msg = f"Command failed with exit code {e.returncode}: {e.stderr}"
        logger.error(error_msg)
        return False, error_msg
    except FileNotFoundError:
        error_msg = f"Command not found: {cmd[0]}"
        logger.error(error_msg)
        return False, error_msg


def check_python_package(package_name: str, logger: logging.Logger) -> bool:
    """
    Check if a Python package is installed.

    Args:
        package_name: Name of the package to check
        logger: Logger instance

    Returns:
        True if package is installed, False otherwise
    """
    try:
        __import__(package_name)
        logger.info(f"Package {package_name} is already installed")
        return True
    except ImportError:
        logger.warning(f"Package {package_name} is not installed")
        return False


def download_nltk_data(logger: logging.Logger) -> bool:
    """
    Download required NLTK data packages.

    Args:
        logger: Logger instance

    Returns:
        True if successful, False otherwise
    """
    logger.info("Starting NLTK data download...")

    if not check_python_package('nltk', logger):
        logger.error("NLTK is not installed. Please install it first with: pip install nltk")
        return False

    try:
        import nltk

        # Download required NLTK data
        packages_to_download = [
            'stopwords',
            'punkt',
            'wordnet',
            'averaged_perceptron_tagger'
        ]

        for package in packages_to_download:
            try:
                logger.info(f"Downloading NLTK package: {package}")
                nltk.download(package, quiet=True)
                logger.info(f"Successfully downloaded NLTK package: {package}")
            except Exception as e:
                logger.error(f"Failed to download NLTK package {package}: {e}")
                return False

        logger.info("All NLTK data packages downloaded successfully")
        return True

    except Exception as e:
        logger.error(f"Error downloading NLTK data: {e}")
        return False


def download_spacy_model(model_name: str, logger: logging.Logger) -> bool:
    """
    Download a spaCy model.

    Args:
        model_name: Name of the spaCy model to download
        logger: Logger instance

    Returns:
        True if successful, False otherwise
    """
    logger.info(f"Starting spaCy model download: {model_name}")

    if not check_python_package('spacy', logger):
        logger.error("spaCy is not installed. Please install it first with: pip install spacy")
        return False

    # Check if model is already installed
    try:
        import spacy
        spacy.load(model_name)
        logger.info(f"spaCy model {model_name} is already installed")
        return True
    except OSError:
        # Model not installed, proceed with download
        pass
    except Exception as e:
        logger.error(f"Error checking spaCy model {model_name}: {e}")
        return False

    # Download the model
    cmd = [sys.executable, "-m", "spacy", "download", model_name]
    success, output = run_command(cmd, logger)

    if success:
        logger.info(f"Successfully downloaded spaCy model: {model_name}")

        # Verify the model can be loaded
        try:
            import spacy
            spacy.load(model_name)
            logger.info(f"Verified spaCy model {model_name} can be loaded")
            return True
        except Exception as e:
            logger.error(f"Downloaded model {model_name} cannot be loaded: {e}")
            return False
    else:
        logger.error(f"Failed to download spaCy model {model_name}: {output}")
        return False


def download_spacy_pkuseg_model(logger: logging.Logger) -> bool:
    """
    Download and install spaCy PKUSeg model for Chinese text segmentation.

    Args:
        logger: Logger instance

    Returns:
        True if successful, False otherwise
    """
    logger.info("Starting spaCy PKUSeg model installation...")

    if not check_python_package('spacy', logger):
        logger.error("spaCy is not installed. Please install it first with: pip install spacy")
        return False

    # Check if spacy_pkuseg is already installed
    if check_python_package('spacy_pkuseg', logger):
        logger.info("spacy_pkuseg is already installed")
        return True

    # Install spacy_pkuseg package
    cmd = [sys.executable, "-m", "pip", "install", "spacy_pkuseg"]
    success, output = run_command(cmd, logger)

    if success:
        logger.info("Successfully installed spacy_pkuseg")

        # Verify the package can be imported
        try:
            import spacy_pkuseg
            logger.info("Verified spacy_pkuseg can be imported")

            # Test basic functionality
            seg = spacy_pkuseg.pkuseg()
            test_result = seg.cut("这是一个测试句子")
            logger.info(f"spacy_pkuseg test successful: {list(test_result)}")
            return True
        except Exception as e:
            logger.error(f"Installed spacy_pkuseg cannot be used: {e}")
            return False
    else:
        logger.error(f"Failed to install spacy_pkuseg: {output}")
        return False


def download_rake_nltk_data(logger: logging.Logger) -> bool:
    """
    Ensure RAKE-NLTK has required data.

    Args:
        logger: Logger instance

    Returns:
        True if successful, False otherwise
    """
    logger.info("Checking RAKE-NLTK data...")

    if not check_python_package('rake_nltk', logger):
        logger.warning("RAKE-NLTK is not installed. This is optional for English keyword extraction.")
        return True  # Not critical, return True

    try:
        from rake_nltk import Rake
        # Test RAKE functionality
        rake = Rake()
        rake.extract_keywords_from_text("This is a test sentence for RAKE.")
        keywords = rake.get_ranked_phrases()
        logger.info("RAKE-NLTK is working correctly")
        return True
    except Exception as e:
        logger.warning(f"RAKE-NLTK test failed: {e}. This is not critical.")
        return True  # Not critical, return True


def verify_installation(logger: logging.Logger) -> bool:
    """
    Verify all NLP components are properly installed.

    Args:
        logger: Logger instance

    Returns:
        True if all components work, False otherwise
    """
    logger.info("Verifying NLP data installation...")

    success = True

    # Test NLTK
    try:
        import nltk
        from nltk.corpus import stopwords
        english_stopwords = stopwords.words('english')
        logger.info(f"NLTK verification successful. Loaded {len(english_stopwords)} English stopwords")
    except Exception as e:
        logger.error(f"NLTK verification failed: {e}")
        success = False

    # Test spaCy English model
    try:
        import spacy
        nlp_en = spacy.load('en_core_web_sm')
        doc = nlp_en("This is a test sentence.")
        logger.info(f"spaCy English model verification successful. Processed {len(doc)} tokens")
    except Exception as e:
        logger.error(f"spaCy English model verification failed: {e}")
        success = False

    # Test spaCy Chinese model (optional)
    try:
        import spacy
        nlp_zh = spacy.load('zh_core_web_sm')
        doc = nlp_zh("这是一个测试句子。")
        logger.info(f"spaCy Chinese model verification successful. Processed {len(doc)} tokens")
    except Exception as e:
        logger.warning(f"spaCy Chinese model verification failed: {e}. This is optional.")

    # Test spaCy PKUSeg model (optional)
    try:
        import spacy_pkuseg
        seg = spacy_pkuseg.pkuseg()
        result = list(seg.cut("这是一个测试句子"))
        logger.info(f"spaCy PKUSeg model verification successful. Segmented: {result}")
    except Exception as e:
        logger.warning(f"spaCy PKUSeg model verification failed: {e}. This is optional.")

    return success


def main():
    """Main function to download all required NLP data."""
    logger = setup_logging()
    logger.info("Starting AIECS NLP data download process...")

    success = True

    # Download NLTK data
    if not download_nltk_data(logger):
        success = False

    # Download spaCy English model
    if not download_spacy_model('en_core_web_sm', logger):
        success = False

    # Download spaCy Chinese model (optional)
    if not download_spacy_model('zh_core_web_sm', logger):
        logger.warning("Chinese model download failed, but this is optional")
        # Don't mark as failure for Chinese model

    # Download spaCy Chinese segmentation model (optional)
    if not download_spacy_pkuseg_model(logger):
        logger.warning("spaCy PKUSeg model download failed, but this is optional")
        # Don't mark as failure for PKUSeg model

    # Check RAKE-NLTK (optional)
    download_rake_nltk_data(logger)

    # Verify installation
    if success and verify_installation(logger):
        logger.info("✅ All NLP data downloaded and verified successfully!")
        logger.info("AIECS ClassifierTool is ready to use.")
        return 0
    else:
        logger.error("❌ Some NLP data downloads failed. Please check the logs above.")
        logger.error("You may need to install missing packages or run this script again.")
        return 1


if __name__ == "__main__":
    sys.exit(main())
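Similarly, the individual download helpers above can be called from Python instead of running the whole script. This is a minimal, hypothetical sketch; it assumes nltk and spacy are already installed, as the functions themselves require.

# Minimal usage sketch (hypothetical) built on the functions defined above.
from aiecs.scripts.download_nlp_data import (
    setup_logging, download_nltk_data, download_spacy_model, verify_installation
)

logger = setup_logging()
download_nltk_data(logger)                      # stopwords, punkt, wordnet, tagger
download_spacy_model("en_core_web_sm", logger)  # English model; zh_core_web_sm is optional
if verify_installation(logger):
    print("NLP data is ready for the AIECS ClassifierTool.")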