rust-crate-pipeline 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -7,66 +7,63 @@ and improve the user experience in production environments.
|
|
7
7
|
|
8
8
|
import logging
|
9
9
|
import os
|
10
|
+
from typing import Any
|
10
11
|
|
11
12
|
# Production logging configuration
|
12
13
|
|
13
14
|
|
14
|
-
def configure_production_logging() -> None:
    """Lower the verbosity of noisy loggers for production runs.

    Only logger levels are touched here; handlers and formats are not
    configured (``basicConfig`` is deliberately left to main.py).
    """
    # The HTTP-stack loggers are capped at WARNING in every mode.
    for noisy_name in ("requests", "urllib3", "requests_cache"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

    # PRODUCTION=true (any letter case) additionally quiets the root logger,
    # while the package's own logger is set to INFO.
    production_flag = os.getenv("PRODUCTION", "false")
    if production_flag.lower() == "true":
        logging.getLogger().setLevel(logging.WARNING)
        logging.getLogger("rust_crate_pipeline").setLevel(logging.INFO)
|
27
28
|
|
28
29
|
|
29
30
|
# Production-optimized settings
|
30
|
-
# Defaults handed out (as copies) by get_production_config().
PRODUCTION_SETTINGS: "dict[str, Any]" = dict(
    # Reduced retries to minimize warnings
    max_retries=2,
    validation_retries=2,
    # GitHub API management
    github_rate_limit_threshold=100,
    github_critical_threshold=50,
    # LLM settings
    llm_timeout=30,
    llm_max_attempts=2,
    # Logging preferences
    quiet_mode=True,
    log_level="INFO",
    # Performance settings
    batch_size=10,
    checkpoint_interval=10,
    cache_ttl=3600,
)
|
52
49
|
|
53
50
|
|
54
|
-
def get_production_config() -> "dict[str, Any]":
    """Return a fresh shallow copy of ``PRODUCTION_SETTINGS``.

    Adding, removing or rebinding keys on the returned dict does not affect
    the module-level table.
    """
    return dict(PRODUCTION_SETTINGS)
|
57
54
|
|
58
55
|
|
59
|
-
def is_production() -> bool:
    """Report whether the PRODUCTION environment variable equals "true".

    The comparison ignores letter case; an unset variable counts as "false".
    """
    raw_flag = os.environ.get("PRODUCTION", "false")
    return raw_flag.lower() == "true"
|
62
59
|
|
63
60
|
|
64
|
-
def setup_production_environment():
|
61
|
+
def setup_production_environment() -> "dict[str, Any]":
|
65
62
|
"""Set up the complete production environment"""
|
66
63
|
configure_production_logging()
|
67
64
|
|
68
65
|
# Set environment variables for quieter operation
|
69
|
-
os.environ.setdefault(
|
66
|
+
os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning")
|
70
67
|
|
71
68
|
if is_production():
|
72
69
|
print("🚀 Production mode enabled - optimized for minimal warnings")
|
@@ -0,0 +1,334 @@
|
|
1
|
+
# progress_monitor.py
|
2
|
+
"""
|
3
|
+
Real-time progress monitoring for the Rust Crate Pipeline (CLI-only).
|
4
|
+
|
5
|
+
This module provides:
|
6
|
+
- Live progress bars with ETA
|
7
|
+
- Real-time statistics and metrics
|
8
|
+
- Status printouts
|
9
|
+
- Performance monitoring
|
10
|
+
- Error tracking and reporting
|
11
|
+
- Status JSON file for external tools/scripts
|
12
|
+
"""
|
13
|
+
|
14
|
+
import time
|
15
|
+
import threading
|
16
|
+
import json
|
17
|
+
import os
|
18
|
+
from datetime import datetime, timedelta
|
19
|
+
from typing import Dict, List, Optional, Any, Union
|
20
|
+
from dataclasses import dataclass, field
|
21
|
+
from collections import deque
|
22
|
+
import logging
|
23
|
+
|
24
|
+
try:
|
25
|
+
from tqdm import tqdm
|
26
|
+
TQDM_AVAILABLE = True
|
27
|
+
except ImportError:
|
28
|
+
TQDM_AVAILABLE = False
|
29
|
+
|
30
|
+
try:
|
31
|
+
import psutil
|
32
|
+
PSUTIL_AVAILABLE = True
|
33
|
+
except ImportError:
|
34
|
+
PSUTIL_AVAILABLE = False
|
35
|
+
|
36
|
+
|
37
|
+
@dataclass
class PipelineMetrics:
    """Real-time pipeline metrics and statistics.

    A plain data holder: the counters are mutated by whoever owns the
    instance, and the derived figures (percentages, elapsed time, ETA) are
    recomputed from the counters on every property access.
    """

    # Counters. progress_percentage is processed/total and success_rate is
    # successful/processed, so processed_crates is the denominator that any
    # non-successful outcome also has to be counted in.
    total_crates: int = 0
    processed_crates: int = 0
    successful_crates: int = 0
    failed_crates: int = 0
    skipped_crates: int = 0
    current_batch: int = 0
    total_batches: int = 0
    # Wall-clock start (naive local time, compared against datetime.now()).
    start_time: Optional[datetime] = None
    current_operation: str = "Initializing"
    # Mutable containers use default_factory so instances never share them.
    errors: List[Dict[str, Any]] = field(default_factory=list)
    warnings: List[Dict[str, Any]] = field(default_factory=list)
    performance_stats: Dict[str, Any] = field(default_factory=dict)

    @property
    def progress_percentage(self) -> float:
        """Return processed/total as a percentage (0.0 when total is 0)."""
        if self.total_crates == 0:
            return 0.0
        return (self.processed_crates / self.total_crates) * 100

    @property
    def success_rate(self) -> float:
        """Return successful/processed as a percentage (0.0 before any work)."""
        if self.processed_crates == 0:
            return 0.0
        return (self.successful_crates / self.processed_crates) * 100

    @property
    def elapsed_time(self) -> timedelta:
        """Return the time since ``start_time`` (zero if it was never set)."""
        if not self.start_time:
            return timedelta(0)
        return datetime.now() - self.start_time

    @property
    def estimated_completion(self) -> Optional[datetime]:
        """Estimate the completion time by linear extrapolation.

        Returns:
            ``None`` until at least one crate is processed and ``start_time``
            is set; otherwise now + (average time per processed crate) *
            (crates remaining).
        """
        if self.processed_crates == 0 or not self.start_time:
            return None

        # One clock snapshot keeps the elapsed time and the returned ETA
        # consistent with each other.
        now = datetime.now()
        avg_time_per_crate = (now - self.start_time) / self.processed_crates
        # Clamp at zero: if more crates were processed than announced, the
        # remaining work is "none", not a negative span that would put the
        # ETA in the past.
        remaining_crates = max(self.total_crates - self.processed_crates, 0)

        return now + avg_time_per_crate * remaining_crates
|
85
|
+
|
86
|
+
|
87
|
+
class ProgressMonitor:
    """Real-time progress monitoring with live dashboard.

    Tracks crate/batch progress in a ``PipelineMetrics`` instance and rewrites
    ``<output_dir>/pipeline_status.json`` after every state change.

    The public mutators and ``get_status_summary`` serialize on one
    non-reentrant ``threading.Lock``, so they must not call each other while
    the lock is held. ``print_status`` reads ``performance_stats`` outside
    that lock.
    """

    def __init__(self, total_crates: int, output_dir: str = "output"):
        """Create the monitor, ensure ``output_dir`` exists, write first status.

        Args:
            total_crates: Value stored as ``metrics.total_crates``.
            output_dir: Directory that receives ``pipeline_status.json``.

        Raises:
            OSError: If ``output_dir`` cannot be created.
        """
        self.metrics = PipelineMetrics(total_crates=total_crates)
        self.output_dir = output_dir
        self.logger = logging.getLogger(__name__)

        # Performance tracking
        self.crate_times: deque = deque(maxlen=100)  # Last 100 crate processing times
        self.batch_times: deque = deque(maxlen=50)  # Last 50 batch processing times

        # Status tracking
        self.current_crate: Optional[str] = None
        self.current_operation: str = "Initializing"
        self.status_file = os.path.join(output_dir, "pipeline_status.json")

        # Thread safety
        self._lock = threading.Lock()

        # The directory has to exist before the first status write below;
        # otherwise that write fails (and is only logged) on a fresh run.
        os.makedirs(output_dir, exist_ok=True)

        # Initialize
        self.metrics.start_time = datetime.now()
        self._save_status()

    def start_crate(self, crate_name: str) -> None:
        """Mark the start of processing a crate."""
        with self._lock:
            self.current_crate = crate_name
            self.current_operation = f"Processing {crate_name}"
            self.metrics.current_operation = self.current_operation
            self._save_status()

    def complete_crate(self, crate_name: str, success: bool = True,
                       processing_time: Optional[float] = None) -> None:
        """Mark the completion of processing a crate.

        Args:
            crate_name: Accepted for interface symmetry; not used here.
            success: Chooses between the successful and failed counters.
            processing_time: Seconds spent on the crate; ``None`` means
                "not measured" and is left out of the timing window.
        """
        with self._lock:
            self.metrics.processed_crates += 1

            if success:
                self.metrics.successful_crates += 1
            else:
                self.metrics.failed_crates += 1

            # `is not None` so a measured 0.0 still counts as a sample.
            if processing_time is not None:
                self.crate_times.append(processing_time)

            self.current_crate = None
            self.current_operation = "Waiting for next crate"
            self.metrics.current_operation = self.current_operation

            # Update performance stats
            self._update_performance_stats()
            self._save_status()

    def skip_crate(self, crate_name: str, reason: str = "Unknown") -> None:
        """Mark a crate as skipped.

        The crate is counted as processed and skipped, and a warning entry
        carrying ``reason`` is recorded.
        """
        with self._lock:
            self.metrics.processed_crates += 1
            self.metrics.skipped_crates += 1

            self.metrics.warnings.append({
                "crate": crate_name,
                "reason": reason,
                "timestamp": datetime.now().isoformat(),
            })

            self._save_status()

    def start_batch(self, batch_num: int, batch_size: int) -> None:
        """Mark the start of processing a batch.

        ``batch_size`` is accepted but not used by this method.
        """
        with self._lock:
            self.metrics.current_batch = batch_num
            self.current_operation = f"Processing batch {batch_num}"
            self.metrics.current_operation = self.current_operation
            self._save_status()

    def complete_batch(self, batch_num: int, processing_time: Optional[float] = None) -> None:
        """Mark the completion of processing a batch.

        Args:
            batch_num: Accepted for interface symmetry; not used here.
            processing_time: Seconds spent on the batch; ``None`` means
                "not measured".
        """
        with self._lock:
            if processing_time is not None:
                self.batch_times.append(processing_time)

            self.current_operation = "Batch completed, preparing next batch"
            self.metrics.current_operation = self.current_operation

            # Refresh here too, so the batch figures in the status file
            # reflect this batch instead of waiting for the next crate.
            self._update_performance_stats()
            self._save_status()

    def add_error(self, crate_name: str, error: str, error_type: str = "Processing") -> None:
        """Add an error to the metrics."""
        with self._lock:
            self.metrics.errors.append({
                "crate": crate_name,
                "error": error,
                "type": error_type,
                "timestamp": datetime.now().isoformat(),
            })
            self._save_status()

    def add_warning(self, crate_name: str, warning: str) -> None:
        """Add a warning to the metrics."""
        with self._lock:
            self.metrics.warnings.append({
                "crate": crate_name,
                "warning": warning,
                "timestamp": datetime.now().isoformat(),
            })
            self._save_status()

    def update_operation(self, operation: str) -> None:
        """Update the current operation description."""
        with self._lock:
            self.current_operation = operation
            self.metrics.current_operation = operation
            self._save_status()

    def _update_performance_stats(self) -> None:
        """Recompute timing statistics and, if psutil is present, system stats.

        Callers in this class invoke it with ``self._lock`` held.
        """
        if self.crate_times:
            total_crate_time = sum(self.crate_times)
            sample_count = len(self.crate_times)
            self.metrics.performance_stats.update({
                "avg_crate_time": total_crate_time / sample_count,
                "min_crate_time": min(self.crate_times),
                "max_crate_time": max(self.crate_times),
                # A window totalling zero seconds has no meaningful rate;
                # report 0.0 rather than dividing by zero.
                "crates_per_minute": (
                    sample_count / (total_crate_time / 60)
                    if total_crate_time > 0
                    else 0.0
                ),
            })

        if self.batch_times:
            self.metrics.performance_stats.update({
                "avg_batch_time": sum(self.batch_times) / len(self.batch_times),
                "min_batch_time": min(self.batch_times),
                "max_batch_time": max(self.batch_times),
            })

        # System stats if available
        if PSUTIL_AVAILABLE:
            try:
                cpu_percent = psutil.cpu_percent()
                memory = psutil.virtual_memory()
                disk = psutil.disk_usage(self.output_dir)

                self.metrics.performance_stats.update({
                    "system_cpu_percent": cpu_percent,
                    "system_memory_percent": memory.percent,
                    "system_disk_percent": disk.percent,
                    "system_memory_available": memory.available,
                    "system_disk_free": disk.free,
                })
            except Exception as e:
                # Best effort: system stats are optional extras.
                self.logger.warning("Failed to get system stats: %s", e)

    def _save_status(self) -> None:
        """Save current status to file.

        Best effort: any failure is logged and swallowed so that status
        reporting can never abort the pipeline itself.
        """
        try:
            # Evaluate the ETA once so the None-check and the formatted
            # value come from the same computation.
            eta = self.metrics.estimated_completion
            started = self.metrics.start_time
            status_data = {
                "metrics": {
                    "total_crates": self.metrics.total_crates,
                    "processed_crates": self.metrics.processed_crates,
                    "successful_crates": self.metrics.successful_crates,
                    "failed_crates": self.metrics.failed_crates,
                    "skipped_crates": self.metrics.skipped_crates,
                    "progress_percentage": self.metrics.progress_percentage,
                    "success_rate": self.metrics.success_rate,
                    "current_batch": self.metrics.current_batch,
                    "total_batches": self.metrics.total_batches,
                    "start_time": started.isoformat() if started else None,
                    "elapsed_time": str(self.metrics.elapsed_time),
                    "estimated_completion": eta.isoformat() if eta else None,
                    "current_operation": self.metrics.current_operation,
                },
                "current_crate": self.current_crate,
                "performance_stats": self.metrics.performance_stats,
                "errors": self.metrics.errors[-10:],  # Last 10 errors
                "warnings": self.metrics.warnings[-10:],  # Last 10 warnings
                "last_updated": datetime.now().isoformat(),
            }

            with open(self.status_file, 'w', encoding="utf-8") as f:
                json.dump(status_data, f, indent=2)

        except Exception as e:
            self.logger.error("Failed to save status: %s", e)

    def get_status_summary(self) -> Dict[str, Any]:
        """Get a summary of current status as display-ready values."""
        with self._lock:
            eta = self.metrics.estimated_completion
            return {
                "progress": f"{self.metrics.progress_percentage:.1f}%",
                "processed": f"{self.metrics.processed_crates}/{self.metrics.total_crates}",
                "success_rate": f"{self.metrics.success_rate:.1f}%",
                "elapsed_time": str(self.metrics.elapsed_time),
                "estimated_completion": eta.isoformat() if eta else None,
                "current_operation": self.current_operation,
                "current_crate": self.current_crate,
                "errors_count": len(self.metrics.errors),
                "warnings_count": len(self.metrics.warnings),
            }

    def print_status(self) -> None:
        """Print current status to console."""
        summary = self.get_status_summary()

        print("\n" + "="*80)
        print("🚀 RUST CRATE PIPELINE - REAL-TIME STATUS")
        print("="*80)
        print(f"📊 Progress: {summary['progress']} ({summary['processed']} crates)")
        print(f"✅ Success Rate: {summary['success_rate']}")
        print(f"⏱️ Elapsed Time: {summary['elapsed_time']}")
        if summary['estimated_completion']:
            print(f"🎯 Estimated Completion: {summary['estimated_completion']}")
        print(f"🔄 Current Operation: {summary['current_operation']}")
        if summary['current_crate']:
            print(f"📦 Current Crate: {summary['current_crate']}")
        print(f"❌ Errors: {summary['errors_count']}")
        print(f"⚠️ Warnings: {summary['warnings_count']}")

        # Performance stats
        if self.metrics.performance_stats:
            stats = self.metrics.performance_stats
            if 'avg_crate_time' in stats:
                print(f"⚡ Avg Crate Time: {stats['avg_crate_time']:.2f}s")
            if 'crates_per_minute' in stats:
                print(f"🚀 Processing Rate: {stats['crates_per_minute']:.1f} crates/min")
            if 'system_cpu_percent' in stats:
                print(f"💻 System CPU: {stats['system_cpu_percent']:.1f}%")
            if 'system_memory_percent' in stats:
                print(f"🧠 System Memory: {stats['system_memory_percent']:.1f}%")

        print("="*80)

    def create_progress_bar(self, desc: str = "Processing crates") -> Optional[Any]:
        """Create a progress bar if tqdm is available.

        Returns:
            A ``tqdm`` bar sized to ``metrics.total_crates``, or ``None`` when
            tqdm could not be imported.
        """
        if not TQDM_AVAILABLE:
            return None

        return tqdm(
            total=self.metrics.total_crates,
            desc=desc,
            unit="crate",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
        )
|
328
|
+
|
329
|
+
|
330
|
+
def create_monitor(total_crates: int, output_dir: str = "output") -> ProgressMonitor:
    """Build a CLI-only ``ProgressMonitor`` and announce that it is active.

    The monitor is constructed before the confirmation line is printed, so
    the message only appears once construction has succeeded.
    """
    progress_monitor = ProgressMonitor(total_crates, output_dir=output_dir)
    print("✅ Real-time CLI progress monitoring enabled")
    return progress_monitor
|
@@ -0,0 +1,13 @@
|
|
1
|
+
"""
|
2
|
+
Unified Scraping Module
|
3
|
+
|
4
|
+
This module provides a unified interface for all web scraping operations,
|
5
|
+
consolidating Crawl4AI integration and other scraping capabilities.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .unified_scraper import UnifiedScraper, ScrapingResult
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"UnifiedScraper",
|
12
|
+
"ScrapingResult",
|
13
|
+
]
|