rust-crate-pipeline 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -7,66 +7,63 @@ and improve the user experience in production environments.
7
7
 
8
8
  import logging
9
9
  import os
10
+ from typing import Any
10
11
 
11
12
  # Production logging configuration
12
13
 
13
14
 
14
def configure_production_logging() -> None:
    """Lower the verbosity of noisy loggers for production runs.

    The ``requests``, ``urllib3`` and ``requests_cache`` loggers are always
    set to WARNING. When the ``PRODUCTION`` environment variable equals
    ``"true"`` (case-insensitive), the root logger is additionally set to
    WARNING and the ``rust_crate_pipeline`` logger to INFO.
    """
    # basicConfig is intentionally not called here - main.py handles it.
    for noisy_name in ("requests", "urllib3", "requests_cache"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

    production_flag = os.getenv("PRODUCTION", "false").lower()
    if production_flag != "true":
        return

    # Production: be even quieter overall, but keep the pipeline's own INFO.
    logging.getLogger().setLevel(logging.WARNING)
    logging.getLogger("rust_crate_pipeline").setLevel(logging.INFO)
27
28
 
28
29
 
29
30
  # Production-optimized settings
30
# Defaults applied when the pipeline runs in production mode.
PRODUCTION_SETTINGS: "dict[str, Any]" = dict(
    # Reduced retries to minimize warnings
    max_retries=2,
    validation_retries=2,
    # GitHub API management
    github_rate_limit_threshold=100,
    github_critical_threshold=50,
    # LLM settings
    llm_timeout=30,
    llm_max_attempts=2,
    # Logging preferences
    quiet_mode=True,
    log_level="INFO",
    # Performance settings
    batch_size=10,
    checkpoint_interval=10,
    cache_ttl=3600,
)
52
49
 
53
50
 
54
def get_production_config() -> "dict[str, Any]":
    """Return a shallow copy of ``PRODUCTION_SETTINGS``.

    A new dict is built on every call, so adding or replacing keys on the
    result does not affect the module-level settings.
    """
    return dict(PRODUCTION_SETTINGS)
57
54
 
58
55
 
59
def is_production() -> bool:
    """Report whether the ``PRODUCTION`` environment variable is ``"true"``.

    The comparison is case-insensitive; an unset variable or any other value
    yields ``False``.
    """
    raw_flag = os.environ.get("PRODUCTION", "false")
    return raw_flag.lower() == "true"
62
59
 
63
60
 
64
- def setup_production_environment():
61
+ def setup_production_environment() -> "dict[str, Any]":
65
62
  """Set up the complete production environment"""
66
63
  configure_production_logging()
67
64
 
68
65
  # Set environment variables for quieter operation
69
- os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
66
+ os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning")
70
67
 
71
68
  if is_production():
72
69
  print("🚀 Production mode enabled - optimized for minimal warnings")
@@ -0,0 +1,334 @@
1
+ # progress_monitor.py
2
+ """
3
+ Real-time progress monitoring for the Rust Crate Pipeline (CLI-only).
4
+
5
+ This module provides:
6
+ - Live progress bars with ETA
7
+ - Real-time statistics and metrics
8
+ - Status printouts
9
+ - Performance monitoring
10
+ - Error tracking and reporting
11
+ - Status JSON file for external tools/scripts
12
+ """
13
+
14
+ import time
15
+ import threading
16
+ import json
17
+ import os
18
+ from datetime import datetime, timedelta
19
+ from typing import Dict, List, Optional, Any, Union
20
+ from dataclasses import dataclass, field
21
+ from collections import deque
22
+ import logging
23
+
24
+ try:
25
+ from tqdm import tqdm
26
+ TQDM_AVAILABLE = True
27
+ except ImportError:
28
+ TQDM_AVAILABLE = False
29
+
30
+ try:
31
+ import psutil
32
+ PSUTIL_AVAILABLE = True
33
+ except ImportError:
34
+ PSUTIL_AVAILABLE = False
35
+
36
+
37
@dataclass
class PipelineMetrics:
    """Real-time pipeline metrics and statistics.

    Holds raw counters plus derived, read-only properties (progress
    percentage, success rate, elapsed time and an estimated completion time).
    """

    # Crate counters.
    total_crates: int = 0
    processed_crates: int = 0
    successful_crates: int = 0
    failed_crates: int = 0
    skipped_crates: int = 0
    # Batch counters.
    current_batch: int = 0
    total_batches: int = 0
    # Wall-clock start of the run; None until the run has started.
    start_time: Optional[datetime] = None
    # Human-readable description of what is currently happening.
    current_operation: str = "Initializing"
    # Accumulated error / warning records and free-form performance figures.
    errors: List[Dict[str, Any]] = field(default_factory=list)
    warnings: List[Dict[str, Any]] = field(default_factory=list)
    performance_stats: Dict[str, Any] = field(default_factory=dict)

    @property
    def progress_percentage(self) -> float:
        """Return processed crates as a percentage of the total (0.0 if total is 0)."""
        if self.total_crates == 0:
            return 0.0
        return (self.processed_crates / self.total_crates) * 100

    @property
    def success_rate(self) -> float:
        """Return successful crates as a percentage of processed (0.0 if none processed)."""
        if self.processed_crates == 0:
            return 0.0
        return (self.successful_crates / self.processed_crates) * 100

    @property
    def elapsed_time(self) -> timedelta:
        """Return the time since ``start_time``, or a zero delta if not started."""
        if not self.start_time:
            return timedelta(0)
        return datetime.now() - self.start_time

    @property
    def estimated_completion(self) -> Optional[datetime]:
        """Estimate the completion time from the average time per processed crate.

        Returns:
            ``None`` when nothing has been processed yet or the run has not
            started; otherwise ``now + average_time_per_crate * remaining``.
        """
        if self.processed_crates == 0 or not self.start_time:
            return None

        avg_time_per_crate = self.elapsed_time / self.processed_crates
        # Clamp at zero: if more crates were processed than announced, the
        # remaining work is "none", not a negative amount that would place
        # the estimate in the past.
        remaining_crates = max(self.total_crates - self.processed_crates, 0)
        estimated_remaining = avg_time_per_crate * remaining_crates

        return datetime.now() + estimated_remaining
85
+
86
+
87
class ProgressMonitor:
    """Real-time progress monitoring with live dashboard.

    Tracks crate and batch progress in a ``PipelineMetrics`` instance, keeps
    bounded windows of recent processing times, and rewrites
    ``pipeline_status.json`` inside ``output_dir`` after every state change.
    The state-changing methods and ``get_status_summary`` run under an
    internal ``threading.Lock``.
    """

    def __init__(self, total_crates: int, output_dir: str = "output"):
        """Initialise counters, create ``output_dir`` and write the first status file.

        Args:
            total_crates: Number of crates the run is expected to process.
            output_dir: Directory that receives ``pipeline_status.json``.
        """
        self.metrics = PipelineMetrics(total_crates=total_crates)
        self.output_dir = output_dir
        self.logger = logging.getLogger(__name__)

        # Performance tracking
        self.crate_times: deque = deque(maxlen=100)  # Last 100 crate processing times
        self.batch_times: deque = deque(maxlen=50)  # Last 50 batch processing times

        # Status tracking
        self.current_crate: Optional[str] = None
        self.current_operation: str = "Initializing"
        self.status_file = os.path.join(output_dir, "pipeline_status.json")

        # Thread safety
        self._lock = threading.Lock()

        # The directory has to exist before the first status write; creating
        # it afterwards made the initial _save_status() fail on a fresh run.
        os.makedirs(output_dir, exist_ok=True)

        self.metrics.start_time = datetime.now()
        self._save_status()

    def start_crate(self, crate_name: str) -> None:
        """Mark the start of processing a crate."""
        with self._lock:
            self.current_crate = crate_name
            self.current_operation = f"Processing {crate_name}"
            self.metrics.current_operation = self.current_operation
            self._save_status()

    def complete_crate(self, crate_name: str, success: bool = True,
                       processing_time: Optional[float] = None) -> None:
        """Mark the completion of processing a crate.

        Args:
            crate_name: Name of the finished crate (not recorded here).
            success: Whether the crate counts as successful or failed.
            processing_time: Seconds spent on the crate; ``None`` means
                "not measured" and is left out of the timing window.
        """
        with self._lock:
            self.metrics.processed_crates += 1

            if success:
                self.metrics.successful_crates += 1
            else:
                self.metrics.failed_crates += 1

            # Compare against None so a measured 0.0 is still recorded.
            if processing_time is not None:
                self.crate_times.append(processing_time)

            self.current_crate = None
            self.current_operation = "Waiting for next crate"
            self.metrics.current_operation = self.current_operation

            # Update performance stats
            self._update_performance_stats()
            self._save_status()

    def skip_crate(self, crate_name: str, reason: str = "Unknown") -> None:
        """Mark a crate as skipped and record the reason as a warning."""
        with self._lock:
            self.metrics.processed_crates += 1
            self.metrics.skipped_crates += 1

            self.metrics.warnings.append({
                "crate": crate_name,
                "reason": reason,
                "timestamp": datetime.now().isoformat(),
            })

            self._save_status()

    def start_batch(self, batch_num: int, batch_size: int) -> None:
        """Mark the start of processing a batch.

        ``batch_size`` is accepted for interface compatibility but is not
        used by this method.
        """
        with self._lock:
            self.metrics.current_batch = batch_num
            self.current_operation = f"Processing batch {batch_num}"
            self.metrics.current_operation = self.current_operation
            self._save_status()

    def complete_batch(self, batch_num: int, processing_time: Optional[float] = None) -> None:
        """Mark the completion of processing a batch.

        ``processing_time`` (seconds) is added to the batch timing window
        unless it is ``None``.
        """
        with self._lock:
            if processing_time is not None:
                self.batch_times.append(processing_time)

            self.current_operation = "Batch completed, preparing next batch"
            self.metrics.current_operation = self.current_operation
            self._save_status()

    def add_error(self, crate_name: str, error: str, error_type: str = "Processing") -> None:
        """Add a timestamped error record to the metrics."""
        with self._lock:
            self.metrics.errors.append({
                "crate": crate_name,
                "error": error,
                "type": error_type,
                "timestamp": datetime.now().isoformat(),
            })
            self._save_status()

    def add_warning(self, crate_name: str, warning: str) -> None:
        """Add a timestamped warning record to the metrics."""
        with self._lock:
            self.metrics.warnings.append({
                "crate": crate_name,
                "warning": warning,
                "timestamp": datetime.now().isoformat(),
            })
            self._save_status()

    def update_operation(self, operation: str) -> None:
        """Update the current operation description."""
        with self._lock:
            self.current_operation = operation
            self.metrics.current_operation = operation
            self._save_status()

    def _update_performance_stats(self) -> None:
        """Recompute timing statistics and, if psutil is present, system statistics."""
        stats = self.metrics.performance_stats

        if self.crate_times:
            total_crate_time = sum(self.crate_times)
            stats.update({
                "avg_crate_time": total_crate_time / len(self.crate_times),
                "min_crate_time": min(self.crate_times),
                "max_crate_time": max(self.crate_times),
            })
            if total_crate_time > 0:
                stats["crates_per_minute"] = len(self.crate_times) / (total_crate_time / 60)
            else:
                # A rate is undefined for a zero total; drop any stale value
                # instead of dividing by zero.
                stats.pop("crates_per_minute", None)

        if self.batch_times:
            stats.update({
                "avg_batch_time": sum(self.batch_times) / len(self.batch_times),
                "min_batch_time": min(self.batch_times),
                "max_batch_time": max(self.batch_times),
            })

        # System stats if available
        if PSUTIL_AVAILABLE:
            try:
                cpu_percent = psutil.cpu_percent()
                memory = psutil.virtual_memory()
                disk = psutil.disk_usage(self.output_dir)

                stats.update({
                    "system_cpu_percent": cpu_percent,
                    "system_memory_percent": memory.percent,
                    "system_disk_percent": disk.percent,
                    "system_memory_available": memory.available,
                    "system_disk_free": disk.free,
                })
            except Exception as e:
                # System statistics are optional; never fail the run for them.
                self.logger.warning("Failed to get system stats: %s", e)

    def _save_status(self) -> None:
        """Write the current status snapshot to ``self.status_file`` as JSON.

        Failures are logged and swallowed so that status reporting cannot
        interrupt the pipeline.
        """
        try:
            metrics = self.metrics
            started = metrics.start_time
            # Read the estimate once: the property calls datetime.now() on
            # every access, so a single read keeps the snapshot consistent.
            estimate = metrics.estimated_completion

            status_data = {
                "metrics": {
                    "total_crates": metrics.total_crates,
                    "processed_crates": metrics.processed_crates,
                    "successful_crates": metrics.successful_crates,
                    "failed_crates": metrics.failed_crates,
                    "skipped_crates": metrics.skipped_crates,
                    "progress_percentage": metrics.progress_percentage,
                    "success_rate": metrics.success_rate,
                    "current_batch": metrics.current_batch,
                    "total_batches": metrics.total_batches,
                    "start_time": started.isoformat() if started else None,
                    "elapsed_time": str(metrics.elapsed_time),
                    "estimated_completion": estimate.isoformat() if estimate else None,
                    "current_operation": metrics.current_operation,
                },
                "current_crate": self.current_crate,
                "performance_stats": metrics.performance_stats,
                "errors": metrics.errors[-10:],  # Last 10 errors
                "warnings": metrics.warnings[-10:],  # Last 10 warnings
                "last_updated": datetime.now().isoformat(),
            }

            with open(self.status_file, "w", encoding="utf-8") as f:
                json.dump(status_data, f, indent=2)

        except Exception as e:
            self.logger.error("Failed to save status: %s", e)

    def get_status_summary(self) -> Dict[str, Any]:
        """Return a display-ready summary of the current status.

        Percentages and the processed/total pair are pre-formatted strings;
        ``estimated_completion`` is an ISO timestamp or ``None``.
        """
        with self._lock:
            estimate = self.metrics.estimated_completion
            return {
                "progress": f"{self.metrics.progress_percentage:.1f}%",
                "processed": f"{self.metrics.processed_crates}/{self.metrics.total_crates}",
                "success_rate": f"{self.metrics.success_rate:.1f}%",
                "elapsed_time": str(self.metrics.elapsed_time),
                "estimated_completion": estimate.isoformat() if estimate else None,
                "current_operation": self.current_operation,
                "current_crate": self.current_crate,
                "errors_count": len(self.metrics.errors),
                "warnings_count": len(self.metrics.warnings),
            }

    def print_status(self) -> None:
        """Print current status to console."""
        summary = self.get_status_summary()

        print("\n" + "="*80)
        print("🚀 RUST CRATE PIPELINE - REAL-TIME STATUS")
        print("="*80)
        print(f"📊 Progress: {summary['progress']} ({summary['processed']} crates)")
        print(f"✅ Success Rate: {summary['success_rate']}")
        print(f"⏱️ Elapsed Time: {summary['elapsed_time']}")
        if summary['estimated_completion']:
            print(f"🎯 Estimated Completion: {summary['estimated_completion']}")
        print(f"🔄 Current Operation: {summary['current_operation']}")
        if summary['current_crate']:
            print(f"📦 Current Crate: {summary['current_crate']}")
        print(f"❌ Errors: {summary['errors_count']}")
        print(f"⚠️ Warnings: {summary['warnings_count']}")

        # Performance stats
        if self.metrics.performance_stats:
            stats = self.metrics.performance_stats
            if 'avg_crate_time' in stats:
                print(f"⚡ Avg Crate Time: {stats['avg_crate_time']:.2f}s")
            if 'crates_per_minute' in stats:
                print(f"🚀 Processing Rate: {stats['crates_per_minute']:.1f} crates/min")
            if 'system_cpu_percent' in stats:
                print(f"💻 System CPU: {stats['system_cpu_percent']:.1f}%")
            if 'system_memory_percent' in stats:
                print(f"🧠 System Memory: {stats['system_memory_percent']:.1f}%")

        print("="*80)

    def create_progress_bar(self, desc: str = "Processing crates") -> Optional[Any]:
        """Create a tqdm progress bar sized to ``total_crates``, or ``None`` without tqdm."""
        if not TQDM_AVAILABLE:
            return None

        return tqdm(
            total=self.metrics.total_crates,
            desc=desc,
            unit="crate",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
        )
328
+
329
+
330
def create_monitor(total_crates: int, output_dir: str = "output") -> ProgressMonitor:
    """Build a CLI-only ``ProgressMonitor`` and announce that monitoring is on.

    Args:
        total_crates: Number of crates the run is expected to process.
        output_dir: Directory handed to the monitor for its status file.

    Returns:
        The newly constructed monitor.
    """
    cli_monitor = ProgressMonitor(total_crates, output_dir)
    print("✅ Real-time CLI progress monitoring enabled")
    return cli_monitor
@@ -0,0 +1,13 @@
1
+ """
2
+ Unified Scraping Module
3
+
4
+ This module provides a unified interface for all web scraping operations,
5
+ consolidating Crawl4AI integration and other scraping capabilities.
6
+ """
7
+
8
+ from .unified_scraper import UnifiedScraper, ScrapingResult
9
+
10
+ __all__ = [
11
+ "UnifiedScraper",
12
+ "ScrapingResult",
13
+ ]