rust-crate-pipeline 1.1.1-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
@@ -166,8 +166,7 @@ class LLMEnricher:
166
166
  prompt: str,
167
167
  validation_func: Callable[[str], bool],
168
168
  temp: float = 0.2,
169
- max_tokens: int = 256,
170
- retries: int = 3
169
+ max_tokens: int = 256, retries: int = 2 # Reduced default retries
171
170
  ) -> Optional[str]:
172
171
  """Run LLM with validation and automatic retry on failure"""
173
172
  for attempt in range(retries):
@@ -180,8 +179,11 @@ class LLMEnricher:
180
179
  if result and validation_func(result):
181
180
  return result
182
181
 
183
- # If we get here, validation failed
184
- logging.warning(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with modified parameters.")
182
+ # If we get here, validation failed - use debug level for early attempts
183
+ if attempt == retries - 1:
184
+ logging.warning(f"Final validation attempt failed. Using best available result.")
185
+ else:
186
+ logging.debug(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with modified parameters.")
185
187
 
186
188
  # For the last attempt, simplify the prompt
187
189
  if attempt == retries - 2:
@@ -190,11 +192,11 @@ class LLMEnricher:
190
192
  except Exception as e:
191
193
  logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")
192
194
 
193
- # Backoff before retry
194
- time.sleep(1.5 * (2 ** attempt))
195
+ # Reduced backoff to minimize waiting time
196
+ time.sleep(1.0 + (attempt * 0.5))
195
197
 
196
- # If we exhaust all retries, return None
197
- return None
198
+ # If we exhausted all retries, return the last result even if not perfect
199
+ return result if 'result' in locals() else None
198
200
 
199
201
  def simplify_prompt(self, prompt: str) -> str:
200
202
  """Simplify a prompt by removing examples and reducing context"""
@@ -0,0 +1,102 @@
1
+ # github_token_checker.py
2
+ """
3
+ GitHub Token Checker Module
4
+ Lightweight version of the token checker for integration into the main pipeline.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import requests
10
+ import logging
11
+
12
+ def check_github_token_quick():
13
+ """Quick check if GitHub token is available and valid"""
14
+ token = os.getenv("GITHUB_TOKEN")
15
+
16
+ if not token:
17
+ return False, "GITHUB_TOKEN environment variable not set"
18
+
19
+ if len(token) < 20:
20
+ return False, "GITHUB_TOKEN seems too short - may be invalid"
21
+
22
+ try:
23
+ # Quick API check
24
+ headers = {
25
+ "Accept": "application/vnd.github.v3+json",
26
+ "Authorization": f"token {token}"
27
+ }
28
+
29
+ response = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
30
+
31
+ if response.status_code == 200:
32
+ data = response.json()
33
+ remaining = data["resources"]["core"]["remaining"]
34
+ return True, f"Token valid, {remaining} API calls remaining"
35
+ elif response.status_code == 401:
36
+ return False, "GitHub token is invalid or expired"
37
+ else:
38
+ return False, f"GitHub API returned status code: {response.status_code}"
39
+
40
+ except requests.exceptions.RequestException as e:
41
+ return False, f"Network error checking token: {str(e)}"
42
+ except Exception as e:
43
+ return False, f"Error checking token: {str(e)}"
44
+
45
+ def prompt_for_token_setup():
46
+ """Prompt user to set up GitHub token"""
47
+ print("\n" + "="*60)
48
+ print("🔑 GitHub Token Required")
49
+ print("="*60)
50
+ print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
51
+ print("to access repository information and avoid rate limits.")
52
+ print("\n📋 Quick Setup:")
53
+ print("1. Get token: https://github.com/settings/tokens")
54
+ print("2. Required scopes: public_repo, read:user")
55
+ print("3. Set in environment:")
56
+ print(" export GITHUB_TOKEN=\"your_token_here\"")
57
+ print("\n🔧 Setup Scripts Available:")
58
+ print(" ./setup_github_token.sh (Interactive setup)")
59
+ print(" python3 check_github_token.py (Full verification)")
60
+ print("\n" + "="*60)
61
+
62
+ # Ask if user wants to continue without token (limited functionality)
63
+ response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
64
+
65
+ if response in ['y', 'yes']:
66
+ print("⚠️ Running with limited GitHub API access (60 requests/hour)")
67
+ print(" You may encounter rate limit warnings.")
68
+ return True
69
+ else:
70
+ print("\n🛑 Please set up your GitHub token and try again.")
71
+ return False
72
+
73
+ def check_and_setup_github_token():
74
+ """
75
+ Check GitHub token and prompt for setup if missing.
76
+ Returns True if should continue, False if should exit.
77
+ """
78
+ is_valid, message = check_github_token_quick()
79
+
80
+ if is_valid:
81
+ logging.debug(f"GitHub token check: {message}")
82
+ return True
83
+
84
+ # Token is missing or invalid
85
+ logging.warning(f"GitHub token issue: {message}")
86
+
87
+ # Check if we're in a non-interactive environment
88
+ if not sys.stdin.isatty():
89
+ logging.error("GitHub token not configured and running in non-interactive mode")
90
+ logging.error("Set GITHUB_TOKEN environment variable before running")
91
+ return False
92
+
93
+ # Interactive prompt
94
+ return prompt_for_token_setup()
95
+
96
+ if __name__ == "__main__":
97
+ # Allow running this module directly for testing
98
+ is_valid, message = check_github_token_quick()
99
+ print(f"Token check: {'✅' if is_valid else '❌'} {message}")
100
+
101
+ if not is_valid:
102
+ check_and_setup_github_token()
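The new module exposes a simple `(ok, message)` tuple convention plus an interactive gate. A short usage sketch, assuming the module is importable as `rust_crate_pipeline.github_token_checker`:

```python
import sys

from rust_crate_pipeline.github_token_checker import (
    check_and_setup_github_token,
    check_github_token_quick,
)

# Non-interactive check: returns (bool, human-readable message)
ok, message = check_github_token_quick()
print(f"GitHub token status: {message}")

# Interactive gate used by the pipeline entry point: prompts for setup
# when a TTY is available, returns False when the run should be aborted.
if not check_and_setup_github_token():
    sys.exit(1)
```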
@@ -8,6 +8,8 @@ import argparse
8
8
  from typing import Optional
9
9
  from .config import PipelineConfig
10
10
  from .pipeline import CrateDataPipeline
11
+ from .production_config import setup_production_environment
12
+ from .github_token_checker import check_and_setup_github_token
11
13
 
12
14
  def parse_arguments():
13
15
  """Parse command line arguments"""
@@ -21,6 +23,7 @@ Examples:
21
23
  python -m rust_crate_pipeline --batch-size 5 # Smaller batches
22
24
  python -m rust_crate_pipeline --output-dir ./data # Custom output directory
23
25
  python -m rust_crate_pipeline --log-level DEBUG # Verbose logging
26
+ PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
24
27
  """
25
28
  )
26
29
 
@@ -123,14 +126,31 @@ def check_disk_space():
123
126
  logging.warning("Low disk space! This may affect performance.")
124
127
 
125
128
  def main():
129
+ # Setup production environment first for optimal logging
130
+ prod_config = setup_production_environment()
131
+
126
132
  args = parse_arguments()
127
133
  configure_logging(args.log_level)
128
134
  check_disk_space()
129
135
 
136
+ # Check GitHub token before proceeding
137
+ if not check_and_setup_github_token():
138
+ logging.error("GitHub token setup cancelled or failed. Exiting.")
139
+ sys.exit(1)
140
+
130
141
  try:
131
142
  # Create config from command line arguments
132
143
  config_kwargs = {}
133
144
 
145
+ # Apply production optimizations if available
146
+ if prod_config:
147
+ config_kwargs.update({
148
+ 'max_retries': prod_config.get('max_retries', 3),
149
+ 'batch_size': prod_config.get('batch_size', 10),
150
+ 'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
151
+ 'cache_ttl': prod_config.get('cache_ttl', 3600),
152
+ })
153
+
134
154
  if args.batch_size:
135
155
  config_kwargs['batch_size'] = args.batch_size
136
156
  if args.workers:
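The ordering in the hunk above determines precedence: production defaults are written into `config_kwargs` first, and explicit CLI flags such as `--batch-size` overwrite them afterwards. A compact, illustrative sketch of that merge (field names follow the diff; the workers mapping is an assumption):

```python
def build_config_kwargs(prod_config: dict, args) -> dict:
    """Merge production defaults with CLI overrides (CLI wins)."""
    config_kwargs: dict = {}
    if prod_config:
        config_kwargs.update({
            "max_retries": prod_config.get("max_retries", 3),
            "batch_size": prod_config.get("batch_size", 10),
            "checkpoint_interval": prod_config.get("checkpoint_interval", 10),
            "cache_ttl": prod_config.get("cache_ttl", 3600),
        })
    # CLI flags take precedence because they are applied last
    if getattr(args, "batch_size", None):
        config_kwargs["batch_size"] = args.batch_size
    if getattr(args, "workers", None):
        config_kwargs["n_workers"] = args.workers   # assumed config field name
    return config_kwargs
```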
@@ -0,0 +1,76 @@
1
+ # production_config.py
2
+ """
3
+ Production configuration to reduce runtime warnings and optimize performance.
4
+ This file contains settings that can be imported to minimize verbose logging
5
+ and improve the user experience in production environments.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+
11
+ # Production logging configuration
12
+ def configure_production_logging():
13
+ """Configure logging for production to reduce verbose warnings"""
14
+
15
+ # Set up logging format
16
+ logging.basicConfig(
17
+ level=logging.INFO, # Default to INFO level
18
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
19
+ datefmt='%Y-%m-%d %H:%M:%S'
20
+ )
21
+
22
+ # Set specific loggers to less verbose levels
23
+ logging.getLogger('requests').setLevel(logging.WARNING)
24
+ logging.getLogger('urllib3').setLevel(logging.WARNING)
25
+ logging.getLogger('requests_cache').setLevel(logging.WARNING)
26
+
27
+ # If PRODUCTION environment variable is set, be even quieter
28
+ if os.getenv('PRODUCTION', 'false').lower() == 'true':
29
+ logging.getLogger().setLevel(logging.WARNING)
30
+ logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
31
+
32
+ # Production-optimized settings
33
+ PRODUCTION_SETTINGS = {
34
+ # Reduced retries to minimize warnings
35
+ 'max_retries': 2,
36
+ 'validation_retries': 2,
37
+
38
+ # GitHub API management
39
+ 'github_rate_limit_threshold': 100,
40
+ 'github_critical_threshold': 50,
41
+
42
+ # LLM settings
43
+ 'llm_timeout': 30,
44
+ 'llm_max_attempts': 2,
45
+
46
+ # Logging preferences
47
+ 'quiet_mode': True,
48
+ 'log_level': 'INFO',
49
+
50
+ # Performance settings
51
+ 'batch_size': 10,
52
+ 'checkpoint_interval': 10,
53
+ 'cache_ttl': 3600,
54
+ }
55
+
56
+ def get_production_config():
57
+ """Get production configuration dictionary"""
58
+ return PRODUCTION_SETTINGS.copy()
59
+
60
+ def is_production():
61
+ """Check if running in production mode"""
62
+ return os.getenv('PRODUCTION', 'false').lower() == 'true'
63
+
64
+ def setup_production_environment():
65
+ """Set up the complete production environment"""
66
+ configure_production_logging()
67
+
68
+ # Set environment variables for quieter operation
69
+ os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
70
+
71
+ if is_production():
72
+ print("🚀 Production mode enabled - optimized for minimal warnings")
73
+ return get_production_config()
74
+ else:
75
+ print("🔧 Development mode - full logging enabled")
76
+ return {}
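In practice the module is consumed in a single call at startup, as `main.py` does above; the returned dictionary is empty outside production mode, which is what makes the later `if prod_config:` guard work. A usage sketch, assuming the module path from this release:

```python
import os

from rust_crate_pipeline.production_config import (
    get_production_config,
    is_production,
    setup_production_environment,
)

os.environ["PRODUCTION"] = "true"          # opt in to quieter logging

settings = setup_production_environment()  # configures logging, returns settings dict
assert is_production()
assert settings == get_production_config()
print(settings["max_retries"], settings["llm_timeout"])  # 2 30
```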
@@ -1,9 +1,17 @@
1
1
  """Version information for rust-crate-pipeline."""
2
2
 
3
- __version__ = "1.1.1"
3
+ __version__ = "1.2.0"
4
4
  __version_info__ = tuple(int(x) for x in __version__.split("."))
5
5
 
6
6
  # Version history
7
+ # 1.2.0 - Major release: Production-ready, cleaned codebase
8
+ # - Unified documentation into single comprehensive README
9
+ # - Removed all non-essential development and test files
10
+ # - Optimized for PyPI distribution and Docker deployment
11
+ # - Enhanced GitHub token integration and setup
12
+ # 1.1.2 - Production release: Cleaned up non-essential files
13
+ # - Unified documentation into single README
14
+ # - Optimized for PyPI distribution
7
15
  # 1.1.1 - Bug fix: Added missing python-dateutil dependency
8
16
  # - Fixed relativedelta import error
9
17
  # 1.1.0 - Updated author and contact information
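Because `__version_info__` is derived by splitting the version string, it compares correctly as a tuple, which is convenient for feature gating. A tiny example:

```python
__version__ = "1.2.0"
__version_info__ = tuple(int(x) for x in __version__.split("."))

assert __version_info__ == (1, 2, 0)
# Tuple comparison orders versions correctly, unlike string comparison
assert __version_info__ > (1, 1, 1)
assert "1.10.0" < "1.2.0"          # string comparison is misleading
assert (1, 10, 0) > (1, 2, 0)      # tuple comparison is not
```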
@@ -0,0 +1,573 @@
1
+ Metadata-Version: 2.4
2
+ Name: rust-crate-pipeline
3
+ Version: 1.2.0
4
+ Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
5
+ Home-page: https://github.com/DaveTmire85/SigilDERG-Data_Production
6
+ Author: SuperUser666-Sigil
7
+ Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
8
+ License-Expression: MIT
9
+ Project-URL: Homepage, https://github.com/DaveTmire85/SigilDERG-Data_Production
10
+ Project-URL: Documentation, https://github.com/DaveTmire85/SigilDERG-Data_Production#readme
11
+ Project-URL: Repository, https://github.com/DaveTmire85/SigilDERG-Data_Production
12
+ Project-URL: Bug Tracker, https://github.com/DaveTmire85/SigilDERG-Data_Production/issues
13
+ Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Software Development :: Build Tools
25
+ Classifier: Topic :: Software Development :: Quality Assurance
26
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
27
+ Requires-Python: >=3.8
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: requests>=2.28.0
31
+ Requires-Dist: requests-cache>=1.0.0
32
+ Requires-Dist: beautifulsoup4>=4.11.0
33
+ Requires-Dist: tqdm>=4.64.0
34
+ Requires-Dist: llama-cpp-python>=0.2.0
35
+ Requires-Dist: tiktoken>=0.5.0
36
+ Requires-Dist: psutil>=5.9.0
37
+ Requires-Dist: python-dateutil>=2.8.0
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
40
+ Requires-Dist: black>=22.0.0; extra == "dev"
41
+ Requires-Dist: isort>=5.10.0; extra == "dev"
42
+ Provides-Extra: advanced
43
+ Requires-Dist: radon>=6.0.0; extra == "advanced"
44
+ Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
45
+ Dynamic: author
46
+ Dynamic: home-page
47
+ Dynamic: license-file
48
+ Dynamic: requires-python
49
+
50
+ # Rust Crate Pipeline
51
+
52
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
53
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
54
+ [![PyPI Ready](https://img.shields.io/badge/PyPI-Ready-green.svg)](https://pypi.org/)
55
+ [![Docker Ready](https://img.shields.io/badge/Docker-Ready-blue.svg)](https://docker.com/)
56
+
57
+ A production-ready pipeline for comprehensive Rust crate analysis, featuring AI-powered insights, dependency mapping, and automated data enrichment. Designed for researchers, developers, and data scientists studying the Rust ecosystem.
58
+
59
+ ## 🚀 Quick Start
60
+
61
+ ### 1. Installation
62
+
63
+ #### From PyPI (Recommended)
64
+ ```bash
65
+ pip install rust-crate-pipeline
66
+ ```
67
+
68
+ #### From Source
69
+ ```bash
70
+ git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
71
+ cd SigilDERG-Data_Production
72
+ pip install -e .
73
+ ```
74
+
75
+ #### Development Installation
76
+ ```bash
77
+ git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
78
+ cd SigilDERG-Data_Production
79
+ pip install -e ".[dev]"
80
+ ```
81
+
82
+ ### 2. GitHub Token Setup
83
+
84
+ The pipeline requires a GitHub Personal Access Token for optimal performance:
85
+
86
+ ```bash
87
+ # Interactive setup (Linux/Unix)
88
+ chmod +x setup_github_token.sh
89
+ ./setup_github_token.sh
90
+
91
+ # Manual setup
92
+ export GITHUB_TOKEN="your_token_here"
93
+ echo 'export GITHUB_TOKEN="your_token_here"' >> ~/.bashrc
94
+
95
+ # Verify setup
96
+ python3 check_github_token.py
97
+ ```
98
+
99
+ **Get your token at**: [GitHub Settings](https://github.com/settings/tokens)
100
+ **Required scopes**: `public_repo`, `read:user`
101
+
102
+ ### 3. Basic Usage
103
+
104
+ ```bash
105
+ # Standard mode
106
+ python3 -m rust_crate_pipeline
107
+
108
+ # Production mode (reduced warnings, optimized settings)
109
+ python3 run_production.py
110
+
111
+ # Process only 20 crates for testing
112
+ python3 -m rust_crate_pipeline --limit 20
113
+
114
+ # Skip AI processing for faster metadata-only collection
115
+ python3 -m rust_crate_pipeline --skip-ai --limit 50
116
+ ```
117
+
118
+ ### 4. Advanced Usage
119
+
120
+ ```bash
121
+ # Custom configuration
122
+ python3 -m rust_crate_pipeline \
123
+ --limit 100 \
124
+ --batch-size 5 \
125
+ --workers 2 \
126
+ --log-level DEBUG \
127
+ --output-dir ./results
128
+
129
+ # Process specific crates
130
+ python3 -m rust_crate_pipeline \
131
+ --crate-list serde tokio actix-web reqwest \
132
+ --output-dir ./specific_crates
133
+
134
+ # Use custom model and config
135
+ python3 -m rust_crate_pipeline \
136
+ --model-path ./my-model.gguf \
137
+ --config-file ./custom_config.json
138
+ ```
139
+
140
+ ## 🎯 Features
141
+
142
+ ### 📊 Data Collection & Analysis
143
+
144
+ - **Multi-source metadata**: crates.io, GitHub, lib.rs integration
145
+ - **Dependency mapping**: Complete dependency graphs and analysis
146
+ - **Code extraction**: Automatic Rust code example extraction
147
+ - **Security scanning**: Vulnerability and security pattern analysis
148
+ - **Performance metrics**: Lines of code, complexity, API surface analysis
149
+
150
+ ### 🤖 AI-Powered Enrichment
151
+
152
+ - **Smart categorization**: Automatic crate classification (Web, ML, Database, etc.)
153
+ - **Feature summarization**: AI-generated explanations and insights
154
+ - **Content optimization**: Intelligent README section preservation
155
+ - **Factual pairs**: Training data generation for fact verification
156
+
157
+ ### ⚡ Production Features
158
+
159
+ - **Automatic GitHub token detection**: Seamless setup and validation
160
+ - **Smart rate limiting**: Respects GitHub API limits with intelligent backoff
161
+ - **Robust error handling**: Graceful degradation and comprehensive logging
162
+ - **Progress checkpointing**: Automatic saving for long-running processes
163
+ - **Docker ready**: Full container support with optimized configurations
164
+
165
+ ## 💻 System Requirements
166
+
167
+ ### Minimum Requirements
168
+
169
+ - **Python**: 3.8+
170
+ - **Memory**: 4GB RAM
171
+ - **Storage**: 2GB free space
172
+ - **Network**: Stable internet connection
173
+
174
+ ### Recommended Setup
175
+
176
+ - **Python**: 3.10+
177
+ - **Memory**: 8GB+ RAM
178
+ - **Storage**: 10GB+ free space (SSD preferred)
179
+ - **GitHub Token**: For enhanced API access (5000 vs 60 requests/hour)
180
+
181
+ ### Dependencies
182
+
183
+ Core dependencies are automatically installed:
184
+
185
+ ```bash
186
+ requests>=2.28.0
187
+ requests-cache>=0.9.0
188
+ beautifulsoup4>=4.11.0
189
+ tqdm>=4.64.0
190
+ llama-cpp-python>=0.2.0
191
+ tiktoken>=0.4.0
192
+ psutil>=5.9.0
193
+ python-dateutil>=2.8.0
194
+ ```
195
+
196
+ ## ⚙️ Configuration & Usage
197
+
198
+ ### Command Line Options
199
+
200
+ | Argument | Type | Default | Description |
201
+ |----------|------|---------|-------------|
202
+ | `--limit` | int | None | Limit number of crates to process |
203
+ | `--batch-size` | int | 10 | Crates processed per batch |
204
+ | `--workers` | int | 4 | Parallel workers for API requests |
205
+ | `--output-dir` | str | auto | Custom output directory |
206
+ | `--model-path` | str | default | Path to LLM model file |
207
+ | `--max-tokens` | int | 256 | Maximum tokens for LLM generation |
208
+ | `--checkpoint-interval` | int | 10 | Save progress every N crates |
209
+ | `--log-level` | str | INFO | Logging verbosity |
210
+ | `--skip-ai` | flag | False | Skip AI enrichment |
211
+ | `--skip-source-analysis` | flag | False | Skip source code analysis |
212
+ | `--crate-list` | list | None | Specific crates to process |
213
+ | `--config-file` | str | None | JSON configuration file |
214
+
215
+ ### Production Mode
216
+
217
+ Production mode provides optimized settings with reduced warnings:
218
+
219
+ ```bash
220
+ # Using production launcher
221
+ python3 run_production.py [OPTIONS]
222
+
223
+ # Using environment variable
224
+ PRODUCTION=true python3 -m rust_crate_pipeline
225
+
226
+ # Docker production mode
227
+ docker run -e PRODUCTION=true -e GITHUB_TOKEN="token" your-image
228
+ ```
229
+
230
+ **Production optimizations:**
231
+
232
+ - Reduced retry attempts (3→2) to minimize warnings
233
+ - Smart GitHub API rate limiting with proactive pausing
234
+ - Enhanced logging with appropriate levels
235
+ - Optimized timeout and backoff strategies
236
+
237
+ ### Configuration Files
238
+
239
+ Create a JSON configuration file for custom settings:
240
+
241
+ ```json
242
+ {
243
+ "max_retries": 2,
244
+ "batch_size": 10,
245
+ "github_min_remaining": 500,
246
+ "cache_ttl": 7200,
247
+ "model_path": "~/models/your-model.gguf"
248
+ }
249
+ ```
250
+
251
+ Use with: `python3 -m rust_crate_pipeline --config-file config.json`
252
+
253
+ ## 🐳 Docker Deployment
254
+
255
+ ### Using Docker Compose (Recommended)
256
+
257
+ ```bash
258
+ # Set up environment
259
+ echo "GITHUB_TOKEN=your_token_here" > .env
260
+
261
+ # Run with compose
262
+ docker-compose up -d
263
+
264
+ # Monitor logs
265
+ docker-compose logs -f
266
+ ```
267
+
268
+ ### Manual Docker Commands
269
+
270
+ ```bash
271
+ # Build image
272
+ docker build -t rust-crate-pipeline .
273
+
274
+ # Run container
275
+ docker run -e GITHUB_TOKEN="your_token" \
276
+ -e PRODUCTION=true \
277
+ -v $(pwd)/output:/app/output \
278
+ rust-crate-pipeline
279
+
280
+ # Background execution
281
+ docker run -d --name pipeline \
282
+ -e GITHUB_TOKEN="your_token" \
283
+ rust-crate-pipeline
284
+ ```
285
+
286
+ ### Docker Environment Variables
287
+
288
+ | Variable | Description | Default |
289
+ |----------|-------------|---------|
290
+ | `GITHUB_TOKEN` | GitHub Personal Access Token | Required |
291
+ | `PRODUCTION` | Enable production mode | `false` |
292
+ | `PYTHONUNBUFFERED` | Force unbuffered output | `1` |
293
+
294
+ ## 📊 Output & Data Format
295
+
296
+ ### Output Structure
297
+
298
+ ```
299
+ output/
300
+ ├── enriched_crates_YYYYMMDD_HHMMSS.json # Main results
301
+ ├── metadata_YYYYMMDD_HHMMSS.json # Raw metadata
302
+ ├── errors_YYYYMMDD_HHMMSS.log # Error log
303
+ └── checkpoints/
304
+ └── checkpoint_N.json # Progress saves
305
+ ```
306
+
307
+ ### Data Schema
308
+
309
+ Each processed crate includes:
310
+
311
+ ```json
312
+ {
313
+ "name": "serde",
314
+ "version": "1.0.193",
315
+ "description": "A generic serialization/deserialization framework",
316
+ "repository": "https://github.com/serde-rs/serde",
317
+ "downloads": 50000000,
318
+ "github_stars": 8500,
319
+ "category": "Serialization",
320
+ "use_case": "Data serialization and deserialization",
321
+ "feature_summary": "Compile-time serialization framework...",
322
+ "dependencies": [...],
323
+ "security_analysis": {...},
324
+ "source_metrics": {...}
325
+ }
326
+ ```
327
+
328
+ ## 🔍 Monitoring & Troubleshooting
329
+
330
+ ### Common Issues & Solutions
331
+
332
+ #### GitHub Token Problems
333
+
334
+ ```bash
335
+ # Check token status
336
+ python3 check_github_token.py
337
+
338
+ # Common error: Rate limit warnings
339
+ [WARNING] GitHub API rate limit low: 60 remaining
340
+ # Solution: Set GITHUB_TOKEN environment variable
341
+
342
+ # Common error: Invalid token
343
+ [ERROR] GitHub token is invalid or expired
344
+ # Solution: Generate new token at https://github.com/settings/tokens
345
+ ```
346
+
347
+ #### LLM Validation Retries
348
+
349
+ ```bash
350
+ # Common warning: Validation failures
351
+ [WARNING] Validation failed on attempt 1/3. Retrying...
352
+ # Solution: Use production mode to reduce retry warnings
353
+ PRODUCTION=true python3 -m rust_crate_pipeline
354
+ ```
355
+
356
+ #### Resource Issues
357
+
358
+ ```bash
359
+ # Memory usage optimization
360
+ python3 -m rust_crate_pipeline --batch-size 3
361
+
362
+ # Disk space monitoring
363
+ df -h . # Check available space
364
+
365
+ # Network timeout handling
366
+ python3 -m rust_crate_pipeline --log-level DEBUG
367
+ ```
368
+
369
+ ### Performance Monitoring
370
+
371
+ #### Processing Times (Typical)
372
+
373
+ - **Metadata only**: 2-3 seconds per crate
374
+ - **With AI enrichment**: 15-30 seconds per crate
375
+ - **Full analysis**: 45-60 seconds per crate
376
+
377
+ #### Resource Usage
378
+
379
+ - **Memory**: 2-4GB during processing
380
+ - **Storage**: 10-50MB per crate (temporary files)
381
+ - **Network**: 1-5MB per crate (API calls)
382
+
383
+ #### Monitoring Commands
384
+
385
+ ```bash
386
+ # Check process status
387
+ ps aux | grep rust_crate_pipeline
388
+
389
+ # Monitor resource usage
390
+ top -p $(pgrep -f rust_crate_pipeline)
391
+
392
+ # Check logs
393
+ tail -f pipeline.log
394
+
395
+ # Docker monitoring
396
+ docker stats pipeline
397
+ ```
398
+
399
+ ## 🚀 Deployment Guide
400
+
401
+ ### SSH/Remote Server Deployment
402
+
403
+ ```bash
404
+ # Background execution with logging
405
+ nohup python3 run_production.py > pipeline.log 2>&1 &
406
+
407
+ # Monitor progress
408
+ tail -f pipeline.log
409
+
410
+ # Check process
411
+ jobs
412
+ ps aux | grep rust_crate_pipeline
413
+ ```
414
+
415
+ ### Systemd Service (Linux)
416
+
417
+ Create `/etc/systemd/system/rust-crate-pipeline.service`:
418
+
419
+ ```ini
420
+ [Unit]
421
+ Description=Rust Crate Data Pipeline
422
+ After=network.target
423
+
424
+ [Service]
425
+ Type=simple
426
+ User=your-username
427
+ WorkingDirectory=/path/to/pipeline
428
+ Environment=GITHUB_TOKEN=your_token_here
429
+ Environment=PRODUCTION=true
430
+ ExecStart=/usr/bin/python3 run_production.py
431
+ Restart=on-failure
432
+ RestartSec=30
433
+
434
+ [Install]
435
+ WantedBy=multi-user.target
436
+ ```
437
+
438
+ Enable and start:
439
+
440
+ ```bash
441
+ sudo systemctl daemon-reload
442
+ sudo systemctl enable rust-crate-pipeline
443
+ sudo systemctl start rust-crate-pipeline
444
+ sudo systemctl status rust-crate-pipeline
445
+ ```
446
+
447
+ ## 🏗️ Architecture
448
+
449
+ ### Core Components
450
+
451
+ 1. **CrateDataPipeline**: Main orchestration class that coordinates all processing
452
+ 2. **LLMEnricher**: Handles AI-powered enrichment using local LLM models
453
+ 3. **CrateAPIClient**: Manages API interactions with crates.io and fallback sources
454
+ 4. **GitHubBatchClient**: Optimized GitHub API client with rate limiting
455
+ 5. **SourceAnalyzer**: Analyzes source code metrics and complexity
456
+ 6. **SecurityAnalyzer**: Checks for security vulnerabilities and patterns
457
+ 7. **UserBehaviorAnalyzer**: Tracks community engagement and version adoption
458
+ 8. **DependencyAnalyzer**: Builds and analyzes dependency relationships
459
+
460
+ ### Processing Flow
461
+
462
+ ```
463
+ 1. Crate Discovery → 2. Metadata Fetching → 3. AI Enrichment
464
+ ↓ ↓ ↓
465
+ 4. Source Analysis → 5. Security Scanning → 6. Community Analysis
466
+ ↓ ↓ ↓
467
+ 7. Dependency Mapping → 8. Data Aggregation → 9. Report Generation
468
+ ```
469
+
470
+ ### Project Structure
471
+
472
+ ```
473
+ rust_crate_pipeline/
474
+ ├── __init__.py # Package initialization
475
+ ├── __main__.py # Entry point for python -m execution
476
+ ├── main.py # CLI interface and main execution logic
477
+ ├── config.py # Configuration classes and data models
478
+ ├── pipeline.py # Main orchestration and workflow management
479
+ ├── ai_processing.py # LLM integration and AI-powered enrichment
480
+ ├── network.py # API clients and HTTP request handling
481
+ ├── analysis.py # Source code, security, and dependency analysis
482
+ ├── github_token_checker.py # Token validation and setup
483
+ ├── production_config.py # Production optimizations
484
+ └── utils/ # Utility functions
485
+ ├── logging_utils.py # Logging configuration and decorators
486
+ └── file_utils.py # File operations and disk management
487
+ ```
488
+
489
+ ## 🧪 API Usage
490
+
491
+ ### Programmatic Usage
492
+
493
+ ```python
494
+ from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
495
+
496
+ # Create custom configuration
497
+ config = PipelineConfig(
498
+ batch_size=5,
499
+ max_tokens=512,
500
+ model_path="/path/to/model.gguf"
501
+ )
502
+
503
+ # Initialize and run pipeline
504
+ pipeline = CrateDataPipeline(config)
505
+ pipeline.run()
506
+
507
+ # Or use individual components
508
+ from rust_crate_pipeline import LLMEnricher, SourceAnalyzer
509
+
510
+ enricher = LLMEnricher(config)
511
+ analyzer = SourceAnalyzer()
512
+ ```
513
+
514
+ ### Custom Processing
515
+
516
+ ```python
517
+ # Process specific crates with custom options
518
+ pipeline = CrateDataPipeline(
519
+ config,
520
+ limit=50,
521
+ crate_list=["serde", "tokio", "actix-web"],
522
+ skip_ai=False,
523
+ output_dir="./custom_analysis"
524
+ )
525
+ ```
526
+
527
+ ## 🔧 Development & Contributing
528
+
529
+ ### Development Setup
530
+
531
+ ```bash
532
+ # Clone and install
533
+ git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
534
+ cd SigilDERG-Data_Production
535
+ pip install -r requirements.txt
536
+
537
+ # Run tests
538
+ python3 test_optimizations.py
539
+ python3 test_token_integration.py
540
+
541
+ # Verify installation
542
+ python3 check_github_token.py
543
+ ```
544
+
545
+ ### Adding Features
546
+
547
+ 1. Implement new analyzer in `analysis.py`
548
+ 2. Add configuration options to `config.py`
549
+ 3. Integrate with pipeline in `pipeline.py`
550
+ 4. Add CLI arguments in `main.py`
551
+ 5. Update tests and documentation
552
+
553
+ ## 📄 License
554
+
555
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
556
+
557
+ ## 🙏 Acknowledgments
558
+
559
+ - **Rust Community** for the excellent crates ecosystem
560
+ - **crates.io** for comprehensive API access
561
+ - **GitHub** for repository metadata and community data
562
+ - **Deepseek** for powerful code-focused language models
563
+ - **llama.cpp** team for efficient local inference
564
+
565
+ ## 📞 Support
566
+
567
+ - **Issues**: Report bugs and request features
568
+ - **Documentation**: Complete guides and API reference
569
+ - **Community**: Join discussions and get help
570
+
571
+ ---
572
+
573
+ **Ready to analyze the Rust ecosystem! 🦀✨**
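The README's Data Schema section above describes the enriched-crate records the pipeline emits. As a small consumer-side sketch, assuming the output layout and field names from that section (treat the filename pattern and top-level list structure as illustrative):

```python
import json
from pathlib import Path

# Assumed layout from the README: output/enriched_crates_YYYYMMDD_HHMMSS.json
latest = max(Path("output").glob("enriched_crates_*.json"))

with latest.open() as fh:
    crates = json.load(fh)

# Example: list the most-downloaded crates in the "Serialization" category
serialization = [c for c in crates if c.get("category") == "Serialization"]
top = sorted(serialization, key=lambda c: c.get("downloads", 0), reverse=True)[:10]
for crate in top:
    print(f"{crate['name']:<20} {crate.get('downloads', 0):>12,}")
```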
@@ -1,17 +1,19 @@
1
1
  rust_crate_pipeline/__init__.py,sha256=m9fb1WGbyOimxK2e18FSgvLWGYBwbLoHM_mscr-nAPs,1429
2
2
  rust_crate_pipeline/__main__.py,sha256=fYgtPofuk4vkwiZ7ELP4GVMNj_QiKmZMSlvhzsNGuDs,155
3
- rust_crate_pipeline/ai_processing.py,sha256=kmV6RfhwRVYQyp55Eez8R0jNAVJB8sdfIB1-60j1Eio,18027
3
+ rust_crate_pipeline/ai_processing.py,sha256=Ma5Oo4_pRfhoyvti_ZF6xV9zi4kEukMRzBva76F7cEM,18351
4
4
  rust_crate_pipeline/analysis.py,sha256=ijP4zp3cFnN09nZkeCluyAvbyAtAW_M2YSxALpQX8LY,18615
5
5
  rust_crate_pipeline/config.py,sha256=r4Y_5SD-lfrM1112edk9T0S0MiVxaNSSHk4q2yDrM88,1528
6
- rust_crate_pipeline/main.py,sha256=vPLvuwekNpwSUHEAc6lnzQ0q3QU-YZS9ZIijhkkfRzI,5384
6
+ rust_crate_pipeline/github_token_checker.py,sha256=MJqHP8J84NEZ6nzdutpC7iRnsP0kyqscjLUosvmI4MI,3768
7
+ rust_crate_pipeline/main.py,sha256=J8ORQA6s3wyWw2R3oB_IEm2J5tx1CFdspw5kb5Ep8zQ,6323
7
8
  rust_crate_pipeline/network.py,sha256=t_G8eh_WHNugm_laMftcWVbHsmP0bOlTPnVW9DqF6SU,13375
8
9
  rust_crate_pipeline/pipeline.py,sha256=Uwfw4uLL3aN1gJl5xSwvvyaY9ceeP7LVr02IzNx0tPM,12033
9
- rust_crate_pipeline/version.py,sha256=np_eAssJT7w0mfKLkvPGj07FJndSt79OH1PjHEaXb_s,542
10
+ rust_crate_pipeline/production_config.py,sha256=2GT8bxytcrMRrcfjzpay5RTtATE3rbmDvNUBvVhrYSQ,2472
11
+ rust_crate_pipeline/version.py,sha256=Ne-Iy0D2YOCWyWVo3gFNVhuUg4tBtSnlqGIDUEeWtws,1022
10
12
  rust_crate_pipeline/utils/file_utils.py,sha256=lnHeLrt1JYaQhRDKtA1TWR2HIyRO8zwOyWb-KmAmWgk,2126
11
13
  rust_crate_pipeline/utils/logging_utils.py,sha256=O4Jnr_k9dBchrVqXf-vqtDKgizDtL_ljh8g7G2VCX_c,2241
12
- rust_crate_pipeline-1.1.1.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
13
- rust_crate_pipeline-1.1.1.dist-info/METADATA,sha256=96NawdhQK_JtrzH7uQYVxD4JJtsr-p8ask9tf6QmgTc,15260
14
- rust_crate_pipeline-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- rust_crate_pipeline-1.1.1.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
16
- rust_crate_pipeline-1.1.1.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
17
- rust_crate_pipeline-1.1.1.dist-info/RECORD,,
14
+ rust_crate_pipeline-1.2.0.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
15
+ rust_crate_pipeline-1.2.0.dist-info/METADATA,sha256=0iLlshmEVa7L-CNZp2RtrG2eTyGULwT_wx-GfbckhD4,16741
16
+ rust_crate_pipeline-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ rust_crate_pipeline-1.2.0.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
18
+ rust_crate_pipeline-1.2.0.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
19
+ rust_crate_pipeline-1.2.0.dist-info/RECORD,,
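The RECORD entries above follow the standard wheel format of `path,sha256=<urlsafe-base64 digest without padding>,size`, so the listed hashes can be re-checked against an installed copy. A verification sketch (the RECORD path is illustrative):

```python
import base64
import csv
import hashlib
from pathlib import Path

def verify_record(record_path: str) -> None:
    """Recompute digests for files listed in a wheel RECORD and compare."""
    root = Path(record_path).parent.parent  # site-packages directory
    with open(record_path, newline="") as fh:
        for path, digest, _size in csv.reader(fh):
            if not digest:               # the RECORD file itself carries no hash
                continue
            algo, _, expected = digest.partition("=")
            data = (root / path).read_bytes()
            actual = base64.urlsafe_b64encode(
                hashlib.new(algo, data).digest()
            ).rstrip(b"=").decode()
            status = "OK" if actual == expected else "MISMATCH"
            print(f"{status:8} {path}")

verify_record("rust_crate_pipeline-1.2.0.dist-info/RECORD")
```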
@@ -1,474 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: rust-crate-pipeline
3
- Version: 1.1.1
4
- Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
5
- Home-page: https://github.com/DaveTmire85/SigilDERG-Data_Production
6
- Author: SuperUser666-Sigil
7
- Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
8
- License-Expression: MIT
9
- Project-URL: Homepage, https://github.com/DaveTmire85/SigilDERG-Data_Production
10
- Project-URL: Documentation, https://github.com/DaveTmire85/SigilDERG-Data_Production#readme
11
- Project-URL: Repository, https://github.com/DaveTmire85/SigilDERG-Data_Production
12
- Project-URL: Bug Tracker, https://github.com/DaveTmire85/SigilDERG-Data_Production/issues
13
- Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
14
- Classifier: Development Status :: 4 - Beta
15
- Classifier: Intended Audience :: Developers
16
- Classifier: Operating System :: OS Independent
17
- Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.8
19
- Classifier: Programming Language :: Python :: 3.9
20
- Classifier: Programming Language :: Python :: 3.10
21
- Classifier: Programming Language :: Python :: 3.11
22
- Classifier: Programming Language :: Python :: 3.12
23
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
- Classifier: Topic :: Software Development :: Build Tools
25
- Classifier: Topic :: Software Development :: Quality Assurance
26
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
27
- Requires-Python: >=3.8
28
- Description-Content-Type: text/markdown
29
- License-File: LICENSE
30
- Requires-Dist: requests>=2.28.0
31
- Requires-Dist: requests-cache>=1.0.0
32
- Requires-Dist: beautifulsoup4>=4.11.0
33
- Requires-Dist: tqdm>=4.64.0
34
- Requires-Dist: llama-cpp-python>=0.2.0
35
- Requires-Dist: tiktoken>=0.5.0
36
- Requires-Dist: psutil>=5.9.0
37
- Requires-Dist: python-dateutil>=2.8.0
38
- Provides-Extra: dev
39
- Requires-Dist: pytest>=7.0.0; extra == "dev"
40
- Requires-Dist: black>=22.0.0; extra == "dev"
41
- Requires-Dist: isort>=5.10.0; extra == "dev"
42
- Provides-Extra: advanced
43
- Requires-Dist: radon>=6.0.0; extra == "advanced"
44
- Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
45
- Dynamic: author
46
- Dynamic: home-page
47
- Dynamic: license-file
48
- Dynamic: requires-python
49
-
50
- # Rust Crate Data Processing Pipeline
51
-
52
- [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
53
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
54
-
55
- A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights and dependency analysis.
56
-
57
- ## 🚀 Features
58
-
59
- ### 📊 **Comprehensive Data Collection**
60
- - **Multi-source metadata fetching**: Pulls data from crates.io, GitHub, and lib.rs
61
- - **Dependency analysis**: Complete dependency graphs and reverse dependency mapping
62
- - **Code snippet extraction**: Automatically extracts Rust code examples from READMEs
63
- - **Feature analysis**: Detailed breakdown of crate features and their dependencies
64
-
65
- ### 🤖 **AI-Powered Enrichment**
66
- - **Use case classification**: Automatically categorizes crates (Web Framework, ML, Database, etc.)
67
- - **Feature summarization**: AI-generated explanations of crate features
68
- - **Factual/counterfactual pairs**: Generates training data for fact verification
69
- - **Smart content truncation**: Intelligently preserves important README sections
70
-
71
- ### 🔍 **Advanced Analysis**
72
- - **Source code metrics**: Lines of code, complexity analysis, API surface area
73
- - **Security scanning**: Vulnerability checks and security pattern analysis
74
- - **Community metrics**: GitHub activity, issue tracking, version adoption
75
- - **Performance optimization**: Batch processing, caching, and retry logic
76
-
77
- ### ⚡ **Production-Ready Features**
78
- - **Robust error handling**: Graceful degradation and comprehensive logging
79
- - **Rate limiting**: Respects GitHub API limits with intelligent backoff
80
- - **Checkpointing**: Automatic progress saving for long-running processes
81
- - **Configurable processing**: Extensive CLI and config file options
82
-
83
- ## 📋 Prerequisites
84
-
85
- ### Required Dependencies
86
- ```bash
87
- pip install requests requests-cache beautifulsoup4 tqdm llama-cpp-python tiktoken psutil
88
- ```
89
-
90
- ### Optional Dependencies
91
- ```bash
92
- pip install radon rustworkx # For advanced code analysis
93
- ```
94
-
95
- ### System Requirements
96
- - **Python 3.8+**
97
- - **Local LLM Model**: Deepseek Coder or compatible GGUF model
98
- - **GitHub Token**: For enhanced GitHub API access (optional but recommended)
99
- - **Disk Space**: ~1GB free space for processing and caching
100
-
101
- ## 🛠️ Installation
102
-
103
- ### 1. Clone the Repository
104
- ```bash
105
- git clone <repository-url>
106
- cd enrichment-flow2
107
- ```
108
-
109
- ### 2. Install Dependencies
110
- ```bash
111
- pip install -r requirements.txt
112
- ```
113
-
114
- ### 3. Download LLM Model
115
- ```bash
116
- # Example: Download Deepseek Coder model
117
- mkdir -p ~/models/deepseek/
118
- wget https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GGUF/resolve/main/deepseek-coder-6.7b-instruct.Q4_K_M.gguf \
119
- -O ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
120
- ```
121
-
122
- ### 4. Set Environment Variables (Optional)
123
- ```bash
124
- export GITHUB_TOKEN="your_github_token_here"
125
- ```
126
-
127
- ## 🚀 Quick Start
128
-
129
- ### Installation
130
-
131
- #### From PyPI (Recommended)
132
- ```bash
133
- pip install rust-crate-pipeline
134
- ```
135
-
136
- #### From Source
137
- ```bash
138
- git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
139
- cd SigilDERG-Data_Production
140
- pip install -e .
141
- ```
142
-
143
- #### Development Installation
144
- ```bash
145
- git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
146
- cd SigilDERG-Data_Production
147
- pip install -e ".[dev]"
148
- ```
149
-
150
- ### Basic Usage
151
- ```bash
152
- # Run with default settings
153
- python -m rust_crate_pipeline
154
-
155
- # Process only 20 crates for testing
156
- python -m rust_crate_pipeline --limit 20
157
-
158
- # Skip AI processing for faster metadata-only collection
159
- python -m rust_crate_pipeline --skip-ai --limit 50
160
- ```
161
-
162
- ### Advanced Usage
163
- ```bash
164
- # Custom configuration
165
- python -m rust_crate_pipeline \
166
- --limit 100 \
167
- --batch-size 5 \
168
- --workers 2 \
169
- --log-level DEBUG \
170
- --output-dir ./results
171
-
172
- # Process specific crates
173
- python -m rust_crate_pipeline \
174
- --crate-list serde tokio actix-web reqwest \
175
- --output-dir ./specific_crates
176
-
177
- # Use custom model and config
178
- python -m rust_crate_pipeline \
179
- --model-path ./my-model.gguf \
180
- --config-file ./custom_config.json
181
- ```
182
-
183
- ## 📁 Project Structure
184
-
185
- ```
186
- enrichment-flow2/
187
- ├── __init__.py # Package initialization and public API
188
- ├── __main__.py # Entry point for python -m execution
189
- ├── main.py # CLI interface and main execution logic
190
- ├── config.py # Configuration classes and data models
191
- ├── pipeline.py # Main orchestration and workflow management
192
- ├── ai_processing.py # LLM integration and AI-powered enrichment
193
- ├── network.py # API clients and HTTP request handling
194
- ├── analysis.py # Source code, security, and dependency analysis
195
- └── utils/ # Utility functions
196
- ├── logging_utils.py # Logging configuration and decorators
197
- └── file_utils.py # File operations and disk management
198
- ```
199
-
200
- ## ⚙️ Configuration
201
-
202
- ### Command Line Arguments
203
-
204
- | Argument | Type | Default | Description |
205
- |----------|------|---------|-------------|
206
- | `--limit` | int | None | Limit number of crates to process |
207
- | `--batch-size` | int | 10 | Crates processed per batch |
208
- | `--workers` | int | 4 | Parallel workers for API requests |
209
- | `--output-dir` | str | auto | Custom output directory |
210
- | `--model-path` | str | default | Path to LLM model file |
211
- | `--max-tokens` | int | 256 | Maximum tokens for LLM generation |
212
- | `--checkpoint-interval` | int | 10 | Save progress every N crates |
213
- | `--log-level` | str | INFO | Logging verbosity |
214
- | `--skip-ai` | flag | False | Skip AI enrichment |
215
- | `--skip-source-analysis` | flag | False | Skip source code analysis |
216
- | `--crate-list` | list | None | Specific crates to process |
217
- | `--config-file` | str | None | JSON configuration file |
218
-
219
- ### Configuration File Example
220
- ```json
221
- {
222
- "model_path": "/path/to/your/model.gguf",
223
- "batch_size": 5,
224
- "n_workers": 2,
225
- "max_tokens": 512,
226
- "checkpoint_interval": 5,
227
- "github_token": "ghp_your_token_here",
228
- "cache_ttl": 7200
229
- }
230
- ```
231
-
232
- ## 📊 Output Format
233
-
234
- The pipeline generates several output files:
235
-
236
- ### 1. **Enriched Metadata** (`enriched_crate_metadata_TIMESTAMP.jsonl`)
237
- ```json
238
- {
239
- "name": "serde",
240
- "version": "1.0.193",
241
- "description": "A generic serialization/deserialization framework",
242
- "use_case": "Serialization",
243
- "score": 8542.3,
244
- "feature_summary": "Provides derive macros for automatic serialization...",
245
- "factual_counterfactual": "✅ Factual: Serde supports JSON serialization...",
246
- "source_analysis": {
247
- "file_count": 45,
248
- "loc": 12500,
249
- "functions": ["serialize", "deserialize", ...],
250
- "has_tests": true
251
- }
252
- }
253
- ```
254
-
255
- ### 2. **Dependency Analysis** (`dependency_analysis_TIMESTAMP.json`)
256
- ```json
257
- {
258
- "dependency_graph": {
259
- "actix-web": ["tokio", "serde", "futures"],
260
- "tokio": ["mio", "parking_lot"]
261
- },
262
- "reverse_dependencies": {
263
- "serde": ["actix-web", "reqwest", "clap"],
264
- "tokio": ["actix-web", "reqwest"]
265
- },
266
- "most_depended": [
267
- ["serde", 156],
268
- ["tokio", 98]
269
- ]
270
- }
271
- ```
272
-
273
- ### 3. **Summary Report** (`summary_report_TIMESTAMP.json`)
274
- ```json
275
- {
276
- "total_crates": 150,
277
- "total_time": "1247.32s",
278
- "timestamp": "2025-06-18T10:30:00",
279
- "most_popular": [
280
- {"name": "serde", "score": 8542.3},
281
- {"name": "tokio", "score": 7234.1}
282
- ]
283
- }
284
- ```
285
-
286
- ## 🔧 Advanced Features
287
-
288
- ### Custom Crate Lists
289
- Process specific crates by providing a custom list:
290
- ```bash
291
- python -m rust_crate_pipeline --crate-list \
292
- serde tokio actix-web reqwest clap \
293
- --output-dir ./web_framework_analysis
294
- ```
295
-
296
- ### Performance Tuning
297
- Optimize for your system:
298
- ```bash
299
- # High-performance setup (good internet, powerful machine)
300
- python -m rust_crate_pipeline --batch-size 20 --workers 8
301
-
302
- # Conservative setup (limited resources)
303
- python -m rust_crate_pipeline --batch-size 3 --workers 1
304
- ```
305
-
306
- ### Development Mode
307
- Quick testing with minimal processing:
308
- ```bash
309
- python -m rust_crate_pipeline \
310
- --limit 5 \
311
- --skip-ai \
312
- --skip-source-analysis \
313
- --log-level DEBUG
314
- ```
315
-
316
- ## 🏗️ Architecture
317
-
318
- ### Core Components
319
-
320
- 1. **CrateDataPipeline**: Main orchestration class that coordinates all processing
321
- 2. **LLMEnricher**: Handles AI-powered enrichment using local LLM models
322
- 3. **CrateAPIClient**: Manages API interactions with crates.io and fallback sources
323
- 4. **GitHubBatchClient**: Optimized GitHub API client with rate limiting
324
- 5. **SourceAnalyzer**: Analyzes source code metrics and complexity
325
- 6. **SecurityAnalyzer**: Checks for security vulnerabilities and patterns
326
- 7. **UserBehaviorAnalyzer**: Tracks community engagement and version adoption
327
- 8. **DependencyAnalyzer**: Builds and analyzes dependency relationships
328
-
329
- ### Processing Flow
330
-
331
- ```
332
- 1. Crate Discovery → 2. Metadata Fetching → 3. AI Enrichment
333
- ↓ ↓ ↓
334
- 4. Source Analysis → 5. Security Scanning → 6. Community Analysis
335
- ↓ ↓ ↓
336
- 7. Dependency Mapping → 8. Data Aggregation → 9. Report Generation
337
- ```
338
-
339
- ## 🧪 API Usage
340
-
341
- ### Programmatic Usage
342
- ```python
343
- from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
344
-
345
- # Create custom configuration
346
- config = PipelineConfig(
347
- batch_size=5,
348
- max_tokens=512,
349
- model_path="/path/to/model.gguf"
350
- )
351
-
352
- # Initialize and run pipeline
353
- pipeline = CrateDataPipeline(config)
354
- pipeline.run()
355
-
356
- # Or use individual components
357
- from rust_crate_pipeline import LLMEnricher, SourceAnalyzer
358
-
359
- enricher = LLMEnricher(config)
360
- analyzer = SourceAnalyzer()
361
- ```
362
-
363
- ### Custom Processing
364
- ```python
365
- # Process specific crates with custom options
366
- pipeline = CrateDataPipeline(
367
- config,
368
- limit=50,
369
- crate_list=["serde", "tokio", "actix-web"],
370
- skip_ai=False,
371
- output_dir="./custom_analysis"
372
- )
373
- ```
374
-
375
- ## 🐛 Troubleshooting
376
-
377
- ### Common Issues
378
-
379
- **🔴 Model Loading Errors**
380
- ```bash
381
- # Verify model path
382
- ls -la ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
383
-
384
- # Check model format compatibility
385
- python -c "from llama_cpp import Llama; print('Model loading OK')"
386
- ```
387
-
388
- **🔴 API Rate Limiting**
389
- ```bash
390
- # Set GitHub token for higher rate limits
391
- export GITHUB_TOKEN="your_token_here"
392
-
393
- # Reduce batch size and workers
394
- python -m rust_crate_pipeline --batch-size 3 --workers 1
395
- ```
396
-
397
- **🔴 Memory Issues**
398
- ```bash
399
- # Reduce token limits and batch size
400
- python -m rust_crate_pipeline --max-tokens 128 --batch-size 2
401
- ```
402
-
403
- **🔴 Network Timeouts**
404
- ```bash
405
- # Enable debug logging to identify issues
406
- python -m rust_crate_pipeline --log-level DEBUG --limit 10
407
- ```
408
-
409
- ### Performance Optimization
410
-
411
- 1. **Use SSD storage** for faster caching and temporary file operations
412
- 2. **Increase RAM** if processing large batches (recommended: 8GB+)
413
- 3. **Set GITHUB_TOKEN** for 5000 req/hour instead of 60 req/hour
414
- 4. **Use appropriate batch sizes** based on your internet connection
415
- 5. **Monitor disk space** - processing can generate several GB of data
416
-
417
- ## 📈 Performance Metrics
418
-
419
- ### Typical Processing Times
420
- - **Metadata only**: ~2-3 seconds per crate
421
- - **With AI enrichment**: ~15-30 seconds per crate
422
- - **Full analysis**: ~45-60 seconds per crate
423
-
424
- ### Resource Usage
425
- - **Memory**: 2-4GB during processing
426
- - **Disk**: 10-50MB per crate (temporary files)
427
- - **Network**: ~1-5MB per crate (API calls)
428
-
429
- ## 🤝 Contributing
430
-
431
- ### Development Setup
432
- ```bash
433
- # Clone repository
434
- git clone <repository-url>
435
- cd enrichment-flow2
436
-
437
- # Install development dependencies
438
- pip install -r requirements-dev.txt
439
-
440
- # Run tests
441
- python -m pytest tests/
442
-
443
- # Format code
444
- black . && isort .
445
- ```
446
-
447
- ### Adding New Analysis Features
448
- 1. Implement new analyzer in `analysis.py`
449
- 2. Add configuration options to `config.py`
450
- 3. Integrate with pipeline in `pipeline.py`
451
- 4. Add CLI arguments in `main.py`
452
- 5. Update documentation
453
-
454
- ## 📄 License
455
-
456
- This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
457
-
458
- ## 🙏 Acknowledgments
459
-
460
- - **Rust Community** for the excellent crates ecosystem
461
- - **crates.io** for providing comprehensive API access
462
- - **GitHub** for repository metadata and community data
463
- - **Deepseek** for the powerful code-focused language model
464
- - **llama.cpp** team for efficient local inference capabilities
465
-
466
- ## 📞 Support
467
-
468
- - **Issues**: [GitHub Issues](https://github.com/your-repo/issues)
469
- - **Discussions**: [GitHub Discussions](https://github.com/your-repo/discussions)
470
- - **Documentation**: [Wiki](https://github.com/your-repo/wiki)
471
-
472
- ---
473
-
474
- **Happy crate analyzing! 🦀✨**