rust-crate-pipeline 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/ai_processing.py +10 -8
- rust_crate_pipeline/github_token_checker.py +102 -0
- rust_crate_pipeline/main.py +20 -0
- rust_crate_pipeline/production_config.py +76 -0
- rust_crate_pipeline/version.py +9 -1
- rust_crate_pipeline-1.2.0.dist-info/METADATA +573 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/RECORD +11 -9
- rust_crate_pipeline-1.1.1.dist-info/METADATA +0 -474
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/top_level.txt +0 -0
@@ -166,8 +166,7 @@ class LLMEnricher:
|
|
166
166
|
prompt: str,
|
167
167
|
validation_func: Callable[[str], bool],
|
168
168
|
temp: float = 0.2,
|
169
|
-
max_tokens: int = 256,
|
170
|
-
retries: int = 3
|
169
|
+
max_tokens: int = 256, retries: int = 2 # Reduced default retries
|
171
170
|
) -> Optional[str]:
|
172
171
|
"""Run LLM with validation and automatic retry on failure"""
|
173
172
|
for attempt in range(retries):
|
@@ -180,8 +179,11 @@ class LLMEnricher:
|
|
180
179
|
if result and validation_func(result):
|
181
180
|
return result
|
182
181
|
|
183
|
-
# If we get here, validation failed
|
184
|
-
|
182
|
+
# If we get here, validation failed - use debug level for early attempts
|
183
|
+
if attempt == retries - 1:
|
184
|
+
logging.warning(f"Final validation attempt failed. Using best available result.")
|
185
|
+
else:
|
186
|
+
logging.debug(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with modified parameters.")
|
185
187
|
|
186
188
|
# For the last attempt, simplify the prompt
|
187
189
|
if attempt == retries - 2:
|
@@ -190,11 +192,11 @@ class LLMEnricher:
|
|
190
192
|
except Exception as e:
|
191
193
|
logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")
|
192
194
|
|
193
|
-
#
|
194
|
-
time.sleep(1.
|
195
|
+
# Reduced backoff to minimize waiting time
|
196
|
+
time.sleep(1.0 + (attempt * 0.5))
|
195
197
|
|
196
|
-
# If we
|
197
|
-
return None
|
198
|
+
# If we exhausted all retries, return the last result even if not perfect
|
199
|
+
return result if 'result' in locals() else None
|
198
200
|
|
199
201
|
def simplify_prompt(self, prompt: str) -> str:
|
200
202
|
"""Simplify a prompt by removing examples and reducing context"""
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# github_token_checker.py
|
2
|
+
"""
|
3
|
+
GitHub Token Checker Module
|
4
|
+
Lightweight version of the token checker for integration into the main pipeline.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import requests
|
10
|
+
import logging
|
11
|
+
|
12
|
+
def check_github_token_quick():
    """Quick check if a GitHub token is available and valid.

    Reads ``GITHUB_TOKEN`` from the environment, applies two cheap local
    checks (present, at least 20 characters) and then queries the GitHub
    ``/rate_limit`` endpoint with the token.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is True only when the
        API answered with HTTP 200; ``message`` is a human-readable
        explanation of the outcome in every case.
    """
    # Strip surrounding whitespace first: values exported from files or
    # secret stores often carry a trailing newline, which would otherwise be
    # placed verbatim into the Authorization header. A whitespace-only value
    # is treated the same as an unset variable.
    token = (os.getenv("GITHUB_TOKEN") or "").strip()

    if not token:
        return False, "GITHUB_TOKEN environment variable not set"

    if len(token) < 20:
        return False, "GITHUB_TOKEN seems too short - may be invalid"

    try:
        # Quick API check
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "Authorization": f"token {token}",
        }

        response = requests.get(
            "https://api.github.com/rate_limit", headers=headers, timeout=10
        )

        if response.status_code == 200:
            data = response.json()
            remaining = data["resources"]["core"]["remaining"]
            return True, f"Token valid, {remaining} API calls remaining"
        elif response.status_code == 401:
            return False, "GitHub token is invalid or expired"
        else:
            return False, f"GitHub API returned status code: {response.status_code}"

    except requests.exceptions.RequestException as e:
        return False, f"Network error checking token: {str(e)}"
    except Exception as e:
        # Deliberate catch-all: this is a best-effort pre-flight check, so an
        # unexpected payload shape is reported to the caller, not raised.
        return False, f"Error checking token: {str(e)}"
|
44
|
+
|
45
|
+
def prompt_for_token_setup():
    """Print GitHub token setup instructions and ask whether to continue.

    Returns:
        True if the user answers ``y``/``yes`` (case-insensitive, surrounding
        whitespace ignored) to continuing without a token, False for any
        other answer or when no answer can be read from stdin.
    """
    print("\n" + "="*60)
    print("🔑 GitHub Token Required")
    print("="*60)
    print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
    print("to access repository information and avoid rate limits.")
    print("\n📋 Quick Setup:")
    print("1. Get token: https://github.com/settings/tokens")
    print("2. Required scopes: public_repo, read:user")
    print("3. Set in environment:")
    print(" export GITHUB_TOKEN=\"your_token_here\"")
    print("\n🔧 Setup Scripts Available:")
    print(" ./setup_github_token.sh (Interactive setup)")
    print(" python3 check_github_token.py (Full verification)")
    print("\n" + "="*60)

    # Ask if user wants to continue without token (limited functionality)
    try:
        response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
    except EOFError:
        # stdin is closed or exhausted: fall back to the prompt's default
        # answer (N) instead of crashing with a traceback.
        response = ""

    if response in ['y', 'yes']:
        print("⚠️ Running with limited GitHub API access (60 requests/hour)")
        print(" You may encounter rate limit warnings.")
        return True
    else:
        print("\n🛑 Please set up your GitHub token and try again.")
        return False
|
72
|
+
|
73
|
+
def check_and_setup_github_token():
    """Check the GitHub token and prompt for setup if it is missing or invalid.

    Runs the quick token check first. When it fails, the problem is logged
    and, if stdin is an interactive terminal, the user is asked whether to
    continue without a token.

    Returns:
        True if the pipeline should continue, False if it should exit.
    """
    is_valid, message = check_github_token_quick()

    if is_valid:
        logging.debug("GitHub token check: %s", message)
        return True

    # Token is missing or invalid
    logging.warning("GitHub token issue: %s", message)

    # Check if we're in a non-interactive environment. sys.stdin itself can
    # be None (no console attached), which must not raise AttributeError here.
    if sys.stdin is None or not sys.stdin.isatty():
        logging.error("GitHub token not configured and running in non-interactive mode")
        logging.error("Set GITHUB_TOKEN environment variable before running")
        return False

    # Interactive prompt
    return prompt_for_token_setup()
|
95
|
+
|
96
|
+
if __name__ == "__main__":
    # Manual smoke test: run this module directly to see the token status,
    # and fall through to the setup flow when the token is unusable.
    token_ok, detail = check_github_token_quick()
    marker = '✅' if token_ok else '❌'
    print(f"Token check: {marker} {detail}")

    if not token_ok:
        check_and_setup_github_token()
|
rust_crate_pipeline/main.py
CHANGED
@@ -8,6 +8,8 @@ import argparse
|
|
8
8
|
from typing import Optional
|
9
9
|
from .config import PipelineConfig
|
10
10
|
from .pipeline import CrateDataPipeline
|
11
|
+
from .production_config import setup_production_environment
|
12
|
+
from .github_token_checker import check_and_setup_github_token
|
11
13
|
|
12
14
|
def parse_arguments():
|
13
15
|
"""Parse command line arguments"""
|
@@ -21,6 +23,7 @@ Examples:
|
|
21
23
|
python -m rust_crate_pipeline --batch-size 5 # Smaller batches
|
22
24
|
python -m rust_crate_pipeline --output-dir ./data # Custom output directory
|
23
25
|
python -m rust_crate_pipeline --log-level DEBUG # Verbose logging
|
26
|
+
PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
|
24
27
|
"""
|
25
28
|
)
|
26
29
|
|
@@ -123,14 +126,31 @@ def check_disk_space():
|
|
123
126
|
logging.warning("Low disk space! This may affect performance.")
|
124
127
|
|
125
128
|
def main():
|
129
|
+
# Setup production environment first for optimal logging
|
130
|
+
prod_config = setup_production_environment()
|
131
|
+
|
126
132
|
args = parse_arguments()
|
127
133
|
configure_logging(args.log_level)
|
128
134
|
check_disk_space()
|
129
135
|
|
136
|
+
# Check GitHub token before proceeding
|
137
|
+
if not check_and_setup_github_token():
|
138
|
+
logging.error("GitHub token setup cancelled or failed. Exiting.")
|
139
|
+
sys.exit(1)
|
140
|
+
|
130
141
|
try:
|
131
142
|
# Create config from command line arguments
|
132
143
|
config_kwargs = {}
|
133
144
|
|
145
|
+
# Apply production optimizations if available
|
146
|
+
if prod_config:
|
147
|
+
config_kwargs.update({
|
148
|
+
'max_retries': prod_config.get('max_retries', 3),
|
149
|
+
'batch_size': prod_config.get('batch_size', 10),
|
150
|
+
'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
|
151
|
+
'cache_ttl': prod_config.get('cache_ttl', 3600),
|
152
|
+
})
|
153
|
+
|
134
154
|
if args.batch_size:
|
135
155
|
config_kwargs['batch_size'] = args.batch_size
|
136
156
|
if args.workers:
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# production_config.py
|
2
|
+
"""
|
3
|
+
Production configuration to reduce runtime warnings and optimize performance.
|
4
|
+
This file contains settings that can be imported to minimize verbose logging
|
5
|
+
and improve the user experience in production environments.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
import os
|
10
|
+
|
11
|
+
# Production logging configuration
|
12
|
+
def configure_production_logging():
    """Set up root logging and quieten chatty third-party loggers.

    The root logger is configured at INFO with a timestamped format, and the
    ``requests``, ``urllib3`` and ``requests_cache`` loggers are capped at
    WARNING. When the ``PRODUCTION`` environment variable equals ``true``
    (case-insensitive), the root logger is raised to WARNING while the
    ``rust_crate_pipeline`` logger stays at INFO.
    """
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
    )

    # HTTP libraries are noisy below WARNING; cap them all the same way.
    for noisy_name in ('requests', 'urllib3', 'requests_cache'):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

    production_flag = os.getenv('PRODUCTION', 'false').lower()
    if production_flag != 'true':
        return

    # Production: everything quiet except the pipeline's own messages.
    logging.getLogger().setLevel(logging.WARNING)
    logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
|
31
|
+
|
32
|
+
# Production-optimized settings
|
33
|
+
# Defaults applied when the pipeline runs in production mode.
PRODUCTION_SETTINGS = dict(
    # Retry budget (kept low to minimize warnings)
    max_retries=2,
    validation_retries=2,
    # GitHub API management
    github_rate_limit_threshold=100,
    github_critical_threshold=50,
    # LLM settings
    llm_timeout=30,
    llm_max_attempts=2,
    # Logging preferences
    quiet_mode=True,
    log_level='INFO',
    # Performance settings
    batch_size=10,
    checkpoint_interval=10,
    cache_ttl=3600,
)
|
55
|
+
|
56
|
+
def get_production_config():
    """Return a fresh shallow copy of the production settings dictionary.

    Callers may mutate the returned dict without affecting
    ``PRODUCTION_SETTINGS``.
    """
    return dict(PRODUCTION_SETTINGS)
|
59
|
+
|
60
|
+
def is_production():
    """Report whether the ``PRODUCTION`` environment variable enables production mode.

    Only the value ``true`` (compared case-insensitively) counts; an unset
    variable or any other value yields False.
    """
    flag = os.getenv('PRODUCTION', 'false')
    return flag.lower() == 'true'
|
63
|
+
|
64
|
+
def setup_production_environment():
    """Set up the complete production environment.

    Configures logging, arranges for ``UserWarning`` to be ignored unless the
    user already supplied ``PYTHONWARNINGS``, and prints which mode is active.

    Returns:
        A copy of the production settings dict when production mode is
        enabled, otherwise an empty dict.
    """
    # Function-scope import keeps this block self-contained.
    import warnings

    configure_production_logging()

    # Set environment variables for quieter operation. PYTHONWARNINGS is only
    # read at interpreter startup, so exporting it here affects child
    # processes only; install the equivalent filter for this process as well.
    # append=True keeps any filters the user configured explicitly ahead of
    # this one.
    if 'PYTHONWARNINGS' not in os.environ:
        os.environ['PYTHONWARNINGS'] = 'ignore::UserWarning'
        warnings.filterwarnings('ignore', category=UserWarning, append=True)

    if is_production():
        print("🚀 Production mode enabled - optimized for minimal warnings")
        return get_production_config()
    else:
        print("🔧 Development mode - full logging enabled")
        return {}
|
rust_crate_pipeline/version.py
CHANGED
@@ -1,9 +1,17 @@
|
|
1
1
|
"""Version information for rust-crate-pipeline."""
|
2
2
|
|
3
|
-
__version__ = "1.
|
3
|
+
__version__ = "1.2.0"
|
4
4
|
__version_info__ = tuple(int(x) for x in __version__.split("."))
|
5
5
|
|
6
6
|
# Version history
|
7
|
+
# 1.2.0 - Major release: Production-ready, cleaned codebase
|
8
|
+
# - Unified documentation into single comprehensive README
|
9
|
+
# - Removed all non-essential development and test files
|
10
|
+
# - Optimized for PyPI distribution and Docker deployment
|
11
|
+
# - Enhanced GitHub token integration and setup
|
12
|
+
# 1.1.2 - Production release: Cleaned up non-essential files
|
13
|
+
# - Unified documentation into single README
|
14
|
+
# - Optimized for PyPI distribution
|
7
15
|
# 1.1.1 - Bug fix: Added missing python-dateutil dependency
|
8
16
|
# - Fixed relativedelta import error
|
9
17
|
# 1.1.0 - Updated author and contact information
|
@@ -0,0 +1,573 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: rust-crate-pipeline
|
3
|
+
Version: 1.2.0
|
4
|
+
Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
|
5
|
+
Home-page: https://github.com/DaveTmire85/SigilDERG-Data_Production
|
6
|
+
Author: SuperUser666-Sigil
|
7
|
+
Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
|
8
|
+
License-Expression: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
10
|
+
Project-URL: Documentation, https://github.com/DaveTmire85/SigilDERG-Data_Production#readme
|
11
|
+
Project-URL: Repository, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
12
|
+
Project-URL: Bug Tracker, https://github.com/DaveTmire85/SigilDERG-Data_Production/issues
|
13
|
+
Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
15
|
+
Classifier: Intended Audience :: Developers
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
24
|
+
Classifier: Topic :: Software Development :: Build Tools
|
25
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
27
|
+
Requires-Python: >=3.8
|
28
|
+
Description-Content-Type: text/markdown
|
29
|
+
License-File: LICENSE
|
30
|
+
Requires-Dist: requests>=2.28.0
|
31
|
+
Requires-Dist: requests-cache>=1.0.0
|
32
|
+
Requires-Dist: beautifulsoup4>=4.11.0
|
33
|
+
Requires-Dist: tqdm>=4.64.0
|
34
|
+
Requires-Dist: llama-cpp-python>=0.2.0
|
35
|
+
Requires-Dist: tiktoken>=0.5.0
|
36
|
+
Requires-Dist: psutil>=5.9.0
|
37
|
+
Requires-Dist: python-dateutil>=2.8.0
|
38
|
+
Provides-Extra: dev
|
39
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
40
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
41
|
+
Requires-Dist: isort>=5.10.0; extra == "dev"
|
42
|
+
Provides-Extra: advanced
|
43
|
+
Requires-Dist: radon>=6.0.0; extra == "advanced"
|
44
|
+
Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
|
45
|
+
Dynamic: author
|
46
|
+
Dynamic: home-page
|
47
|
+
Dynamic: license-file
|
48
|
+
Dynamic: requires-python
|
49
|
+
|
50
|
+
# Rust Crate Pipeline
|
51
|
+
|
52
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
53
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
54
|
+
[PyPI](https://pypi.org/)
|
55
|
+
[Docker](https://docker.com/)
|
56
|
+
|
57
|
+
A production-ready pipeline for comprehensive Rust crate analysis, featuring AI-powered insights, dependency mapping, and automated data enrichment. Designed for researchers, developers, and data scientists studying the Rust ecosystem.
|
58
|
+
|
59
|
+
## 🚀 Quick Start
|
60
|
+
|
61
|
+
### 1. Installation
|
62
|
+
|
63
|
+
#### From PyPI (Recommended)
|
64
|
+
```bash
|
65
|
+
pip install rust-crate-pipeline
|
66
|
+
```
|
67
|
+
|
68
|
+
#### From Source
|
69
|
+
```bash
|
70
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
71
|
+
cd SigilDERG-Data_Production
|
72
|
+
pip install -e .
|
73
|
+
```
|
74
|
+
|
75
|
+
#### Development Installation
|
76
|
+
```bash
|
77
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
78
|
+
cd SigilDERG-Data_Production
|
79
|
+
pip install -e ".[dev]"
|
80
|
+
```
|
81
|
+
|
82
|
+
### 2. GitHub Token Setup
|
83
|
+
|
84
|
+
The pipeline requires a GitHub Personal Access Token for optimal performance:
|
85
|
+
|
86
|
+
```bash
|
87
|
+
# Interactive setup (Linux/Unix)
|
88
|
+
chmod +x setup_github_token.sh
|
89
|
+
./setup_github_token.sh
|
90
|
+
|
91
|
+
# Manual setup
|
92
|
+
export GITHUB_TOKEN="your_token_here"
|
93
|
+
echo 'export GITHUB_TOKEN="your_token_here"' >> ~/.bashrc
|
94
|
+
|
95
|
+
# Verify setup
|
96
|
+
python3 check_github_token.py
|
97
|
+
```
|
98
|
+
|
99
|
+
**Get your token at**: [GitHub Settings](https://github.com/settings/tokens)
|
100
|
+
**Required scopes**: `public_repo`, `read:user`
|
101
|
+
|
102
|
+
### 3. Basic Usage
|
103
|
+
|
104
|
+
```bash
|
105
|
+
# Standard mode
|
106
|
+
python3 -m rust_crate_pipeline
|
107
|
+
|
108
|
+
# Production mode (reduced warnings, optimized settings)
|
109
|
+
python3 run_production.py
|
110
|
+
|
111
|
+
# Process only 20 crates for testing
|
112
|
+
python3 -m rust_crate_pipeline --limit 20
|
113
|
+
|
114
|
+
# Skip AI processing for faster metadata-only collection
|
115
|
+
python3 -m rust_crate_pipeline --skip-ai --limit 50
|
116
|
+
```
|
117
|
+
|
118
|
+
### 4. Advanced Usage
|
119
|
+
|
120
|
+
```bash
|
121
|
+
# Custom configuration
|
122
|
+
python3 -m rust_crate_pipeline \
|
123
|
+
--limit 100 \
|
124
|
+
--batch-size 5 \
|
125
|
+
--workers 2 \
|
126
|
+
--log-level DEBUG \
|
127
|
+
--output-dir ./results
|
128
|
+
|
129
|
+
# Process specific crates
|
130
|
+
python3 -m rust_crate_pipeline \
|
131
|
+
--crate-list serde tokio actix-web reqwest \
|
132
|
+
--output-dir ./specific_crates
|
133
|
+
|
134
|
+
# Use custom model and config
|
135
|
+
python3 -m rust_crate_pipeline \
|
136
|
+
--model-path ./my-model.gguf \
|
137
|
+
--config-file ./custom_config.json
|
138
|
+
```
|
139
|
+
|
140
|
+
## 🎯 Features
|
141
|
+
|
142
|
+
### 📊 Data Collection & Analysis
|
143
|
+
|
144
|
+
- **Multi-source metadata**: crates.io, GitHub, lib.rs integration
|
145
|
+
- **Dependency mapping**: Complete dependency graphs and analysis
|
146
|
+
- **Code extraction**: Automatic Rust code example extraction
|
147
|
+
- **Security scanning**: Vulnerability and security pattern analysis
|
148
|
+
- **Performance metrics**: Lines of code, complexity, API surface analysis
|
149
|
+
|
150
|
+
### 🤖 AI-Powered Enrichment
|
151
|
+
|
152
|
+
- **Smart categorization**: Automatic crate classification (Web, ML, Database, etc.)
|
153
|
+
- **Feature summarization**: AI-generated explanations and insights
|
154
|
+
- **Content optimization**: Intelligent README section preservation
|
155
|
+
- **Factual pairs**: Training data generation for fact verification
|
156
|
+
|
157
|
+
### ⚡ Production Features
|
158
|
+
|
159
|
+
- **Automatic GitHub token detection**: Seamless setup and validation
|
160
|
+
- **Smart rate limiting**: Respects GitHub API limits with intelligent backoff
|
161
|
+
- **Robust error handling**: Graceful degradation and comprehensive logging
|
162
|
+
- **Progress checkpointing**: Automatic saving for long-running processes
|
163
|
+
- **Docker ready**: Full container support with optimized configurations
|
164
|
+
|
165
|
+
## 💻 System Requirements
|
166
|
+
|
167
|
+
### Minimum Requirements
|
168
|
+
|
169
|
+
- **Python**: 3.8+
|
170
|
+
- **Memory**: 4GB RAM
|
171
|
+
- **Storage**: 2GB free space
|
172
|
+
- **Network**: Stable internet connection
|
173
|
+
|
174
|
+
### Recommended Setup
|
175
|
+
|
176
|
+
- **Python**: 3.10+
|
177
|
+
- **Memory**: 8GB+ RAM
|
178
|
+
- **Storage**: 10GB+ free space (SSD preferred)
|
179
|
+
- **GitHub Token**: For enhanced API access (5000 vs 60 requests/hour)
|
180
|
+
|
181
|
+
### Dependencies
|
182
|
+
|
183
|
+
Core dependencies are automatically installed:
|
184
|
+
|
185
|
+
```bash
|
186
|
+
requests>=2.28.0
|
187
|
+
requests-cache>=1.0.0
|
188
|
+
beautifulsoup4>=4.11.0
|
189
|
+
tqdm>=4.64.0
|
190
|
+
llama-cpp-python>=0.2.0
|
191
|
+
tiktoken>=0.5.0
|
192
|
+
psutil>=5.9.0
|
193
|
+
python-dateutil>=2.8.0
|
194
|
+
```
|
195
|
+
|
196
|
+
## ⚙️ Configuration & Usage
|
197
|
+
|
198
|
+
### Command Line Options
|
199
|
+
|
200
|
+
| Argument | Type | Default | Description |
|
201
|
+
|----------|------|---------|-------------|
|
202
|
+
| `--limit` | int | None | Limit number of crates to process |
|
203
|
+
| `--batch-size` | int | 10 | Crates processed per batch |
|
204
|
+
| `--workers` | int | 4 | Parallel workers for API requests |
|
205
|
+
| `--output-dir` | str | auto | Custom output directory |
|
206
|
+
| `--model-path` | str | default | Path to LLM model file |
|
207
|
+
| `--max-tokens` | int | 256 | Maximum tokens for LLM generation |
|
208
|
+
| `--checkpoint-interval` | int | 10 | Save progress every N crates |
|
209
|
+
| `--log-level` | str | INFO | Logging verbosity |
|
210
|
+
| `--skip-ai` | flag | False | Skip AI enrichment |
|
211
|
+
| `--skip-source-analysis` | flag | False | Skip source code analysis |
|
212
|
+
| `--crate-list` | list | None | Specific crates to process |
|
213
|
+
| `--config-file` | str | None | JSON configuration file |
|
214
|
+
|
215
|
+
### Production Mode
|
216
|
+
|
217
|
+
Production mode provides optimized settings with reduced warnings:
|
218
|
+
|
219
|
+
```bash
|
220
|
+
# Using production launcher
|
221
|
+
python3 run_production.py [OPTIONS]
|
222
|
+
|
223
|
+
# Using environment variable
|
224
|
+
PRODUCTION=true python3 -m rust_crate_pipeline
|
225
|
+
|
226
|
+
# Docker production mode
|
227
|
+
docker run -e PRODUCTION=true -e GITHUB_TOKEN="token" your-image
|
228
|
+
```
|
229
|
+
|
230
|
+
**Production optimizations:**
|
231
|
+
|
232
|
+
- Reduced retry attempts (3→2) to minimize warnings
|
233
|
+
- Smart GitHub API rate limiting with proactive pausing
|
234
|
+
- Enhanced logging with appropriate levels
|
235
|
+
- Optimized timeout and backoff strategies
|
236
|
+
|
237
|
+
### Configuration Files
|
238
|
+
|
239
|
+
Create a JSON configuration file for custom settings:
|
240
|
+
|
241
|
+
```json
|
242
|
+
{
|
243
|
+
"max_retries": 2,
|
244
|
+
"batch_size": 10,
|
245
|
+
"github_min_remaining": 500,
|
246
|
+
"cache_ttl": 7200,
|
247
|
+
"model_path": "~/models/your-model.gguf"
|
248
|
+
}
|
249
|
+
```
|
250
|
+
|
251
|
+
Use with: `python3 -m rust_crate_pipeline --config-file config.json`
|
252
|
+
|
253
|
+
## 🐳 Docker Deployment
|
254
|
+
|
255
|
+
### Using Docker Compose (Recommended)
|
256
|
+
|
257
|
+
```bash
|
258
|
+
# Set up environment
|
259
|
+
echo "GITHUB_TOKEN=your_token_here" > .env
|
260
|
+
|
261
|
+
# Run with compose
|
262
|
+
docker-compose up -d
|
263
|
+
|
264
|
+
# Monitor logs
|
265
|
+
docker-compose logs -f
|
266
|
+
```
|
267
|
+
|
268
|
+
### Manual Docker Commands
|
269
|
+
|
270
|
+
```bash
|
271
|
+
# Build image
|
272
|
+
docker build -t rust-crate-pipeline .
|
273
|
+
|
274
|
+
# Run container
|
275
|
+
docker run -e GITHUB_TOKEN="your_token" \
|
276
|
+
-e PRODUCTION=true \
|
277
|
+
-v $(pwd)/output:/app/output \
|
278
|
+
rust-crate-pipeline
|
279
|
+
|
280
|
+
# Background execution
|
281
|
+
docker run -d --name pipeline \
|
282
|
+
-e GITHUB_TOKEN="your_token" \
|
283
|
+
rust-crate-pipeline
|
284
|
+
```
|
285
|
+
|
286
|
+
### Docker Environment Variables
|
287
|
+
|
288
|
+
| Variable | Description | Default |
|
289
|
+
|----------|-------------|---------|
|
290
|
+
| `GITHUB_TOKEN` | GitHub Personal Access Token | Required |
|
291
|
+
| `PRODUCTION` | Enable production mode | `false` |
|
292
|
+
| `PYTHONUNBUFFERED` | Force unbuffered output | `1` |
|
293
|
+
|
294
|
+
## 📊 Output & Data Format
|
295
|
+
|
296
|
+
### Output Structure
|
297
|
+
|
298
|
+
```
|
299
|
+
output/
|
300
|
+
├── enriched_crates_YYYYMMDD_HHMMSS.json # Main results
|
301
|
+
├── metadata_YYYYMMDD_HHMMSS.json # Raw metadata
|
302
|
+
├── errors_YYYYMMDD_HHMMSS.log # Error log
|
303
|
+
└── checkpoints/
|
304
|
+
└── checkpoint_N.json # Progress saves
|
305
|
+
```
|
306
|
+
|
307
|
+
### Data Schema
|
308
|
+
|
309
|
+
Each processed crate includes:
|
310
|
+
|
311
|
+
```json
|
312
|
+
{
|
313
|
+
"name": "serde",
|
314
|
+
"version": "1.0.193",
|
315
|
+
"description": "A generic serialization/deserialization framework",
|
316
|
+
"repository": "https://github.com/serde-rs/serde",
|
317
|
+
"downloads": 50000000,
|
318
|
+
"github_stars": 8500,
|
319
|
+
"category": "Serialization",
|
320
|
+
"use_case": "Data serialization and deserialization",
|
321
|
+
"feature_summary": "Compile-time serialization framework...",
|
322
|
+
"dependencies": [...],
|
323
|
+
"security_analysis": {...},
|
324
|
+
"source_metrics": {...}
|
325
|
+
}
|
326
|
+
```
|
327
|
+
|
328
|
+
## 🔍 Monitoring & Troubleshooting
|
329
|
+
|
330
|
+
### Common Issues & Solutions
|
331
|
+
|
332
|
+
#### GitHub Token Problems
|
333
|
+
|
334
|
+
```bash
|
335
|
+
# Check token status
|
336
|
+
python3 check_github_token.py
|
337
|
+
|
338
|
+
# Common error: Rate limit warnings
|
339
|
+
[WARNING] GitHub API rate limit low: 60 remaining
|
340
|
+
# Solution: Set GITHUB_TOKEN environment variable
|
341
|
+
|
342
|
+
# Common error: Invalid token
|
343
|
+
[ERROR] GitHub token is invalid or expired
|
344
|
+
# Solution: Generate new token at https://github.com/settings/tokens
|
345
|
+
```
|
346
|
+
|
347
|
+
#### LLM Validation Retries
|
348
|
+
|
349
|
+
```bash
|
350
|
+
# Common warning: Validation failures
|
351
|
+
[WARNING] Validation failed on attempt 1/3. Retrying...
|
352
|
+
# Solution: Use production mode to reduce retry warnings
|
353
|
+
PRODUCTION=true python3 -m rust_crate_pipeline
|
354
|
+
```
|
355
|
+
|
356
|
+
#### Resource Issues
|
357
|
+
|
358
|
+
```bash
|
359
|
+
# Memory usage optimization
|
360
|
+
python3 -m rust_crate_pipeline --batch-size 3
|
361
|
+
|
362
|
+
# Disk space monitoring
|
363
|
+
df -h . # Check available space
|
364
|
+
|
365
|
+
# Network timeout handling
|
366
|
+
python3 -m rust_crate_pipeline --log-level DEBUG
|
367
|
+
```
|
368
|
+
|
369
|
+
### Performance Monitoring
|
370
|
+
|
371
|
+
#### Processing Times (Typical)
|
372
|
+
|
373
|
+
- **Metadata only**: 2-3 seconds per crate
|
374
|
+
- **With AI enrichment**: 15-30 seconds per crate
|
375
|
+
- **Full analysis**: 45-60 seconds per crate
|
376
|
+
|
377
|
+
#### Resource Usage
|
378
|
+
|
379
|
+
- **Memory**: 2-4GB during processing
|
380
|
+
- **Storage**: 10-50MB per crate (temporary files)
|
381
|
+
- **Network**: 1-5MB per crate (API calls)
|
382
|
+
|
383
|
+
#### Monitoring Commands
|
384
|
+
|
385
|
+
```bash
|
386
|
+
# Check process status
|
387
|
+
ps aux | grep rust_crate_pipeline
|
388
|
+
|
389
|
+
# Monitor resource usage
|
390
|
+
top -p $(pgrep -f rust_crate_pipeline)
|
391
|
+
|
392
|
+
# Check logs
|
393
|
+
tail -f pipeline.log
|
394
|
+
|
395
|
+
# Docker monitoring
|
396
|
+
docker stats pipeline
|
397
|
+
```
|
398
|
+
|
399
|
+
## 🚀 Deployment Guide
|
400
|
+
|
401
|
+
### SSH/Remote Server Deployment
|
402
|
+
|
403
|
+
```bash
|
404
|
+
# Background execution with logging
|
405
|
+
nohup python3 run_production.py > pipeline.log 2>&1 &
|
406
|
+
|
407
|
+
# Monitor progress
|
408
|
+
tail -f pipeline.log
|
409
|
+
|
410
|
+
# Check process
|
411
|
+
jobs
|
412
|
+
ps aux | grep rust_crate_pipeline
|
413
|
+
```
|
414
|
+
|
415
|
+
### Systemd Service (Linux)
|
416
|
+
|
417
|
+
Create `/etc/systemd/system/rust-crate-pipeline.service`:
|
418
|
+
|
419
|
+
```ini
|
420
|
+
[Unit]
|
421
|
+
Description=Rust Crate Data Pipeline
|
422
|
+
After=network.target
|
423
|
+
|
424
|
+
[Service]
|
425
|
+
Type=simple
|
426
|
+
User=your-username
|
427
|
+
WorkingDirectory=/path/to/pipeline
|
428
|
+
Environment=GITHUB_TOKEN=your_token_here
|
429
|
+
Environment=PRODUCTION=true
|
430
|
+
ExecStart=/usr/bin/python3 run_production.py
|
431
|
+
Restart=on-failure
|
432
|
+
RestartSec=30
|
433
|
+
|
434
|
+
[Install]
|
435
|
+
WantedBy=multi-user.target
|
436
|
+
```
|
437
|
+
|
438
|
+
Enable and start:
|
439
|
+
|
440
|
+
```bash
|
441
|
+
sudo systemctl daemon-reload
|
442
|
+
sudo systemctl enable rust-crate-pipeline
|
443
|
+
sudo systemctl start rust-crate-pipeline
|
444
|
+
sudo systemctl status rust-crate-pipeline
|
445
|
+
```
|
446
|
+
|
447
|
+
## 🏗️ Architecture
|
448
|
+
|
449
|
+
### Core Components
|
450
|
+
|
451
|
+
1. **CrateDataPipeline**: Main orchestration class that coordinates all processing
|
452
|
+
2. **LLMEnricher**: Handles AI-powered enrichment using local LLM models
|
453
|
+
3. **CrateAPIClient**: Manages API interactions with crates.io and fallback sources
|
454
|
+
4. **GitHubBatchClient**: Optimized GitHub API client with rate limiting
|
455
|
+
5. **SourceAnalyzer**: Analyzes source code metrics and complexity
|
456
|
+
6. **SecurityAnalyzer**: Checks for security vulnerabilities and patterns
|
457
|
+
7. **UserBehaviorAnalyzer**: Tracks community engagement and version adoption
|
458
|
+
8. **DependencyAnalyzer**: Builds and analyzes dependency relationships
|
459
|
+
|
460
|
+
### Processing Flow
|
461
|
+
|
462
|
+
```
|
463
|
+
1. Crate Discovery → 2. Metadata Fetching → 3. AI Enrichment
|
464
|
+
↓ ↓ ↓
|
465
|
+
4. Source Analysis → 5. Security Scanning → 6. Community Analysis
|
466
|
+
↓ ↓ ↓
|
467
|
+
7. Dependency Mapping → 8. Data Aggregation → 9. Report Generation
|
468
|
+
```
|
469
|
+
|
470
|
+
### Project Structure
|
471
|
+
|
472
|
+
```
|
473
|
+
rust_crate_pipeline/
|
474
|
+
├── __init__.py # Package initialization
|
475
|
+
├── __main__.py # Entry point for python -m execution
|
476
|
+
├── main.py # CLI interface and main execution logic
|
477
|
+
├── config.py # Configuration classes and data models
|
478
|
+
├── pipeline.py # Main orchestration and workflow management
|
479
|
+
├── ai_processing.py # LLM integration and AI-powered enrichment
|
480
|
+
├── network.py # API clients and HTTP request handling
|
481
|
+
├── analysis.py # Source code, security, and dependency analysis
|
482
|
+
├── github_token_checker.py # Token validation and setup
|
483
|
+
├── production_config.py # Production optimizations
|
484
|
+
└── utils/ # Utility functions
|
485
|
+
├── logging_utils.py # Logging configuration and decorators
|
486
|
+
└── file_utils.py # File operations and disk management
|
487
|
+
```
|
488
|
+
|
489
|
+
## 🧪 API Usage
|
490
|
+
|
491
|
+
### Programmatic Usage
|
492
|
+
|
493
|
+
```python
|
494
|
+
from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
|
495
|
+
|
496
|
+
# Create custom configuration
|
497
|
+
config = PipelineConfig(
|
498
|
+
batch_size=5,
|
499
|
+
max_tokens=512,
|
500
|
+
model_path="/path/to/model.gguf"
|
501
|
+
)
|
502
|
+
|
503
|
+
# Initialize and run pipeline
|
504
|
+
pipeline = CrateDataPipeline(config)
|
505
|
+
pipeline.run()
|
506
|
+
|
507
|
+
# Or use individual components
|
508
|
+
from rust_crate_pipeline import LLMEnricher, SourceAnalyzer
|
509
|
+
|
510
|
+
enricher = LLMEnricher(config)
|
511
|
+
analyzer = SourceAnalyzer()
|
512
|
+
```
|
513
|
+
|
514
|
+
### Custom Processing
|
515
|
+
|
516
|
+
```python
|
517
|
+
# Process specific crates with custom options
|
518
|
+
pipeline = CrateDataPipeline(
|
519
|
+
config,
|
520
|
+
limit=50,
|
521
|
+
crate_list=["serde", "tokio", "actix-web"],
|
522
|
+
skip_ai=False,
|
523
|
+
output_dir="./custom_analysis"
|
524
|
+
)
|
525
|
+
```
|
526
|
+
|
527
|
+
## 🔧 Development & Contributing
|
528
|
+
|
529
|
+
### Development Setup
|
530
|
+
|
531
|
+
```bash
|
532
|
+
# Clone and install
|
533
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
534
|
+
cd SigilDERG-Data_Production
|
535
|
+
pip install -r requirements.txt
|
536
|
+
|
537
|
+
# Run tests
|
538
|
+
python3 test_optimizations.py
|
539
|
+
python3 test_token_integration.py
|
540
|
+
|
541
|
+
# Verify installation
|
542
|
+
python3 check_github_token.py
|
543
|
+
```
|
544
|
+
|
545
|
+
### Adding Features
|
546
|
+
|
547
|
+
1. Implement new analyzer in `analysis.py`
|
548
|
+
2. Add configuration options to `config.py`
|
549
|
+
3. Integrate with pipeline in `pipeline.py`
|
550
|
+
4. Add CLI arguments in `main.py`
|
551
|
+
5. Update tests and documentation
|
552
|
+
|
553
|
+
## 📄 License
|
554
|
+
|
555
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
556
|
+
|
557
|
+
## 🙏 Acknowledgments
|
558
|
+
|
559
|
+
- **Rust Community** for the excellent crates ecosystem
|
560
|
+
- **crates.io** for comprehensive API access
|
561
|
+
- **GitHub** for repository metadata and community data
|
562
|
+
- **Deepseek** for powerful code-focused language models
|
563
|
+
- **llama.cpp** team for efficient local inference
|
564
|
+
|
565
|
+
## 📞 Support
|
566
|
+
|
567
|
+
- **Issues**: Report bugs and request features
|
568
|
+
- **Documentation**: Complete guides and API reference
|
569
|
+
- **Community**: Join discussions and get help
|
570
|
+
|
571
|
+
---
|
572
|
+
|
573
|
+
**Ready to analyze the Rust ecosystem! 🦀✨**
|
@@ -1,17 +1,19 @@
|
|
1
1
|
rust_crate_pipeline/__init__.py,sha256=m9fb1WGbyOimxK2e18FSgvLWGYBwbLoHM_mscr-nAPs,1429
|
2
2
|
rust_crate_pipeline/__main__.py,sha256=fYgtPofuk4vkwiZ7ELP4GVMNj_QiKmZMSlvhzsNGuDs,155
|
3
|
-
rust_crate_pipeline/ai_processing.py,sha256=
|
3
|
+
rust_crate_pipeline/ai_processing.py,sha256=Ma5Oo4_pRfhoyvti_ZF6xV9zi4kEukMRzBva76F7cEM,18351
|
4
4
|
rust_crate_pipeline/analysis.py,sha256=ijP4zp3cFnN09nZkeCluyAvbyAtAW_M2YSxALpQX8LY,18615
|
5
5
|
rust_crate_pipeline/config.py,sha256=r4Y_5SD-lfrM1112edk9T0S0MiVxaNSSHk4q2yDrM88,1528
|
6
|
-
rust_crate_pipeline/
|
6
|
+
rust_crate_pipeline/github_token_checker.py,sha256=MJqHP8J84NEZ6nzdutpC7iRnsP0kyqscjLUosvmI4MI,3768
|
7
|
+
rust_crate_pipeline/main.py,sha256=J8ORQA6s3wyWw2R3oB_IEm2J5tx1CFdspw5kb5Ep8zQ,6323
|
7
8
|
rust_crate_pipeline/network.py,sha256=t_G8eh_WHNugm_laMftcWVbHsmP0bOlTPnVW9DqF6SU,13375
|
8
9
|
rust_crate_pipeline/pipeline.py,sha256=Uwfw4uLL3aN1gJl5xSwvvyaY9ceeP7LVr02IzNx0tPM,12033
|
9
|
-
rust_crate_pipeline/
|
10
|
+
rust_crate_pipeline/production_config.py,sha256=2GT8bxytcrMRrcfjzpay5RTtATE3rbmDvNUBvVhrYSQ,2472
|
11
|
+
rust_crate_pipeline/version.py,sha256=Ne-Iy0D2YOCWyWVo3gFNVhuUg4tBtSnlqGIDUEeWtws,1022
|
10
12
|
rust_crate_pipeline/utils/file_utils.py,sha256=lnHeLrt1JYaQhRDKtA1TWR2HIyRO8zwOyWb-KmAmWgk,2126
|
11
13
|
rust_crate_pipeline/utils/logging_utils.py,sha256=O4Jnr_k9dBchrVqXf-vqtDKgizDtL_ljh8g7G2VCX_c,2241
|
12
|
-
rust_crate_pipeline-1.
|
13
|
-
rust_crate_pipeline-1.
|
14
|
-
rust_crate_pipeline-1.
|
15
|
-
rust_crate_pipeline-1.
|
16
|
-
rust_crate_pipeline-1.
|
17
|
-
rust_crate_pipeline-1.
|
14
|
+
rust_crate_pipeline-1.2.0.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
|
15
|
+
rust_crate_pipeline-1.2.0.dist-info/METADATA,sha256=0iLlshmEVa7L-CNZp2RtrG2eTyGULwT_wx-GfbckhD4,16741
|
16
|
+
rust_crate_pipeline-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
17
|
+
rust_crate_pipeline-1.2.0.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
|
18
|
+
rust_crate_pipeline-1.2.0.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
|
19
|
+
rust_crate_pipeline-1.2.0.dist-info/RECORD,,
|
@@ -1,474 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: rust-crate-pipeline
|
3
|
-
Version: 1.1.1
|
4
|
-
Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
|
5
|
-
Home-page: https://github.com/DaveTmire85/SigilDERG-Data_Production
|
6
|
-
Author: SuperUser666-Sigil
|
7
|
-
Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
|
8
|
-
License-Expression: MIT
|
9
|
-
Project-URL: Homepage, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
10
|
-
Project-URL: Documentation, https://github.com/DaveTmire85/SigilDERG-Data_Production#readme
|
11
|
-
Project-URL: Repository, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
12
|
-
Project-URL: Bug Tracker, https://github.com/DaveTmire85/SigilDERG-Data_Production/issues
|
13
|
-
Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
|
14
|
-
Classifier: Development Status :: 4 - Beta
|
15
|
-
Classifier: Intended Audience :: Developers
|
16
|
-
Classifier: Operating System :: OS Independent
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
19
|
-
Classifier: Programming Language :: Python :: 3.9
|
20
|
-
Classifier: Programming Language :: Python :: 3.10
|
21
|
-
Classifier: Programming Language :: Python :: 3.11
|
22
|
-
Classifier: Programming Language :: Python :: 3.12
|
23
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
24
|
-
Classifier: Topic :: Software Development :: Build Tools
|
25
|
-
Classifier: Topic :: Software Development :: Quality Assurance
|
26
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
27
|
-
Requires-Python: >=3.8
|
28
|
-
Description-Content-Type: text/markdown
|
29
|
-
License-File: LICENSE
|
30
|
-
Requires-Dist: requests>=2.28.0
|
31
|
-
Requires-Dist: requests-cache>=1.0.0
|
32
|
-
Requires-Dist: beautifulsoup4>=4.11.0
|
33
|
-
Requires-Dist: tqdm>=4.64.0
|
34
|
-
Requires-Dist: llama-cpp-python>=0.2.0
|
35
|
-
Requires-Dist: tiktoken>=0.5.0
|
36
|
-
Requires-Dist: psutil>=5.9.0
|
37
|
-
Requires-Dist: python-dateutil>=2.8.0
|
38
|
-
Provides-Extra: dev
|
39
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
40
|
-
Requires-Dist: black>=22.0.0; extra == "dev"
|
41
|
-
Requires-Dist: isort>=5.10.0; extra == "dev"
|
42
|
-
Provides-Extra: advanced
|
43
|
-
Requires-Dist: radon>=6.0.0; extra == "advanced"
|
44
|
-
Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
|
45
|
-
Dynamic: author
|
46
|
-
Dynamic: home-page
|
47
|
-
Dynamic: license-file
|
48
|
-
Dynamic: requires-python
|
49
|
-
|
50
|
-
# Rust Crate Data Processing Pipeline
|
51
|
-
|
52
|
-
[Python 3.8+](https://www.python.org/downloads/)
|
53
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
54
|
-
|
55
|
-
A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights and dependency analysis.
|
56
|
-
|
57
|
-
## 🚀 Features
|
58
|
-
|
59
|
-
### 📊 **Comprehensive Data Collection**
|
60
|
-
- **Multi-source metadata fetching**: Pulls data from crates.io, GitHub, and lib.rs
|
61
|
-
- **Dependency analysis**: Complete dependency graphs and reverse dependency mapping
|
62
|
-
- **Code snippet extraction**: Automatically extracts Rust code examples from READMEs
|
63
|
-
- **Feature analysis**: Detailed breakdown of crate features and their dependencies
|
64
|
-
|
65
|
-
### 🤖 **AI-Powered Enrichment**
|
66
|
-
- **Use case classification**: Automatically categorizes crates (Web Framework, ML, Database, etc.)
|
67
|
-
- **Feature summarization**: AI-generated explanations of crate features
|
68
|
-
- **Factual/counterfactual pairs**: Generates training data for fact verification
|
69
|
-
- **Smart content truncation**: Intelligently preserves important README sections
|
70
|
-
|
71
|
-
### 🔍 **Advanced Analysis**
|
72
|
-
- **Source code metrics**: Lines of code, complexity analysis, API surface area
|
73
|
-
- **Security scanning**: Vulnerability checks and security pattern analysis
|
74
|
-
- **Community metrics**: GitHub activity, issue tracking, version adoption
|
75
|
-
- **Performance optimization**: Batch processing, caching, and retry logic
|
76
|
-
|
77
|
-
### ⚡ **Production-Ready Features**
|
78
|
-
- **Robust error handling**: Graceful degradation and comprehensive logging
|
79
|
-
- **Rate limiting**: Respects GitHub API limits with intelligent backoff
|
80
|
-
- **Checkpointing**: Automatic progress saving for long-running processes
|
81
|
-
- **Configurable processing**: Extensive CLI and config file options
|
82
|
-
|
83
|
-
## 📋 Prerequisites
|
84
|
-
|
85
|
-
### Required Dependencies
|
86
|
-
```bash
|
87
|
-
pip install requests requests-cache beautifulsoup4 tqdm llama-cpp-python tiktoken psutil
|
88
|
-
```
|
89
|
-
|
90
|
-
### Optional Dependencies
|
91
|
-
```bash
|
92
|
-
pip install radon rustworkx # For advanced code analysis
|
93
|
-
```
|
94
|
-
|
95
|
-
### System Requirements
|
96
|
-
- **Python 3.8+**
|
97
|
-
- **Local LLM Model**: Deepseek Coder or compatible GGUF model
|
98
|
-
- **GitHub Token**: For enhanced GitHub API access (optional but recommended)
|
99
|
-
- **Disk Space**: ~1GB free space for processing and caching
|
100
|
-
|
101
|
-
## 🛠️ Installation
|
102
|
-
|
103
|
-
### 1. Clone the Repository
|
104
|
-
```bash
|
105
|
-
git clone <repository-url>
|
106
|
-
cd enrichment-flow2
|
107
|
-
```
|
108
|
-
|
109
|
-
### 2. Install Dependencies
|
110
|
-
```bash
|
111
|
-
pip install -r requirements.txt
|
112
|
-
```
|
113
|
-
|
114
|
-
### 3. Download LLM Model
|
115
|
-
```bash
|
116
|
-
# Example: Download Deepseek Coder model
|
117
|
-
mkdir -p ~/models/deepseek/
|
118
|
-
wget https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GGUF/resolve/main/deepseek-coder-6.7b-instruct.Q4_K_M.gguf \
|
119
|
-
-O ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
|
120
|
-
```
|
121
|
-
|
122
|
-
### 4. Set Environment Variables (Optional)
|
123
|
-
```bash
|
124
|
-
export GITHUB_TOKEN="your_github_token_here"
|
125
|
-
```
|
126
|
-
|
127
|
-
## 🚀 Quick Start
|
128
|
-
|
129
|
-
### Installation
|
130
|
-
|
131
|
-
#### From PyPI (Recommended)
|
132
|
-
```bash
|
133
|
-
pip install rust-crate-pipeline
|
134
|
-
```
|
135
|
-
|
136
|
-
#### From Source
|
137
|
-
```bash
|
138
|
-
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
139
|
-
cd SigilDERG-Data_Production
|
140
|
-
pip install -e .
|
141
|
-
```
|
142
|
-
|
143
|
-
#### Development Installation
|
144
|
-
```bash
|
145
|
-
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
146
|
-
cd SigilDERG-Data_Production
|
147
|
-
pip install -e ".[dev]"
|
148
|
-
```
|
149
|
-
|
150
|
-
### Basic Usage
|
151
|
-
```bash
|
152
|
-
# Run with default settings
|
153
|
-
python -m rust_crate_pipeline
|
154
|
-
|
155
|
-
# Process only 20 crates for testing
|
156
|
-
python -m rust_crate_pipeline --limit 20
|
157
|
-
|
158
|
-
# Skip AI processing for faster metadata-only collection
|
159
|
-
python -m rust_crate_pipeline --skip-ai --limit 50
|
160
|
-
```
|
161
|
-
|
162
|
-
### Advanced Usage
|
163
|
-
```bash
|
164
|
-
# Custom configuration
|
165
|
-
python -m rust_crate_pipeline \
|
166
|
-
--limit 100 \
|
167
|
-
--batch-size 5 \
|
168
|
-
--workers 2 \
|
169
|
-
--log-level DEBUG \
|
170
|
-
--output-dir ./results
|
171
|
-
|
172
|
-
# Process specific crates
|
173
|
-
python -m rust_crate_pipeline \
|
174
|
-
--crate-list serde tokio actix-web reqwest \
|
175
|
-
--output-dir ./specific_crates
|
176
|
-
|
177
|
-
# Use custom model and config
|
178
|
-
python -m rust_crate_pipeline \
|
179
|
-
--model-path ./my-model.gguf \
|
180
|
-
--config-file ./custom_config.json
|
181
|
-
```
|
182
|
-
|
183
|
-
## 📁 Project Structure
|
184
|
-
|
185
|
-
```
|
186
|
-
enrichment-flow2/
|
187
|
-
├── __init__.py # Package initialization and public API
|
188
|
-
├── __main__.py # Entry point for python -m execution
|
189
|
-
├── main.py # CLI interface and main execution logic
|
190
|
-
├── config.py # Configuration classes and data models
|
191
|
-
├── pipeline.py # Main orchestration and workflow management
|
192
|
-
├── ai_processing.py # LLM integration and AI-powered enrichment
|
193
|
-
├── network.py # API clients and HTTP request handling
|
194
|
-
├── analysis.py # Source code, security, and dependency analysis
|
195
|
-
└── utils/ # Utility functions
|
196
|
-
    ├── logging_utils.py # Logging configuration and decorators
|
197
|
-
    └── file_utils.py # File operations and disk management
|
198
|
-
```
|
199
|
-
|
200
|
-
## ⚙️ Configuration
|
201
|
-
|
202
|
-
### Command Line Arguments
|
203
|
-
|
204
|
-
| Argument | Type | Default | Description |
|
205
|
-
|----------|------|---------|-------------|
|
206
|
-
| `--limit` | int | None | Limit number of crates to process |
|
207
|
-
| `--batch-size` | int | 10 | Crates processed per batch |
|
208
|
-
| `--workers` | int | 4 | Parallel workers for API requests |
|
209
|
-
| `--output-dir` | str | auto | Custom output directory |
|
210
|
-
| `--model-path` | str | default | Path to LLM model file |
|
211
|
-
| `--max-tokens` | int | 256 | Maximum tokens for LLM generation |
|
212
|
-
| `--checkpoint-interval` | int | 10 | Save progress every N crates |
|
213
|
-
| `--log-level` | str | INFO | Logging verbosity |
|
214
|
-
| `--skip-ai` | flag | False | Skip AI enrichment |
|
215
|
-
| `--skip-source-analysis` | flag | False | Skip source code analysis |
|
216
|
-
| `--crate-list` | list | None | Specific crates to process |
|
217
|
-
| `--config-file` | str | None | JSON configuration file |
|
218
|
-
|
219
|
-
### Configuration File Example
|
220
|
-
```json
|
221
|
-
{
|
222
|
-
"model_path": "/path/to/your/model.gguf",
|
223
|
-
"batch_size": 5,
|
224
|
-
"n_workers": 2,
|
225
|
-
"max_tokens": 512,
|
226
|
-
"checkpoint_interval": 5,
|
227
|
-
"github_token": "ghp_your_token_here",
|
228
|
-
"cache_ttl": 7200
|
229
|
-
}
|
230
|
-
```
|
231
|
-
|
232
|
-
## 📊 Output Format
|
233
|
-
|
234
|
-
The pipeline generates several output files:
|
235
|
-
|
236
|
-
### 1. **Enriched Metadata** (`enriched_crate_metadata_TIMESTAMP.jsonl`)
|
237
|
-
```json
|
238
|
-
{
|
239
|
-
"name": "serde",
|
240
|
-
"version": "1.0.193",
|
241
|
-
"description": "A generic serialization/deserialization framework",
|
242
|
-
"use_case": "Serialization",
|
243
|
-
"score": 8542.3,
|
244
|
-
"feature_summary": "Provides derive macros for automatic serialization...",
|
245
|
-
"factual_counterfactual": "✅ Factual: Serde supports JSON serialization...",
|
246
|
-
"source_analysis": {
|
247
|
-
"file_count": 45,
|
248
|
-
"loc": 12500,
|
249
|
-
"functions": ["serialize", "deserialize", ...],
|
250
|
-
"has_tests": true
|
251
|
-
}
|
252
|
-
}
|
253
|
-
```
|
254
|
-
|
255
|
-
### 2. **Dependency Analysis** (`dependency_analysis_TIMESTAMP.json`)
|
256
|
-
```json
|
257
|
-
{
|
258
|
-
"dependency_graph": {
|
259
|
-
"actix-web": ["tokio", "serde", "futures"],
|
260
|
-
"tokio": ["mio", "parking_lot"]
|
261
|
-
},
|
262
|
-
"reverse_dependencies": {
|
263
|
-
"serde": ["actix-web", "reqwest", "clap"],
|
264
|
-
"tokio": ["actix-web", "reqwest"]
|
265
|
-
},
|
266
|
-
"most_depended": [
|
267
|
-
["serde", 156],
|
268
|
-
["tokio", 98]
|
269
|
-
]
|
270
|
-
}
|
271
|
-
```
|
272
|
-
|
273
|
-
### 3. **Summary Report** (`summary_report_TIMESTAMP.json`)
|
274
|
-
```json
|
275
|
-
{
|
276
|
-
"total_crates": 150,
|
277
|
-
"total_time": "1247.32s",
|
278
|
-
"timestamp": "2025-06-18T10:30:00",
|
279
|
-
"most_popular": [
|
280
|
-
{"name": "serde", "score": 8542.3},
|
281
|
-
{"name": "tokio", "score": 7234.1}
|
282
|
-
]
|
283
|
-
}
|
284
|
-
```
|
285
|
-
|
286
|
-
## 🔧 Advanced Features
|
287
|
-
|
288
|
-
### Custom Crate Lists
|
289
|
-
Process specific crates by providing a custom list:
|
290
|
-
```bash
|
291
|
-
python -m rust_crate_pipeline --crate-list \
|
292
|
-
serde tokio actix-web reqwest clap \
|
293
|
-
--output-dir ./web_framework_analysis
|
294
|
-
```
|
295
|
-
|
296
|
-
### Performance Tuning
|
297
|
-
Optimize for your system:
|
298
|
-
```bash
|
299
|
-
# High-performance setup (good internet, powerful machine)
|
300
|
-
python -m rust_crate_pipeline --batch-size 20 --workers 8
|
301
|
-
|
302
|
-
# Conservative setup (limited resources)
|
303
|
-
python -m rust_crate_pipeline --batch-size 3 --workers 1
|
304
|
-
```
|
305
|
-
|
306
|
-
### Development Mode
|
307
|
-
Quick testing with minimal processing:
|
308
|
-
```bash
|
309
|
-
python -m rust_crate_pipeline \
|
310
|
-
--limit 5 \
|
311
|
-
--skip-ai \
|
312
|
-
--skip-source-analysis \
|
313
|
-
--log-level DEBUG
|
314
|
-
```
|
315
|
-
|
316
|
-
## 🏗️ Architecture
|
317
|
-
|
318
|
-
### Core Components
|
319
|
-
|
320
|
-
1. **CrateDataPipeline**: Main orchestration class that coordinates all processing
|
321
|
-
2. **LLMEnricher**: Handles AI-powered enrichment using local LLM models
|
322
|
-
3. **CrateAPIClient**: Manages API interactions with crates.io and fallback sources
|
323
|
-
4. **GitHubBatchClient**: Optimized GitHub API client with rate limiting
|
324
|
-
5. **SourceAnalyzer**: Analyzes source code metrics and complexity
|
325
|
-
6. **SecurityAnalyzer**: Checks for security vulnerabilities and patterns
|
326
|
-
7. **UserBehaviorAnalyzer**: Tracks community engagement and version adoption
|
327
|
-
8. **DependencyAnalyzer**: Builds and analyzes dependency relationships
|
328
|
-
|
329
|
-
### Processing Flow
|
330
|
-
|
331
|
-
```
|
332
|
-
1. Crate Discovery → 2. Metadata Fetching → 3. AI Enrichment
|
333
|
-
↓ ↓ ↓
|
334
|
-
4. Source Analysis → 5. Security Scanning → 6. Community Analysis
|
335
|
-
↓ ↓ ↓
|
336
|
-
7. Dependency Mapping → 8. Data Aggregation → 9. Report Generation
|
337
|
-
```
|
338
|
-
|
339
|
-
## 🧪 API Usage
|
340
|
-
|
341
|
-
### Programmatic Usage
|
342
|
-
```python
|
343
|
-
from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
|
344
|
-
|
345
|
-
# Create custom configuration
|
346
|
-
config = PipelineConfig(
|
347
|
-
batch_size=5,
|
348
|
-
max_tokens=512,
|
349
|
-
model_path="/path/to/model.gguf"
|
350
|
-
)
|
351
|
-
|
352
|
-
# Initialize and run pipeline
|
353
|
-
pipeline = CrateDataPipeline(config)
|
354
|
-
pipeline.run()
|
355
|
-
|
356
|
-
# Or use individual components
|
357
|
-
from rust_crate_pipeline import LLMEnricher, SourceAnalyzer
|
358
|
-
|
359
|
-
enricher = LLMEnricher(config)
|
360
|
-
analyzer = SourceAnalyzer()
|
361
|
-
```
|
362
|
-
|
363
|
-
### Custom Processing
|
364
|
-
```python
|
365
|
-
# Process specific crates with custom options
|
366
|
-
pipeline = CrateDataPipeline(
|
367
|
-
config,
|
368
|
-
limit=50,
|
369
|
-
crate_list=["serde", "tokio", "actix-web"],
|
370
|
-
skip_ai=False,
|
371
|
-
output_dir="./custom_analysis"
|
372
|
-
)
|
373
|
-
```
|
374
|
-
|
375
|
-
## 🐛 Troubleshooting
|
376
|
-
|
377
|
-
### Common Issues
|
378
|
-
|
379
|
-
**🔴 Model Loading Errors**
|
380
|
-
```bash
|
381
|
-
# Verify model path
|
382
|
-
ls -la ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
|
383
|
-
|
384
|
-
# Check model format compatibility
|
385
|
-
python -c "from llama_cpp import Llama; print('Model loading OK')"
|
386
|
-
```
|
387
|
-
|
388
|
-
**🔴 API Rate Limiting**
|
389
|
-
```bash
|
390
|
-
# Set GitHub token for higher rate limits
|
391
|
-
export GITHUB_TOKEN="your_token_here"
|
392
|
-
|
393
|
-
# Reduce batch size and workers
|
394
|
-
python -m rust_crate_pipeline --batch-size 3 --workers 1
|
395
|
-
```
|
396
|
-
|
397
|
-
**🔴 Memory Issues**
|
398
|
-
```bash
|
399
|
-
# Reduce token limits and batch size
|
400
|
-
python -m rust_crate_pipeline --max-tokens 128 --batch-size 2
|
401
|
-
```
|
402
|
-
|
403
|
-
**🔴 Network Timeouts**
|
404
|
-
```bash
|
405
|
-
# Enable debug logging to identify issues
|
406
|
-
python -m rust_crate_pipeline --log-level DEBUG --limit 10
|
407
|
-
```
|
408
|
-
|
409
|
-
### Performance Optimization
|
410
|
-
|
411
|
-
1. **Use SSD storage** for faster caching and temporary file operations
|
412
|
-
2. **Increase RAM** if processing large batches (recommended: 8GB+)
|
413
|
-
3. **Set GITHUB_TOKEN** for 5000 req/hour instead of 60 req/hour
|
414
|
-
4. **Use appropriate batch sizes** based on your internet connection
|
415
|
-
5. **Monitor disk space** - processing can generate several GB of data
|
416
|
-
|
417
|
-
## 📈 Performance Metrics
|
418
|
-
|
419
|
-
### Typical Processing Times
|
420
|
-
- **Metadata only**: ~2-3 seconds per crate
|
421
|
-
- **With AI enrichment**: ~15-30 seconds per crate
|
422
|
-
- **Full analysis**: ~45-60 seconds per crate
|
423
|
-
|
424
|
-
### Resource Usage
|
425
|
-
- **Memory**: 2-4GB during processing
|
426
|
-
- **Disk**: 10-50MB per crate (temporary files)
|
427
|
-
- **Network**: ~1-5MB per crate (API calls)
|
428
|
-
|
429
|
-
## 🤝 Contributing
|
430
|
-
|
431
|
-
### Development Setup
|
432
|
-
```bash
|
433
|
-
# Clone repository
|
434
|
-
git clone <repository-url>
|
435
|
-
cd enrichment-flow2
|
436
|
-
|
437
|
-
# Install development dependencies
|
438
|
-
pip install -r requirements-dev.txt
|
439
|
-
|
440
|
-
# Run tests
|
441
|
-
python -m pytest tests/
|
442
|
-
|
443
|
-
# Format code
|
444
|
-
black . && isort .
|
445
|
-
```
|
446
|
-
|
447
|
-
### Adding New Analysis Features
|
448
|
-
1. Implement new analyzer in `analysis.py`
|
449
|
-
2. Add configuration options to `config.py`
|
450
|
-
3. Integrate with pipeline in `pipeline.py`
|
451
|
-
4. Add CLI arguments in `main.py`
|
452
|
-
5. Update documentation
|
453
|
-
|
454
|
-
## 📄 License
|
455
|
-
|
456
|
-
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
457
|
-
|
458
|
-
## 🙏 Acknowledgments
|
459
|
-
|
460
|
-
- **Rust Community** for the excellent crates ecosystem
|
461
|
-
- **crates.io** for providing comprehensive API access
|
462
|
-
- **GitHub** for repository metadata and community data
|
463
|
-
- **Deepseek** for the powerful code-focused language model
|
464
|
-
- **llama.cpp** team for efficient local inference capabilities
|
465
|
-
|
466
|
-
## 📞 Support
|
467
|
-
|
468
|
-
- **Issues**: [GitHub Issues](https://github.com/DaveTmire85/SigilDERG-Data_Production/issues)
|
469
|
-
- **Discussions**: [GitHub Discussions](https://github.com/your-repo/discussions)
|
470
|
-
- **Documentation**: [Wiki](https://github.com/your-repo/wiki)
|
471
|
-
|
472
|
-
---
|
473
|
-
|
474
|
-
**Happy crate analyzing! 🦀✨**
|
File without changes
|
{rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/entry_points.txt
RENAMED
File without changes
|
{rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.0.dist-info}/licenses/LICENSE
RENAMED
File without changes
|
File without changes
|