rust-crate-pipeline 1.1.1-py3-none-any.whl → 1.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/ai_processing.py +27 -30
- rust_crate_pipeline/github_token_checker.py +102 -0
- rust_crate_pipeline/main.py +20 -0
- rust_crate_pipeline/production_config.py +76 -0
- rust_crate_pipeline/version.py +9 -1
- rust_crate_pipeline-1.2.1.dist-info/METADATA +573 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.1.dist-info}/RECORD +11 -9
- rust_crate_pipeline-1.1.1.dist-info/METADATA +0 -474
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.1.1.dist-info → rust_crate_pipeline-1.2.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/ai_processing.py
CHANGED
@@ -166,35 +166,33 @@ class LLMEnricher:
         prompt: str,
         validation_func: Callable[[str], bool],
         temp: float = 0.2,
-        max_tokens: int = 256,
-        retries: int = 3
+        max_tokens: int = 256, retries: int = 4  # Increased from 2 to 4 for better success rates
     ) -> Optional[str]:
         """Run LLM with validation and automatic retry on failure"""
         for attempt in range(retries):
-            try:
-
-                adjusted_temp = temp * (1 + (attempt * 0.1))
+            try:  # More generous temperature adjustment for better variety
+                adjusted_temp = temp * (1 + (attempt * 0.2))  # 20% increases instead of 10%
                 result = self.run_llama(prompt, temp=adjusted_temp, max_tokens=max_tokens)

                 # Validate the result
                 if result and validation_func(result):
                     return result

-                # If we get here, validation failed
-
-
-
-
+                if attempt == retries - 1:  # If we get here, validation failed - use debug level for early attempts
+                    logging.debug(f"All {retries} validation attempts failed, using last available result.")
+                else:
+                    logging.debug(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with adjusted temp={adjusted_temp:.2f}")
+                # Only simplify prompt on later attempts (attempt 2+)
+                if attempt >= 2:
                     prompt = self.simplify_prompt(prompt)

             except Exception as e:
                 logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")
-
-                #
-                time.sleep(1.5 * (2 ** attempt))
+                # More generous backoff - give the model more time
+                time.sleep(2.0 + (attempt * 1.0))  # 2s, 3s, 4s, 5s delays

-        # If we
-        return None
+        # If we exhausted all retries, return the last result even if not perfect
+        return result if 'result' in locals() else None

     def simplify_prompt(self, prompt: str) -> str:
         """Simplify a prompt by removing examples and reducing context"""
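The retry changes are easier to read in isolation. Below is a minimal standalone sketch of the 1.2.1 behavior, not the package's actual method: run_model and validate are hypothetical stand-ins for self.run_llama and the validation_func callback.

import logging
import time
from typing import Callable, Optional

def validate_and_retry_sketch(
    run_model: Callable[[str, float], Optional[str]],
    validate: Callable[[str], bool],
    prompt: str,
    temp: float = 0.2,
    retries: int = 4,
) -> Optional[str]:
    """Standalone sketch of the 1.2.1 retry loop: rising temperature, linear backoff."""
    result: Optional[str] = None
    for attempt in range(retries):
        try:
            # Temperature grows 20% per attempt: 0.20, 0.24, 0.28, 0.32
            adjusted_temp = temp * (1 + attempt * 0.2)
            result = run_model(prompt, adjusted_temp)
            if result and validate(result):
                return result
            logging.debug("Validation failed on attempt %d/%d", attempt + 1, retries)
        except Exception as exc:
            logging.error("Generation error on attempt %d: %s", attempt + 1, exc)
            # Linear backoff replaces the old exponential schedule: 2s, 3s, 4s, 5s
            time.sleep(2.0 + attempt * 1.0)
    # After exhausting retries, fall back to the last (possibly invalid) result
    return result

# Toy usage: a fake model that only produces a valid answer on its third call
calls = {"n": 0}
def fake_model(prompt: str, temp: float) -> str:
    calls["n"] += 1
    return "valid answer" if calls["n"] >= 3 else "garbage"

print(validate_and_retry_sketch(fake_model, lambda s: s == "valid answer", "classify this"))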
@@ -245,9 +243,10 @@ class LLMEnricher:
             temp=0.3,
             max_tokens=300
         )
-
-
-
+        # Extract key dependencies for context
+        key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
+                    if dep.get("kind") == "normal" and dep.get("crate_id")]
+        key_deps_str = ", ".join(str(dep) for dep in key_deps) if key_deps else "None"

         # Generate other enrichments
         enriched.feature_summary = self.summarize_features(crate)
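For reference, this is what the new dependency-extraction lines yield on sample input; the dependency dicts below are invented examples of the shape the code expects ("crate_id" and "kind" keys), not data from the pipeline.

# Hypothetical sample dependencies in the shape the list comprehension expects
dependencies = [
    {"crate_id": "serde", "kind": "normal"},
    {"crate_id": "tokio", "kind": "normal"},
    {"crate_id": "criterion", "kind": "dev"},   # filtered out: kind is not "normal"
    {"crate_id": None, "kind": "normal"},       # filtered out: no crate_id
]

key_deps = [dep.get("crate_id") for dep in dependencies[:5]
            if dep.get("kind") == "normal" and dep.get("crate_id")]
key_deps_str = ", ".join(str(dep) for dep in key_deps) if key_deps else "None"

print(key_deps)      # ['serde', 'tokio']
print(key_deps_str)  # serde, tokio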
@@ -294,13 +293,13 @@ class LLMEnricher:

     def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
         """Classify the use case of a crate with rich context"""
-        try:
-            # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
+        try:  # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
             available_prompt_tokens = self.config.model_token_limit - 200  # Reserve for response

             joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
-            key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
-
+            key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
+                        if dep.get("kind") == "normal" and dep.get("crate_id")]
+            key_deps_str = ", ".join(str(dep) for dep in key_deps) if key_deps else "None"

             # Adaptively truncate different sections based on importance
             token_budget = available_prompt_tokens - 400  # Reserve tokens for prompt template
@@ -339,13 +338,12 @@ class LLMEnricher:
             f"Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
             f"<|end|>"
         )
-
-        # Validate classification with retry
+        # Validate classification with retry - more generous parameters
         result = self.validate_and_retry(
             prompt,
             validation_func=self.validate_classification,
-            temp=0.1,
-            max_tokens=20
+            temp=0.2,  # Increased from 0.1 for more variety
+            max_tokens=50  # Increased from 20 to allow more complete responses
        )

         return result or "Unknown"
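validate_classification itself is not part of this diff; as a hypothetical illustration of what a validator for the category list above might check (not the package's actual implementation):

# Hypothetical validator for the category list used in the classification prompt
ALLOWED_CATEGORIES = {
    "ai", "database", "web framework", "networking", "serialization",
    "utilities", "devtools", "ml", "cryptography", "unknown",
}

def validate_classification_sketch(response: str) -> bool:
    """Accept the response only if it reduces to exactly one allowed category."""
    cleaned = response.strip().removeprefix("Category:").strip().strip(".").lower()
    return cleaned in ALLOWED_CATEGORIES

print(validate_classification_sketch("Networking"))               # True
print(validate_classification_sketch("Category: Serialization"))  # True
print(validate_classification_sketch("Networking or Utilities"))  # False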
@@ -375,13 +373,12 @@ class LLMEnricher:
             f"Create exactly 5 pairs.\n"
             f"<|end|>"
         )
-
-        # Use validation for retry
+        # Use validation for retry - more generous parameters
         result = self.validate_and_retry(
             prompt,
             validation_func=self.validate_factual_pairs,
-            temp=0.6,
-            max_tokens=500
+            temp=0.7,  # Increased from 0.6 for more creativity
+            max_tokens=800  # Increased from 500 for more complete responses
         )

         return result or "Factual pairs generation failed."
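Likewise, validate_factual_pairs is not shown in this diff; a hypothetical validator might simply count the requested pairs. The "Fact:"/"Counterfact:" markers below are assumptions, since the real prompt format is not visible here.

def validate_factual_pairs_sketch(response: str) -> bool:
    """Hypothetical check that a response contains exactly five fact/counterfact pairs."""
    lines = [line.strip().lower() for line in response.splitlines()]
    facts = sum(1 for line in lines if line.startswith("fact:"))
    counters = sum(1 for line in lines if line.startswith("counterfact:"))
    return facts == 5 and counters == 5

sample = "\n".join(f"Fact: statement {i}\nCounterfact: negation {i}" for i in range(1, 6))
print(validate_factual_pairs_sketch(sample))  # True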
rust_crate_pipeline/github_token_checker.py
ADDED
@@ -0,0 +1,102 @@
+# github_token_checker.py
+"""
+GitHub Token Checker Module
+Lightweight version of the token checker for integration into the main pipeline.
+"""
+
+import os
+import sys
+import requests
+import logging
+
+def check_github_token_quick():
+    """Quick check if GitHub token is available and valid"""
+    token = os.getenv("GITHUB_TOKEN")
+
+    if not token:
+        return False, "GITHUB_TOKEN environment variable not set"
+
+    if len(token) < 20:
+        return False, "GITHUB_TOKEN seems too short - may be invalid"
+
+    try:
+        # Quick API check
+        headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "Authorization": f"token {token}"
+        }
+
+        response = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
+
+        if response.status_code == 200:
+            data = response.json()
+            remaining = data["resources"]["core"]["remaining"]
+            return True, f"Token valid, {remaining} API calls remaining"
+        elif response.status_code == 401:
+            return False, "GitHub token is invalid or expired"
+        else:
+            return False, f"GitHub API returned status code: {response.status_code}"
+
+    except requests.exceptions.RequestException as e:
+        return False, f"Network error checking token: {str(e)}"
+    except Exception as e:
+        return False, f"Error checking token: {str(e)}"
+
+def prompt_for_token_setup():
+    """Prompt user to set up GitHub token"""
+    print("\n" + "="*60)
+    print("🔑 GitHub Token Required")
+    print("="*60)
+    print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
+    print("to access repository information and avoid rate limits.")
+    print("\n📋 Quick Setup:")
+    print("1. Get token: https://github.com/settings/tokens")
+    print("2. Required scopes: public_repo, read:user")
+    print("3. Set in environment:")
+    print("   export GITHUB_TOKEN=\"your_token_here\"")
+    print("\n🔧 Setup Scripts Available:")
+    print("   ./setup_github_token.sh  (Interactive setup)")
+    print("   python3 check_github_token.py  (Full verification)")
+    print("\n" + "="*60)
+
+    # Ask if user wants to continue without token (limited functionality)
+    response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
+
+    if response in ['y', 'yes']:
+        print("⚠️  Running with limited GitHub API access (60 requests/hour)")
+        print("   You may encounter rate limit warnings.")
+        return True
+    else:
+        print("\n🛑 Please set up your GitHub token and try again.")
+        return False
+
+def check_and_setup_github_token():
+    """
+    Check GitHub token and prompt for setup if missing.
+    Returns True if should continue, False if should exit.
+    """
+    is_valid, message = check_github_token_quick()
+
+    if is_valid:
+        logging.debug(f"GitHub token check: {message}")
+        return True
+
+    # Token is missing or invalid
+    logging.warning(f"GitHub token issue: {message}")
+
+    # Check if we're in a non-interactive environment
+    if not sys.stdin.isatty():
+        logging.error("GitHub token not configured and running in non-interactive mode")
+        logging.error("Set GITHUB_TOKEN environment variable before running")
+        return False
+
+    # Interactive prompt
+    return prompt_for_token_setup()
+
+if __name__ == "__main__":
+    # Allow running this module directly for testing
+    is_valid, message = check_github_token_quick()
+    print(f"Token check: {'✅' if is_valid else '❌'} {message}")
+
+    if not is_valid:
+        check_and_setup_github_token()
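The new module can also be exercised on its own. The function names below come directly from the file above; only the surrounding example script is illustrative.

# Illustrative usage of the new github_token_checker module
from rust_crate_pipeline.github_token_checker import (
    check_github_token_quick,
    check_and_setup_github_token,
)

# Non-interactive check, e.g. in CI: just report the result
is_valid, message = check_github_token_quick()
print(f"GitHub token status: {message}")

# Interactive check, e.g. at pipeline startup: may prompt the user
if not check_and_setup_github_token():
    raise SystemExit("GitHub token not configured; aborting.")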
rust_crate_pipeline/main.py
CHANGED
@@ -8,6 +8,8 @@ import argparse
 from typing import Optional
 from .config import PipelineConfig
 from .pipeline import CrateDataPipeline
+from .production_config import setup_production_environment
+from .github_token_checker import check_and_setup_github_token

 def parse_arguments():
     """Parse command line arguments"""
@@ -21,6 +23,7 @@ Examples:
   python -m rust_crate_pipeline --batch-size 5      # Smaller batches
   python -m rust_crate_pipeline --output-dir ./data  # Custom output directory
   python -m rust_crate_pipeline --log-level DEBUG    # Verbose logging
+  PRODUCTION=true python -m rust_crate_pipeline      # Production mode (quieter)
        """
    )

@@ -123,14 +126,31 @@ def check_disk_space():
         logging.warning("Low disk space! This may affect performance.")

 def main():
+    # Setup production environment first for optimal logging
+    prod_config = setup_production_environment()
+
     args = parse_arguments()
     configure_logging(args.log_level)
     check_disk_space()

+    # Check GitHub token before proceeding
+    if not check_and_setup_github_token():
+        logging.error("GitHub token setup cancelled or failed. Exiting.")
+        sys.exit(1)
+
     try:
         # Create config from command line arguments
         config_kwargs = {}

+        # Apply production optimizations if available
+        if prod_config:
+            config_kwargs.update({
+                'max_retries': prod_config.get('max_retries', 3),
+                'batch_size': prod_config.get('batch_size', 10),
+                'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
+                'cache_ttl': prod_config.get('cache_ttl', 3600),
+            })
+
         if args.batch_size:
             config_kwargs['batch_size'] = args.batch_size
         if args.workers:
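Note the precedence in main(): production defaults are written into config_kwargs first, and explicit CLI flags then overwrite them. A small sketch of that ordering, with a made-up prod_config dict standing in for the value returned by setup_production_environment():

# Sketch of the precedence in main(): production defaults first, CLI flags win
prod_config = {"batch_size": 10, "max_retries": 2}   # as returned in PRODUCTION mode
cli_batch_size = 5                                   # e.g. --batch-size 5

config_kwargs = {}
if prod_config:
    config_kwargs.update({
        "max_retries": prod_config.get("max_retries", 3),
        "batch_size": prod_config.get("batch_size", 10),
    })
if cli_batch_size:
    config_kwargs["batch_size"] = cli_batch_size     # CLI flag overrides production default

print(config_kwargs)  # {'max_retries': 2, 'batch_size': 5}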
rust_crate_pipeline/production_config.py
ADDED
@@ -0,0 +1,76 @@
+# production_config.py
+"""
+Production configuration to reduce runtime warnings and optimize performance.
+This file contains settings that can be imported to minimize verbose logging
+and improve the user experience in production environments.
+"""
+
+import logging
+import os
+
+# Production logging configuration
+def configure_production_logging():
+    """Configure logging for production to reduce verbose warnings"""
+
+    # Set up logging format
+    logging.basicConfig(
+        level=logging.INFO,  # Default to INFO level
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+
+    # Set specific loggers to less verbose levels
+    logging.getLogger('requests').setLevel(logging.WARNING)
+    logging.getLogger('urllib3').setLevel(logging.WARNING)
+    logging.getLogger('requests_cache').setLevel(logging.WARNING)
+
+    # If PRODUCTION environment variable is set, be even quieter
+    if os.getenv('PRODUCTION', 'false').lower() == 'true':
+        logging.getLogger().setLevel(logging.WARNING)
+        logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
+
+# Production-optimized settings
+PRODUCTION_SETTINGS = {
+    # Reduced retries to minimize warnings
+    'max_retries': 2,
+    'validation_retries': 2,
+
+    # GitHub API management
+    'github_rate_limit_threshold': 100,
+    'github_critical_threshold': 50,
+
+    # LLM settings
+    'llm_timeout': 30,
+    'llm_max_attempts': 2,
+
+    # Logging preferences
+    'quiet_mode': True,
+    'log_level': 'INFO',
+
+    # Performance settings
+    'batch_size': 10,
+    'checkpoint_interval': 10,
+    'cache_ttl': 3600,
+}
+
+def get_production_config():
+    """Get production configuration dictionary"""
+    return PRODUCTION_SETTINGS.copy()
+
+def is_production():
+    """Check if running in production mode"""
+    return os.getenv('PRODUCTION', 'false').lower() == 'true'
+
+def setup_production_environment():
+    """Set up the complete production environment"""
+    configure_production_logging()
+
+    # Set environment variables for quieter operation
+    os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
+
+    if is_production():
+        print("🚀 Production mode enabled - optimized for minimal warnings")
+        return get_production_config()
+    else:
+        print("🔧 Development mode - full logging enabled")
+        return {}
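The helpers can be reused outside main() as well; an illustrative example (the imports match the file above, the surrounding script is not part of the package):

import os

from rust_crate_pipeline.production_config import (
    setup_production_environment,
    is_production,
    get_production_config,
)

# Opt in to production behaviour before configuring anything else
os.environ["PRODUCTION"] = "true"

settings = setup_production_environment()  # configures logging, returns PRODUCTION_SETTINGS
assert is_production()
print(settings["max_retries"], settings["batch_size"])  # 2 10
print(get_production_config()["llm_timeout"])           # 30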
rust_crate_pipeline/version.py
CHANGED
@@ -1,9 +1,17 @@
 """Version information for rust-crate-pipeline."""

-__version__ = "1.1.1"
+__version__ = "1.2.1"
 __version_info__ = tuple(int(x) for x in __version__.split("."))

 # Version history
+# 1.2.0 - Major release: Production-ready, cleaned codebase
+#         - Unified documentation into single comprehensive README
+#         - Removed all non-essential development and test files
+#         - Optimized for PyPI distribution and Docker deployment
+#         - Enhanced GitHub token integration and setup
+# 1.1.2 - Production release: Cleaned up non-essential files
+#         - Unified documentation into single README
+#         - Optimized for PyPI distribution
 # 1.1.1 - Bug fix: Added missing python-dateutil dependency
 #         - Fixed relativedelta import error
 # 1.1.0 - Updated author and contact information