rust-crate-pipeline 1.2.5__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +22 -2
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +108 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +25 -1
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.4.0.dist-info}/METADATA +27 -15
- rust_crate_pipeline-1.4.0.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.4.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.4.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.4.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.4.0.dist-info}/top_level.txt +0 -0
@@ -9,25 +9,29 @@ import sys
|
|
9
9
|
import requests
|
10
10
|
import logging
|
11
11
|
|
12
|
+
|
12
13
|
def check_github_token_quick():
|
13
14
|
"""Quick check if GitHub token is available and valid"""
|
14
15
|
token = os.getenv("GITHUB_TOKEN")
|
15
|
-
|
16
|
+
|
16
17
|
if not token:
|
17
18
|
return False, "GITHUB_TOKEN environment variable not set"
|
18
|
-
|
19
|
+
|
19
20
|
if len(token) < 20:
|
20
21
|
return False, "GITHUB_TOKEN seems too short - may be invalid"
|
21
|
-
|
22
|
+
|
22
23
|
try:
|
23
24
|
# Quick API check
|
24
25
|
headers = {
|
25
26
|
"Accept": "application/vnd.github.v3+json",
|
26
27
|
"Authorization": f"token {token}"
|
27
28
|
}
|
28
|
-
|
29
|
-
response = requests.get(
|
30
|
-
|
29
|
+
|
30
|
+
response = requests.get(
|
31
|
+
"https://api.github.com/rate_limit",
|
32
|
+
headers=headers,
|
33
|
+
timeout=10)
|
34
|
+
|
31
35
|
if response.status_code == 200:
|
32
36
|
data = response.json()
|
33
37
|
remaining = data["resources"]["core"]["remaining"]
|
@@ -35,18 +39,20 @@ def check_github_token_quick():
|
|
35
39
|
elif response.status_code == 401:
|
36
40
|
return False, "GitHub token is invalid or expired"
|
37
41
|
else:
|
38
|
-
return False, f"GitHub API returned status code: {
|
39
|
-
|
42
|
+
return False, f"GitHub API returned status code: {
|
43
|
+
response.status_code}"
|
44
|
+
|
40
45
|
except requests.exceptions.RequestException as e:
|
41
46
|
return False, f"Network error checking token: {str(e)}"
|
42
47
|
except Exception as e:
|
43
48
|
return False, f"Error checking token: {str(e)}"
|
44
49
|
|
50
|
+
|
45
51
|
def prompt_for_token_setup():
|
46
52
|
"""Prompt user to set up GitHub token"""
|
47
|
-
print("\n" + "="*60)
|
53
|
+
print("\n" + "=" * 60)
|
48
54
|
print("🔑 GitHub Token Required")
|
49
|
-
print("="*60)
|
55
|
+
print("=" * 60)
|
50
56
|
print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
|
51
57
|
print("to access repository information and avoid rate limits.")
|
52
58
|
print("\n📋 Quick Setup:")
|
@@ -57,11 +63,12 @@ def prompt_for_token_setup():
|
|
57
63
|
print("\n🔧 Setup Scripts Available:")
|
58
64
|
print(" ./setup_github_token.sh (Interactive setup)")
|
59
65
|
print(" python3 check_github_token.py (Full verification)")
|
60
|
-
print("\n" + "="*60)
|
61
|
-
|
66
|
+
print("\n" + "=" * 60)
|
67
|
+
|
62
68
|
# Ask if user wants to continue without token (limited functionality)
|
63
|
-
response = input(
|
64
|
-
|
69
|
+
response = input(
|
70
|
+
"\nContinue without GitHub token? (y/N): ").strip().lower()
|
71
|
+
|
65
72
|
if response in ['y', 'yes']:
|
66
73
|
print("⚠️ Running with limited GitHub API access (60 requests/hour)")
|
67
74
|
print(" You may encounter rate limit warnings.")
|
@@ -70,33 +77,36 @@ def prompt_for_token_setup():
|
|
70
77
|
print("\n🛑 Please set up your GitHub token and try again.")
|
71
78
|
return False
|
72
79
|
|
80
|
+
|
73
81
|
def check_and_setup_github_token():
|
74
82
|
"""
|
75
83
|
Check GitHub token and prompt for setup if missing.
|
76
84
|
Returns True if should continue, False if should exit.
|
77
85
|
"""
|
78
86
|
is_valid, message = check_github_token_quick()
|
79
|
-
|
87
|
+
|
80
88
|
if is_valid:
|
81
89
|
logging.debug(f"GitHub token check: {message}")
|
82
90
|
return True
|
83
|
-
|
91
|
+
|
84
92
|
# Token is missing or invalid
|
85
93
|
logging.warning(f"GitHub token issue: {message}")
|
86
|
-
|
94
|
+
|
87
95
|
# Check if we're in a non-interactive environment
|
88
96
|
if not sys.stdin.isatty():
|
89
|
-
logging.error(
|
97
|
+
logging.error(
|
98
|
+
"GitHub token not configured and running in non-interactive mode")
|
90
99
|
logging.error("Set GITHUB_TOKEN environment variable before running")
|
91
100
|
return False
|
92
|
-
|
101
|
+
|
93
102
|
# Interactive prompt
|
94
103
|
return prompt_for_token_setup()
|
95
104
|
|
105
|
+
|
96
106
|
if __name__ == "__main__":
|
97
107
|
# Allow running this module directly for testing
|
98
108
|
is_valid, message = check_github_token_quick()
|
99
109
|
print(f"Token check: {'✅' if is_valid else '❌'} {message}")
|
100
|
-
|
110
|
+
|
101
111
|
if not is_valid:
|
102
112
|
check_and_setup_github_token()
|
rust_crate_pipeline/main.py
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
# main.py
|
2
|
-
import os
|
3
2
|
import sys
|
4
3
|
import time
|
5
4
|
import logging
|
6
5
|
import shutil
|
7
6
|
import argparse
|
8
|
-
from typing import Optional
|
9
7
|
from .config import PipelineConfig
|
10
8
|
from .pipeline import CrateDataPipeline
|
11
9
|
from .production_config import setup_production_environment
|
12
10
|
from .github_token_checker import check_and_setup_github_token
|
13
11
|
|
12
|
+
|
14
13
|
def parse_arguments():
|
15
14
|
"""Parse command line arguments"""
|
16
15
|
parser = argparse.ArgumentParser(
|
@@ -26,102 +25,135 @@ Examples:
|
|
26
25
|
PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
|
27
26
|
"""
|
28
27
|
)
|
29
|
-
|
28
|
+
|
30
29
|
parser.add_argument(
|
31
30
|
'--limit', '-l',
|
32
31
|
type=int,
|
33
32
|
default=None,
|
34
33
|
help='Limit the number of crates to process (default: process all)'
|
35
34
|
)
|
36
|
-
|
35
|
+
|
37
36
|
parser.add_argument(
|
38
37
|
'--batch-size', '-b',
|
39
38
|
type=int,
|
40
39
|
default=10,
|
41
40
|
help='Number of crates to process in each batch (default: 10)'
|
42
41
|
)
|
43
|
-
|
42
|
+
|
44
43
|
parser.add_argument(
|
45
44
|
'--workers', '-w',
|
46
45
|
type=int,
|
47
46
|
default=4,
|
48
47
|
help='Number of parallel workers for API requests (default: 4)'
|
49
48
|
)
|
50
|
-
|
49
|
+
|
51
50
|
parser.add_argument(
|
52
51
|
'--output-dir', '-o',
|
53
52
|
type=str,
|
54
53
|
default=None,
|
55
54
|
help='Output directory for results (default: auto-generated timestamped directory)'
|
56
55
|
)
|
57
|
-
|
56
|
+
|
58
57
|
parser.add_argument(
|
59
58
|
'--model-path', '-m',
|
60
59
|
type=str,
|
61
60
|
default=None,
|
62
61
|
help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
|
63
62
|
)
|
64
|
-
|
63
|
+
|
65
64
|
parser.add_argument(
|
66
65
|
'--max-tokens',
|
67
66
|
type=int,
|
68
67
|
default=256,
|
69
68
|
help='Maximum tokens for LLM generation (default: 256)'
|
70
69
|
)
|
71
|
-
|
70
|
+
|
72
71
|
parser.add_argument(
|
73
72
|
'--checkpoint-interval',
|
74
73
|
type=int,
|
75
74
|
default=10,
|
76
75
|
help='Save checkpoint every N crates (default: 10)'
|
77
76
|
)
|
78
|
-
|
79
|
-
parser.add_argument(
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
77
|
+
|
78
|
+
parser.add_argument('--log-level',
|
79
|
+
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
|
80
|
+
default='INFO',
|
81
|
+
help='Logging level (default: INFO)'
|
82
|
+
)
|
83
|
+
|
86
84
|
parser.add_argument(
|
87
85
|
'--skip-ai',
|
88
86
|
action='store_true',
|
89
87
|
help='Skip AI enrichment (faster, metadata only)'
|
90
88
|
)
|
91
|
-
|
89
|
+
|
92
90
|
parser.add_argument(
|
93
91
|
'--skip-source-analysis',
|
94
92
|
action='store_true',
|
95
93
|
help='Skip source code analysis'
|
96
94
|
)
|
97
|
-
|
95
|
+
|
96
|
+
# Enhanced scraping with Crawl4AI
|
97
|
+
parser.add_argument(
|
98
|
+
'--enable-crawl4ai',
|
99
|
+
action='store_true',
|
100
|
+
default=True,
|
101
|
+
help='Enable enhanced web scraping with Crawl4AI (default: enabled)'
|
102
|
+
)
|
103
|
+
|
104
|
+
parser.add_argument(
|
105
|
+
'--disable-crawl4ai',
|
106
|
+
action='store_true',
|
107
|
+
help='Disable Crawl4AI enhanced scraping (use basic scraping only)'
|
108
|
+
)
|
109
|
+
|
110
|
+
parser.add_argument(
|
111
|
+
'--crawl4ai-model',
|
112
|
+
type=str,
|
113
|
+
default='ollama/deepseek-coder:6.7b',
|
114
|
+
help='Model to use with Crawl4AI (default: ollama/deepseek-coder:6.7b)'
|
115
|
+
)
|
116
|
+
|
117
|
+
parser.add_argument(
|
118
|
+
'--enable-sigil-protocol',
|
119
|
+
action='store_true',
|
120
|
+
help='Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)')
|
121
|
+
|
122
|
+
parser.add_argument(
|
123
|
+
'--sigil-mode',
|
124
|
+
choices=['enhanced', 'direct-llm', 'hybrid'],
|
125
|
+
default='enhanced',
|
126
|
+
help='Sigil processing mode: enhanced (API-based), direct-llm (local), hybrid (both)'
|
127
|
+
)
|
128
|
+
|
98
129
|
parser.add_argument(
|
99
130
|
'--crate-list',
|
100
131
|
type=str,
|
101
132
|
nargs='+',
|
102
133
|
help='Specific crates to process (space-separated list)'
|
103
134
|
)
|
104
|
-
|
135
|
+
|
105
136
|
parser.add_argument(
|
106
137
|
'--config-file',
|
107
138
|
type=str,
|
108
139
|
help='JSON config file to override default settings'
|
109
140
|
)
|
110
|
-
|
141
|
+
|
111
142
|
return parser.parse_args()
|
112
143
|
|
144
|
+
|
113
145
|
def configure_logging(log_level: str = 'INFO'):
|
114
146
|
"""Configure logging with both console and file output"""
|
115
147
|
level = getattr(logging, log_level.upper())
|
116
|
-
|
148
|
+
|
117
149
|
# Clear any existing handlers to avoid conflicts
|
118
150
|
root_logger = logging.getLogger()
|
119
151
|
for handler in root_logger.handlers[:]:
|
120
152
|
root_logger.removeHandler(handler)
|
121
|
-
|
153
|
+
|
122
154
|
# Set root logger level
|
123
155
|
root_logger.setLevel(level)
|
124
|
-
|
156
|
+
|
125
157
|
# Create formatters
|
126
158
|
detailed_formatter = logging.Formatter(
|
127
159
|
"%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
@@ -130,55 +162,58 @@ def configure_logging(log_level: str = 'INFO'):
|
|
130
162
|
simple_formatter = logging.Formatter(
|
131
163
|
"%(asctime)s [%(levelname)s] %(message)s"
|
132
164
|
)
|
133
|
-
|
165
|
+
|
134
166
|
# Console handler
|
135
167
|
console_handler = logging.StreamHandler()
|
136
168
|
console_handler.setLevel(level)
|
137
169
|
console_handler.setFormatter(simple_formatter)
|
138
170
|
root_logger.addHandler(console_handler)
|
139
|
-
|
171
|
+
|
140
172
|
# File handler with unique timestamp
|
141
173
|
log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
|
142
174
|
try:
|
143
|
-
file_handler = logging.FileHandler(
|
175
|
+
file_handler = logging.FileHandler(
|
176
|
+
log_filename, mode='w', encoding='utf-8')
|
144
177
|
file_handler.setLevel(logging.DEBUG) # Always capture DEBUG+ to file
|
145
178
|
file_handler.setFormatter(detailed_formatter)
|
146
179
|
root_logger.addHandler(file_handler)
|
147
|
-
|
180
|
+
|
148
181
|
# Log a test message to verify file handler works
|
149
182
|
logging.info(f"Logging initialized - file: {log_filename}")
|
150
|
-
|
183
|
+
|
151
184
|
except Exception as e:
|
152
185
|
logging.error(f"Failed to create log file {log_filename}: {e}")
|
153
186
|
print(f"Warning: Could not create log file: {e}")
|
154
|
-
|
187
|
+
|
155
188
|
# Set library loggers to less verbose levels
|
156
189
|
logging.getLogger('requests').setLevel(logging.WARNING)
|
157
190
|
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
158
191
|
logging.getLogger('requests_cache').setLevel(logging.WARNING)
|
159
192
|
logging.getLogger('llama_cpp').setLevel(logging.WARNING)
|
160
193
|
|
194
|
+
|
161
195
|
def check_disk_space():
|
162
196
|
if shutil.disk_usage(".").free < 1_000_000_000: # 1GB
|
163
197
|
logging.warning("Low disk space! This may affect performance.")
|
164
198
|
|
199
|
+
|
165
200
|
def main():
|
166
201
|
# Setup production environment first for optimal logging
|
167
202
|
prod_config = setup_production_environment()
|
168
|
-
|
203
|
+
|
169
204
|
args = parse_arguments()
|
170
205
|
configure_logging(args.log_level)
|
171
206
|
check_disk_space()
|
172
|
-
|
207
|
+
|
173
208
|
# Check GitHub token before proceeding
|
174
209
|
if not check_and_setup_github_token():
|
175
210
|
logging.error("GitHub token setup cancelled or failed. Exiting.")
|
176
211
|
sys.exit(1)
|
177
|
-
|
212
|
+
|
178
213
|
try:
|
179
214
|
# Create config from command line arguments
|
180
215
|
config_kwargs = {}
|
181
|
-
|
216
|
+
|
182
217
|
# Apply production optimizations if available
|
183
218
|
if prod_config:
|
184
219
|
config_kwargs.update({
|
@@ -187,7 +222,7 @@ def main():
|
|
187
222
|
'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
|
188
223
|
'cache_ttl': prod_config.get('cache_ttl', 3600),
|
189
224
|
})
|
190
|
-
|
225
|
+
|
191
226
|
if args.batch_size:
|
192
227
|
config_kwargs['batch_size'] = args.batch_size
|
193
228
|
if args.workers:
|
@@ -198,16 +233,23 @@ def main():
|
|
198
233
|
config_kwargs['max_tokens'] = args.max_tokens
|
199
234
|
if args.checkpoint_interval:
|
200
235
|
config_kwargs['checkpoint_interval'] = args.checkpoint_interval
|
201
|
-
|
202
|
-
# Load config file if provided
|
236
|
+
# Load config file if provided
|
203
237
|
if args.config_file:
|
204
238
|
import json
|
205
239
|
with open(args.config_file, 'r') as f:
|
206
240
|
file_config = json.load(f)
|
207
241
|
config_kwargs.update(file_config)
|
208
|
-
|
242
|
+
|
243
|
+
# Handle Crawl4AI configuration
|
244
|
+
enable_crawl4ai = args.enable_crawl4ai and not args.disable_crawl4ai if hasattr(
|
245
|
+
args, 'disable_crawl4ai') else True
|
246
|
+
config_kwargs.update({
|
247
|
+
'enable_crawl4ai': enable_crawl4ai,
|
248
|
+
'crawl4ai_model': getattr(args, 'crawl4ai_model', 'ollama/deepseek-coder:6.7b')
|
249
|
+
})
|
250
|
+
|
209
251
|
config = PipelineConfig(**config_kwargs)
|
210
|
-
|
252
|
+
|
211
253
|
# Pass additional arguments to pipeline
|
212
254
|
pipeline_kwargs = {}
|
213
255
|
if args.output_dir:
|
@@ -220,15 +262,36 @@ def main():
|
|
220
262
|
pipeline_kwargs['skip_ai'] = True
|
221
263
|
if args.skip_source_analysis:
|
222
264
|
pipeline_kwargs['skip_source'] = True
|
223
|
-
|
224
|
-
|
225
|
-
|
265
|
+
|
266
|
+
# Sigil Protocol integration
|
267
|
+
if hasattr(
|
268
|
+
args,
|
269
|
+
'enable_sigil_protocol') and args.enable_sigil_protocol:
|
270
|
+
# Import Sigil enhanced pipeline
|
271
|
+
try:
|
272
|
+
import sys
|
273
|
+
sys.path.append('.') # Add current directory to path
|
274
|
+
from sigil_enhanced_pipeline import SigilCompliantPipeline
|
275
|
+
|
276
|
+
pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
|
277
|
+
logging.info(
|
278
|
+
"Starting Sigil Protocol compliant pipeline with Sacred Chain processing")
|
279
|
+
except ImportError as e:
|
280
|
+
logging.warning(f"Sigil enhanced pipeline not available: {e}")
|
281
|
+
logging.info("Falling back to standard pipeline")
|
282
|
+
pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
283
|
+
else:
|
284
|
+
pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
226
285
|
logging.info(f"Starting pipeline with {len(vars(args))} arguments")
|
227
|
-
|
228
|
-
|
286
|
+
|
287
|
+
# Run the pipeline asynchronously
|
288
|
+
import asyncio
|
289
|
+
asyncio.run(pipeline.run())
|
290
|
+
|
229
291
|
except Exception as e:
|
230
292
|
logging.critical(f"Pipeline failed: {str(e)}")
|
231
293
|
sys.exit(1)
|
232
294
|
|
295
|
+
|
233
296
|
if __name__ == "__main__":
|
234
|
-
main()
|
297
|
+
main()
|