rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/github_token_checker.py
CHANGED
@@ -9,25 +9,29 @@ import sys
|
|
9
9
|
import requests
|
10
10
|
import logging
|
11
11
|
|
12
|
+
|
12
13
|
def check_github_token_quick():
|
13
14
|
"""Quick check if GitHub token is available and valid"""
|
14
15
|
token = os.getenv("GITHUB_TOKEN")
|
15
|
-
|
16
|
+
|
16
17
|
if not token:
|
17
18
|
return False, "GITHUB_TOKEN environment variable not set"
|
18
|
-
|
19
|
+
|
19
20
|
if len(token) < 20:
|
20
21
|
return False, "GITHUB_TOKEN seems too short - may be invalid"
|
21
|
-
|
22
|
+
|
22
23
|
try:
|
23
24
|
# Quick API check
|
24
25
|
headers = {
|
25
26
|
"Accept": "application/vnd.github.v3+json",
|
26
27
|
"Authorization": f"token {token}"
|
27
28
|
}
|
28
|
-
|
29
|
-
response = requests.get(
|
30
|
-
|
29
|
+
|
30
|
+
response = requests.get(
|
31
|
+
"https://api.github.com/rate_limit",
|
32
|
+
headers=headers,
|
33
|
+
timeout=10)
|
34
|
+
|
31
35
|
if response.status_code == 200:
|
32
36
|
data = response.json()
|
33
37
|
remaining = data["resources"]["core"]["remaining"]
|
@@ -35,18 +39,20 @@ def check_github_token_quick():
|
|
35
39
|
elif response.status_code == 401:
|
36
40
|
return False, "GitHub token is invalid or expired"
|
37
41
|
else:
|
38
|
-
return False, f"GitHub API returned status code: {
|
39
|
-
|
42
|
+
return False, f"GitHub API returned status code: {
|
43
|
+
response.status_code}"
|
44
|
+
|
40
45
|
except requests.exceptions.RequestException as e:
|
41
46
|
return False, f"Network error checking token: {str(e)}"
|
42
47
|
except Exception as e:
|
43
48
|
return False, f"Error checking token: {str(e)}"
|
44
49
|
|
50
|
+
|
45
51
|
def prompt_for_token_setup():
|
46
52
|
"""Prompt user to set up GitHub token"""
|
47
|
-
print("\n" + "="*60)
|
53
|
+
print("\n" + "=" * 60)
|
48
54
|
print("🔑 GitHub Token Required")
|
49
|
-
print("="*60)
|
55
|
+
print("=" * 60)
|
50
56
|
print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
|
51
57
|
print("to access repository information and avoid rate limits.")
|
52
58
|
print("\n📋 Quick Setup:")
|
@@ -57,11 +63,12 @@ def prompt_for_token_setup():
|
|
57
63
|
print("\n🔧 Setup Scripts Available:")
|
58
64
|
print(" ./setup_github_token.sh (Interactive setup)")
|
59
65
|
print(" python3 check_github_token.py (Full verification)")
|
60
|
-
print("\n" + "="*60)
|
61
|
-
|
66
|
+
print("\n" + "=" * 60)
|
67
|
+
|
62
68
|
# Ask if user wants to continue without token (limited functionality)
|
63
|
-
response = input(
|
64
|
-
|
69
|
+
response = input(
|
70
|
+
"\nContinue without GitHub token? (y/N): ").strip().lower()
|
71
|
+
|
65
72
|
if response in ['y', 'yes']:
|
66
73
|
print("⚠️ Running with limited GitHub API access (60 requests/hour)")
|
67
74
|
print(" You may encounter rate limit warnings.")
|
@@ -70,33 +77,36 @@ def prompt_for_token_setup():
|
|
70
77
|
print("\n🛑 Please set up your GitHub token and try again.")
|
71
78
|
return False
|
72
79
|
|
80
|
+
|
73
81
|
def check_and_setup_github_token():
|
74
82
|
"""
|
75
83
|
Check GitHub token and prompt for setup if missing.
|
76
84
|
Returns True if should continue, False if should exit.
|
77
85
|
"""
|
78
86
|
is_valid, message = check_github_token_quick()
|
79
|
-
|
87
|
+
|
80
88
|
if is_valid:
|
81
89
|
logging.debug(f"GitHub token check: {message}")
|
82
90
|
return True
|
83
|
-
|
91
|
+
|
84
92
|
# Token is missing or invalid
|
85
93
|
logging.warning(f"GitHub token issue: {message}")
|
86
|
-
|
94
|
+
|
87
95
|
# Check if we're in a non-interactive environment
|
88
96
|
if not sys.stdin.isatty():
|
89
|
-
logging.error(
|
97
|
+
logging.error(
|
98
|
+
"GitHub token not configured and running in non-interactive mode")
|
90
99
|
logging.error("Set GITHUB_TOKEN environment variable before running")
|
91
100
|
return False
|
92
|
-
|
101
|
+
|
93
102
|
# Interactive prompt
|
94
103
|
return prompt_for_token_setup()
|
95
104
|
|
105
|
+
|
96
106
|
if __name__ == "__main__":
|
97
107
|
# Allow running this module directly for testing
|
98
108
|
is_valid, message = check_github_token_quick()
|
99
109
|
print(f"Token check: {'✅' if is_valid else '❌'} {message}")
|
100
|
-
|
110
|
+
|
101
111
|
if not is_valid:
|
102
112
|
check_and_setup_github_token()
|
rust_crate_pipeline/main.py
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
# main.py
|
2
|
-
import os
|
3
2
|
import sys
|
4
3
|
import time
|
5
4
|
import logging
|
6
5
|
import shutil
|
7
6
|
import argparse
|
8
|
-
from typing import Optional
|
9
7
|
from .config import PipelineConfig
|
10
8
|
from .pipeline import CrateDataPipeline
|
11
9
|
from .production_config import setup_production_environment
|
12
10
|
from .github_token_checker import check_and_setup_github_token
|
13
11
|
|
12
|
+
|
14
13
|
def parse_arguments():
|
15
14
|
"""Parse command line arguments"""
|
16
15
|
parser = argparse.ArgumentParser(
|
@@ -26,102 +25,134 @@ Examples:
|
|
26
25
|
PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
|
27
26
|
"""
|
28
27
|
)
|
29
|
-
|
28
|
+
|
30
29
|
parser.add_argument(
|
31
30
|
'--limit', '-l',
|
32
31
|
type=int,
|
33
32
|
default=None,
|
34
33
|
help='Limit the number of crates to process (default: process all)'
|
35
34
|
)
|
36
|
-
|
35
|
+
|
37
36
|
parser.add_argument(
|
38
37
|
'--batch-size', '-b',
|
39
38
|
type=int,
|
40
39
|
default=10,
|
41
40
|
help='Number of crates to process in each batch (default: 10)'
|
42
41
|
)
|
43
|
-
|
42
|
+
|
44
43
|
parser.add_argument(
|
45
44
|
'--workers', '-w',
|
46
45
|
type=int,
|
47
46
|
default=4,
|
48
47
|
help='Number of parallel workers for API requests (default: 4)'
|
49
48
|
)
|
50
|
-
|
49
|
+
|
51
50
|
parser.add_argument(
|
52
51
|
'--output-dir', '-o',
|
53
52
|
type=str,
|
54
53
|
default=None,
|
55
54
|
help='Output directory for results (default: auto-generated timestamped directory)'
|
56
55
|
)
|
57
|
-
|
56
|
+
|
58
57
|
parser.add_argument(
|
59
58
|
'--model-path', '-m',
|
60
59
|
type=str,
|
61
60
|
default=None,
|
62
61
|
help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
|
63
62
|
)
|
64
|
-
|
63
|
+
|
65
64
|
parser.add_argument(
|
66
65
|
'--max-tokens',
|
67
66
|
type=int,
|
68
67
|
default=256,
|
69
68
|
help='Maximum tokens for LLM generation (default: 256)'
|
70
69
|
)
|
71
|
-
|
70
|
+
|
72
71
|
parser.add_argument(
|
73
72
|
'--checkpoint-interval',
|
74
73
|
type=int,
|
75
74
|
default=10,
|
76
75
|
help='Save checkpoint every N crates (default: 10)'
|
77
76
|
)
|
78
|
-
|
79
|
-
parser.add_argument(
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
77
|
+
|
78
|
+
parser.add_argument('--log-level',
|
79
|
+
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
|
80
|
+
default='INFO',
|
81
|
+
help='Logging level (default: INFO)'
|
82
|
+
)
|
83
|
+
|
86
84
|
parser.add_argument(
|
87
85
|
'--skip-ai',
|
88
86
|
action='store_true',
|
89
87
|
help='Skip AI enrichment (faster, metadata only)'
|
90
88
|
)
|
91
|
-
|
89
|
+
|
92
90
|
parser.add_argument(
|
93
91
|
'--skip-source-analysis',
|
94
92
|
action='store_true',
|
95
93
|
help='Skip source code analysis'
|
96
94
|
)
|
97
|
-
|
95
|
+
|
96
|
+
# Enhanced scraping with Crawl4AI
|
97
|
+
parser.add_argument(
|
98
|
+
'--enable-crawl4ai',
|
99
|
+
action='store_true',
|
100
|
+
default=True,
|
101
|
+
help='Enable enhanced web scraping with Crawl4AI (default: enabled)'
|
102
|
+
)
|
103
|
+
|
104
|
+
parser.add_argument(
|
105
|
+
'--disable-crawl4ai',
|
106
|
+
action='store_true',
|
107
|
+
help='Disable Crawl4AI enhanced scraping (use basic scraping only)' )
|
108
|
+
|
109
|
+
parser.add_argument(
|
110
|
+
'--crawl4ai-model',
|
111
|
+
type=str,
|
112
|
+
default='~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
|
113
|
+
help='GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
|
114
|
+
)
|
115
|
+
|
116
|
+
parser.add_argument(
|
117
|
+
'--enable-sigil-protocol',
|
118
|
+
action='store_true',
|
119
|
+
help='Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)')
|
120
|
+
|
121
|
+
parser.add_argument(
|
122
|
+
'--sigil-mode',
|
123
|
+
choices=['enhanced', 'direct-llm', 'hybrid'],
|
124
|
+
default='enhanced',
|
125
|
+
help='Sigil processing mode: enhanced (API-based), direct-llm (local), hybrid (both)'
|
126
|
+
)
|
127
|
+
|
98
128
|
parser.add_argument(
|
99
129
|
'--crate-list',
|
100
130
|
type=str,
|
101
131
|
nargs='+',
|
102
132
|
help='Specific crates to process (space-separated list)'
|
103
133
|
)
|
104
|
-
|
134
|
+
|
105
135
|
parser.add_argument(
|
106
136
|
'--config-file',
|
107
137
|
type=str,
|
108
138
|
help='JSON config file to override default settings'
|
109
139
|
)
|
110
|
-
|
140
|
+
|
111
141
|
return parser.parse_args()
|
112
142
|
|
143
|
+
|
113
144
|
def configure_logging(log_level: str = 'INFO'):
|
114
145
|
"""Configure logging with both console and file output"""
|
115
146
|
level = getattr(logging, log_level.upper())
|
116
|
-
|
147
|
+
|
117
148
|
# Clear any existing handlers to avoid conflicts
|
118
149
|
root_logger = logging.getLogger()
|
119
150
|
for handler in root_logger.handlers[:]:
|
120
151
|
root_logger.removeHandler(handler)
|
121
|
-
|
152
|
+
|
122
153
|
# Set root logger level
|
123
154
|
root_logger.setLevel(level)
|
124
|
-
|
155
|
+
|
125
156
|
# Create formatters
|
126
157
|
detailed_formatter = logging.Formatter(
|
127
158
|
"%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
@@ -130,55 +161,58 @@ def configure_logging(log_level: str = 'INFO'):
|
|
130
161
|
simple_formatter = logging.Formatter(
|
131
162
|
"%(asctime)s [%(levelname)s] %(message)s"
|
132
163
|
)
|
133
|
-
|
164
|
+
|
134
165
|
# Console handler
|
135
166
|
console_handler = logging.StreamHandler()
|
136
167
|
console_handler.setLevel(level)
|
137
168
|
console_handler.setFormatter(simple_formatter)
|
138
169
|
root_logger.addHandler(console_handler)
|
139
|
-
|
170
|
+
|
140
171
|
# File handler with unique timestamp
|
141
172
|
log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
|
142
173
|
try:
|
143
|
-
file_handler = logging.FileHandler(
|
174
|
+
file_handler = logging.FileHandler(
|
175
|
+
log_filename, mode='w', encoding='utf-8')
|
144
176
|
file_handler.setLevel(logging.DEBUG) # Always capture DEBUG+ to file
|
145
177
|
file_handler.setFormatter(detailed_formatter)
|
146
178
|
root_logger.addHandler(file_handler)
|
147
|
-
|
179
|
+
|
148
180
|
# Log a test message to verify file handler works
|
149
181
|
logging.info(f"Logging initialized - file: {log_filename}")
|
150
|
-
|
182
|
+
|
151
183
|
except Exception as e:
|
152
184
|
logging.error(f"Failed to create log file {log_filename}: {e}")
|
153
185
|
print(f"Warning: Could not create log file: {e}")
|
154
|
-
|
186
|
+
|
155
187
|
# Set library loggers to less verbose levels
|
156
188
|
logging.getLogger('requests').setLevel(logging.WARNING)
|
157
189
|
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
158
190
|
logging.getLogger('requests_cache').setLevel(logging.WARNING)
|
159
191
|
logging.getLogger('llama_cpp').setLevel(logging.WARNING)
|
160
192
|
|
193
|
+
|
161
194
|
def check_disk_space():
|
162
195
|
if shutil.disk_usage(".").free < 1_000_000_000: # 1GB
|
163
196
|
logging.warning("Low disk space! This may affect performance.")
|
164
197
|
|
198
|
+
|
165
199
|
def main():
|
166
200
|
# Setup production environment first for optimal logging
|
167
201
|
prod_config = setup_production_environment()
|
168
|
-
|
202
|
+
|
169
203
|
args = parse_arguments()
|
170
204
|
configure_logging(args.log_level)
|
171
205
|
check_disk_space()
|
172
|
-
|
206
|
+
|
173
207
|
# Check GitHub token before proceeding
|
174
208
|
if not check_and_setup_github_token():
|
175
209
|
logging.error("GitHub token setup cancelled or failed. Exiting.")
|
176
210
|
sys.exit(1)
|
177
|
-
|
211
|
+
|
178
212
|
try:
|
179
213
|
# Create config from command line arguments
|
180
214
|
config_kwargs = {}
|
181
|
-
|
215
|
+
|
182
216
|
# Apply production optimizations if available
|
183
217
|
if prod_config:
|
184
218
|
config_kwargs.update({
|
@@ -187,7 +221,7 @@ def main():
|
|
187
221
|
'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
|
188
222
|
'cache_ttl': prod_config.get('cache_ttl', 3600),
|
189
223
|
})
|
190
|
-
|
224
|
+
|
191
225
|
if args.batch_size:
|
192
226
|
config_kwargs['batch_size'] = args.batch_size
|
193
227
|
if args.workers:
|
@@ -198,16 +232,23 @@ def main():
|
|
198
232
|
config_kwargs['max_tokens'] = args.max_tokens
|
199
233
|
if args.checkpoint_interval:
|
200
234
|
config_kwargs['checkpoint_interval'] = args.checkpoint_interval
|
201
|
-
|
202
|
-
# Load config file if provided
|
235
|
+
# Load config file if provided
|
203
236
|
if args.config_file:
|
204
237
|
import json
|
205
238
|
with open(args.config_file, 'r') as f:
|
206
239
|
file_config = json.load(f)
|
207
240
|
config_kwargs.update(file_config)
|
208
|
-
|
241
|
+
|
242
|
+
# Handle Crawl4AI configuration
|
243
|
+
enable_crawl4ai = args.enable_crawl4ai and not args.disable_crawl4ai if hasattr(
|
244
|
+
args, 'disable_crawl4ai') else True
|
245
|
+
config_kwargs.update({
|
246
|
+
'enable_crawl4ai': enable_crawl4ai,
|
247
|
+
'crawl4ai_model': getattr(args, 'crawl4ai_model', '~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf')
|
248
|
+
})
|
249
|
+
|
209
250
|
config = PipelineConfig(**config_kwargs)
|
210
|
-
|
251
|
+
|
211
252
|
# Pass additional arguments to pipeline
|
212
253
|
pipeline_kwargs = {}
|
213
254
|
if args.output_dir:
|
@@ -220,15 +261,36 @@ def main():
|
|
220
261
|
pipeline_kwargs['skip_ai'] = True
|
221
262
|
if args.skip_source_analysis:
|
222
263
|
pipeline_kwargs['skip_source'] = True
|
223
|
-
|
224
|
-
|
225
|
-
|
264
|
+
|
265
|
+
# Sigil Protocol integration
|
266
|
+
if hasattr(
|
267
|
+
args,
|
268
|
+
'enable_sigil_protocol') and args.enable_sigil_protocol:
|
269
|
+
# Import Sigil enhanced pipeline
|
270
|
+
try:
|
271
|
+
import sys
|
272
|
+
sys.path.append('.') # Add current directory to path
|
273
|
+
from sigil_enhanced_pipeline import SigilCompliantPipeline
|
274
|
+
|
275
|
+
pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
|
276
|
+
logging.info(
|
277
|
+
"Starting Sigil Protocol compliant pipeline with Sacred Chain processing")
|
278
|
+
except ImportError as e:
|
279
|
+
logging.warning(f"Sigil enhanced pipeline not available: {e}")
|
280
|
+
logging.info("Falling back to standard pipeline")
|
281
|
+
pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
282
|
+
else:
|
283
|
+
pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
226
284
|
logging.info(f"Starting pipeline with {len(vars(args))} arguments")
|
227
|
-
|
228
|
-
|
285
|
+
|
286
|
+
# Run the pipeline asynchronously
|
287
|
+
import asyncio
|
288
|
+
asyncio.run(pipeline.run())
|
289
|
+
|
229
290
|
except Exception as e:
|
230
291
|
logging.critical(f"Pipeline failed: {str(e)}")
|
231
292
|
sys.exit(1)
|
232
293
|
|
294
|
+
|
233
295
|
if __name__ == "__main__":
|
234
|
-
main()
|
296
|
+
main()
|