rust-crate-pipeline 1.2.5__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,25 +9,29 @@ import sys
9
9
  import requests
10
10
  import logging
11
11
 
12
+
12
13
  def check_github_token_quick():
13
14
  """Quick check if GitHub token is available and valid"""
14
15
  token = os.getenv("GITHUB_TOKEN")
15
-
16
+
16
17
  if not token:
17
18
  return False, "GITHUB_TOKEN environment variable not set"
18
-
19
+
19
20
  if len(token) < 20:
20
21
  return False, "GITHUB_TOKEN seems too short - may be invalid"
21
-
22
+
22
23
  try:
23
24
  # Quick API check
24
25
  headers = {
25
26
  "Accept": "application/vnd.github.v3+json",
26
27
  "Authorization": f"token {token}"
27
28
  }
28
-
29
- response = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
30
-
29
+
30
+ response = requests.get(
31
+ "https://api.github.com/rate_limit",
32
+ headers=headers,
33
+ timeout=10)
34
+
31
35
  if response.status_code == 200:
32
36
  data = response.json()
33
37
  remaining = data["resources"]["core"]["remaining"]
@@ -35,18 +39,20 @@ def check_github_token_quick():
35
39
  elif response.status_code == 401:
36
40
  return False, "GitHub token is invalid or expired"
37
41
  else:
38
- return False, f"GitHub API returned status code: {response.status_code}"
39
-
42
+ return False, f"GitHub API returned status code: {
43
+ response.status_code}"
44
+
40
45
  except requests.exceptions.RequestException as e:
41
46
  return False, f"Network error checking token: {str(e)}"
42
47
  except Exception as e:
43
48
  return False, f"Error checking token: {str(e)}"
44
49
 
50
+
45
51
  def prompt_for_token_setup():
46
52
  """Prompt user to set up GitHub token"""
47
- print("\n" + "="*60)
53
+ print("\n" + "=" * 60)
48
54
  print("🔑 GitHub Token Required")
49
- print("="*60)
55
+ print("=" * 60)
50
56
  print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
51
57
  print("to access repository information and avoid rate limits.")
52
58
  print("\n📋 Quick Setup:")
@@ -57,11 +63,12 @@ def prompt_for_token_setup():
57
63
  print("\n🔧 Setup Scripts Available:")
58
64
  print(" ./setup_github_token.sh (Interactive setup)")
59
65
  print(" python3 check_github_token.py (Full verification)")
60
- print("\n" + "="*60)
61
-
66
+ print("\n" + "=" * 60)
67
+
62
68
  # Ask if user wants to continue without token (limited functionality)
63
- response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
64
-
69
+ response = input(
70
+ "\nContinue without GitHub token? (y/N): ").strip().lower()
71
+
65
72
  if response in ['y', 'yes']:
66
73
  print("⚠️ Running with limited GitHub API access (60 requests/hour)")
67
74
  print(" You may encounter rate limit warnings.")
@@ -70,33 +77,36 @@ def prompt_for_token_setup():
70
77
  print("\n🛑 Please set up your GitHub token and try again.")
71
78
  return False
72
79
 
80
+
73
81
  def check_and_setup_github_token():
74
82
  """
75
83
  Check GitHub token and prompt for setup if missing.
76
84
  Returns True if should continue, False if should exit.
77
85
  """
78
86
  is_valid, message = check_github_token_quick()
79
-
87
+
80
88
  if is_valid:
81
89
  logging.debug(f"GitHub token check: {message}")
82
90
  return True
83
-
91
+
84
92
  # Token is missing or invalid
85
93
  logging.warning(f"GitHub token issue: {message}")
86
-
94
+
87
95
  # Check if we're in a non-interactive environment
88
96
  if not sys.stdin.isatty():
89
- logging.error("GitHub token not configured and running in non-interactive mode")
97
+ logging.error(
98
+ "GitHub token not configured and running in non-interactive mode")
90
99
  logging.error("Set GITHUB_TOKEN environment variable before running")
91
100
  return False
92
-
101
+
93
102
  # Interactive prompt
94
103
  return prompt_for_token_setup()
95
104
 
105
+
96
106
  if __name__ == "__main__":
97
107
  # Allow running this module directly for testing
98
108
  is_valid, message = check_github_token_quick()
99
109
  print(f"Token check: {'✅' if is_valid else '❌'} {message}")
100
-
110
+
101
111
  if not is_valid:
102
112
  check_and_setup_github_token()
@@ -1,16 +1,15 @@
1
1
  # main.py
2
- import os
3
2
  import sys
4
3
  import time
5
4
  import logging
6
5
  import shutil
7
6
  import argparse
8
- from typing import Optional
9
7
  from .config import PipelineConfig
10
8
  from .pipeline import CrateDataPipeline
11
9
  from .production_config import setup_production_environment
12
10
  from .github_token_checker import check_and_setup_github_token
13
11
 
12
+
14
13
  def parse_arguments():
15
14
  """Parse command line arguments"""
16
15
  parser = argparse.ArgumentParser(
@@ -26,102 +25,135 @@ Examples:
26
25
  PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
27
26
  """
28
27
  )
29
-
28
+
30
29
  parser.add_argument(
31
30
  '--limit', '-l',
32
31
  type=int,
33
32
  default=None,
34
33
  help='Limit the number of crates to process (default: process all)'
35
34
  )
36
-
35
+
37
36
  parser.add_argument(
38
37
  '--batch-size', '-b',
39
38
  type=int,
40
39
  default=10,
41
40
  help='Number of crates to process in each batch (default: 10)'
42
41
  )
43
-
42
+
44
43
  parser.add_argument(
45
44
  '--workers', '-w',
46
45
  type=int,
47
46
  default=4,
48
47
  help='Number of parallel workers for API requests (default: 4)'
49
48
  )
50
-
49
+
51
50
  parser.add_argument(
52
51
  '--output-dir', '-o',
53
52
  type=str,
54
53
  default=None,
55
54
  help='Output directory for results (default: auto-generated timestamped directory)'
56
55
  )
57
-
56
+
58
57
  parser.add_argument(
59
58
  '--model-path', '-m',
60
59
  type=str,
61
60
  default=None,
62
61
  help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
63
62
  )
64
-
63
+
65
64
  parser.add_argument(
66
65
  '--max-tokens',
67
66
  type=int,
68
67
  default=256,
69
68
  help='Maximum tokens for LLM generation (default: 256)'
70
69
  )
71
-
70
+
72
71
  parser.add_argument(
73
72
  '--checkpoint-interval',
74
73
  type=int,
75
74
  default=10,
76
75
  help='Save checkpoint every N crates (default: 10)'
77
76
  )
78
-
79
- parser.add_argument(
80
- '--log-level',
81
- choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
82
- default='INFO',
83
- help='Logging level (default: INFO)'
84
- )
85
-
77
+
78
+ parser.add_argument('--log-level',
79
+ choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
80
+ default='INFO',
81
+ help='Logging level (default: INFO)'
82
+ )
83
+
86
84
  parser.add_argument(
87
85
  '--skip-ai',
88
86
  action='store_true',
89
87
  help='Skip AI enrichment (faster, metadata only)'
90
88
  )
91
-
89
+
92
90
  parser.add_argument(
93
91
  '--skip-source-analysis',
94
92
  action='store_true',
95
93
  help='Skip source code analysis'
96
94
  )
97
-
95
+
96
+ # Enhanced scraping with Crawl4AI
97
+ parser.add_argument(
98
+ '--enable-crawl4ai',
99
+ action='store_true',
100
+ default=True,
101
+ help='Enable enhanced web scraping with Crawl4AI (default: enabled)'
102
+ )
103
+
104
+ parser.add_argument(
105
+ '--disable-crawl4ai',
106
+ action='store_true',
107
+ help='Disable Crawl4AI enhanced scraping (use basic scraping only)'
108
+ )
109
+
110
+ parser.add_argument(
111
+ '--crawl4ai-model',
112
+ type=str,
113
+ default='ollama/deepseek-coder:6.7b',
114
+ help='Model to use with Crawl4AI (default: ollama/deepseek-coder:6.7b)'
115
+ )
116
+
117
+ parser.add_argument(
118
+ '--enable-sigil-protocol',
119
+ action='store_true',
120
+ help='Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)')
121
+
122
+ parser.add_argument(
123
+ '--sigil-mode',
124
+ choices=['enhanced', 'direct-llm', 'hybrid'],
125
+ default='enhanced',
126
+ help='Sigil processing mode: enhanced (API-based), direct-llm (local), hybrid (both)'
127
+ )
128
+
98
129
  parser.add_argument(
99
130
  '--crate-list',
100
131
  type=str,
101
132
  nargs='+',
102
133
  help='Specific crates to process (space-separated list)'
103
134
  )
104
-
135
+
105
136
  parser.add_argument(
106
137
  '--config-file',
107
138
  type=str,
108
139
  help='JSON config file to override default settings'
109
140
  )
110
-
141
+
111
142
  return parser.parse_args()
112
143
 
144
+
113
145
  def configure_logging(log_level: str = 'INFO'):
114
146
  """Configure logging with both console and file output"""
115
147
  level = getattr(logging, log_level.upper())
116
-
148
+
117
149
  # Clear any existing handlers to avoid conflicts
118
150
  root_logger = logging.getLogger()
119
151
  for handler in root_logger.handlers[:]:
120
152
  root_logger.removeHandler(handler)
121
-
153
+
122
154
  # Set root logger level
123
155
  root_logger.setLevel(level)
124
-
156
+
125
157
  # Create formatters
126
158
  detailed_formatter = logging.Formatter(
127
159
  "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
@@ -130,55 +162,58 @@ def configure_logging(log_level: str = 'INFO'):
130
162
  simple_formatter = logging.Formatter(
131
163
  "%(asctime)s [%(levelname)s] %(message)s"
132
164
  )
133
-
165
+
134
166
  # Console handler
135
167
  console_handler = logging.StreamHandler()
136
168
  console_handler.setLevel(level)
137
169
  console_handler.setFormatter(simple_formatter)
138
170
  root_logger.addHandler(console_handler)
139
-
171
+
140
172
  # File handler with unique timestamp
141
173
  log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
142
174
  try:
143
- file_handler = logging.FileHandler(log_filename, mode='w', encoding='utf-8')
175
+ file_handler = logging.FileHandler(
176
+ log_filename, mode='w', encoding='utf-8')
144
177
  file_handler.setLevel(logging.DEBUG) # Always capture DEBUG+ to file
145
178
  file_handler.setFormatter(detailed_formatter)
146
179
  root_logger.addHandler(file_handler)
147
-
180
+
148
181
  # Log a test message to verify file handler works
149
182
  logging.info(f"Logging initialized - file: {log_filename}")
150
-
183
+
151
184
  except Exception as e:
152
185
  logging.error(f"Failed to create log file {log_filename}: {e}")
153
186
  print(f"Warning: Could not create log file: {e}")
154
-
187
+
155
188
  # Set library loggers to less verbose levels
156
189
  logging.getLogger('requests').setLevel(logging.WARNING)
157
190
  logging.getLogger('urllib3').setLevel(logging.WARNING)
158
191
  logging.getLogger('requests_cache').setLevel(logging.WARNING)
159
192
  logging.getLogger('llama_cpp').setLevel(logging.WARNING)
160
193
 
194
+
161
195
  def check_disk_space():
162
196
  if shutil.disk_usage(".").free < 1_000_000_000: # 1GB
163
197
  logging.warning("Low disk space! This may affect performance.")
164
198
 
199
+
165
200
  def main():
166
201
  # Setup production environment first for optimal logging
167
202
  prod_config = setup_production_environment()
168
-
203
+
169
204
  args = parse_arguments()
170
205
  configure_logging(args.log_level)
171
206
  check_disk_space()
172
-
207
+
173
208
  # Check GitHub token before proceeding
174
209
  if not check_and_setup_github_token():
175
210
  logging.error("GitHub token setup cancelled or failed. Exiting.")
176
211
  sys.exit(1)
177
-
212
+
178
213
  try:
179
214
  # Create config from command line arguments
180
215
  config_kwargs = {}
181
-
216
+
182
217
  # Apply production optimizations if available
183
218
  if prod_config:
184
219
  config_kwargs.update({
@@ -187,7 +222,7 @@ def main():
187
222
  'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
188
223
  'cache_ttl': prod_config.get('cache_ttl', 3600),
189
224
  })
190
-
225
+
191
226
  if args.batch_size:
192
227
  config_kwargs['batch_size'] = args.batch_size
193
228
  if args.workers:
@@ -198,16 +233,23 @@ def main():
198
233
  config_kwargs['max_tokens'] = args.max_tokens
199
234
  if args.checkpoint_interval:
200
235
  config_kwargs['checkpoint_interval'] = args.checkpoint_interval
201
-
202
- # Load config file if provided
236
+ # Load config file if provided
203
237
  if args.config_file:
204
238
  import json
205
239
  with open(args.config_file, 'r') as f:
206
240
  file_config = json.load(f)
207
241
  config_kwargs.update(file_config)
208
-
242
+
243
+ # Handle Crawl4AI configuration
244
+ enable_crawl4ai = args.enable_crawl4ai and not args.disable_crawl4ai if hasattr(
245
+ args, 'disable_crawl4ai') else True
246
+ config_kwargs.update({
247
+ 'enable_crawl4ai': enable_crawl4ai,
248
+ 'crawl4ai_model': getattr(args, 'crawl4ai_model', 'ollama/deepseek-coder:6.7b')
249
+ })
250
+
209
251
  config = PipelineConfig(**config_kwargs)
210
-
252
+
211
253
  # Pass additional arguments to pipeline
212
254
  pipeline_kwargs = {}
213
255
  if args.output_dir:
@@ -220,15 +262,36 @@ def main():
220
262
  pipeline_kwargs['skip_ai'] = True
221
263
  if args.skip_source_analysis:
222
264
  pipeline_kwargs['skip_source'] = True
223
-
224
- pipeline = CrateDataPipeline(config, **pipeline_kwargs)
225
-
265
+
266
+ # Sigil Protocol integration
267
+ if hasattr(
268
+ args,
269
+ 'enable_sigil_protocol') and args.enable_sigil_protocol:
270
+ # Import Sigil enhanced pipeline
271
+ try:
272
+ import sys
273
+ sys.path.append('.') # Add current directory to path
274
+ from sigil_enhanced_pipeline import SigilCompliantPipeline
275
+
276
+ pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
277
+ logging.info(
278
+ "Starting Sigil Protocol compliant pipeline with Sacred Chain processing")
279
+ except ImportError as e:
280
+ logging.warning(f"Sigil enhanced pipeline not available: {e}")
281
+ logging.info("Falling back to standard pipeline")
282
+ pipeline = CrateDataPipeline(config, **pipeline_kwargs)
283
+ else:
284
+ pipeline = CrateDataPipeline(config, **pipeline_kwargs)
226
285
  logging.info(f"Starting pipeline with {len(vars(args))} arguments")
227
- pipeline.run()
228
-
286
+
287
+ # Run the pipeline asynchronously
288
+ import asyncio
289
+ asyncio.run(pipeline.run())
290
+
229
291
  except Exception as e:
230
292
  logging.critical(f"Pipeline failed: {str(e)}")
231
293
  sys.exit(1)
232
294
 
295
+
233
296
  if __name__ == "__main__":
234
- main()
297
+ main()