rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -9,25 +9,29 @@ import sys
9
9
  import requests
10
10
  import logging
11
11
 
12
+
12
13
  def check_github_token_quick():
13
14
  """Quick check if GitHub token is available and valid"""
14
15
  token = os.getenv("GITHUB_TOKEN")
15
-
16
+
16
17
  if not token:
17
18
  return False, "GITHUB_TOKEN environment variable not set"
18
-
19
+
19
20
  if len(token) < 20:
20
21
  return False, "GITHUB_TOKEN seems too short - may be invalid"
21
-
22
+
22
23
  try:
23
24
  # Quick API check
24
25
  headers = {
25
26
  "Accept": "application/vnd.github.v3+json",
26
27
  "Authorization": f"token {token}"
27
28
  }
28
-
29
- response = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
30
-
29
+
30
+ response = requests.get(
31
+ "https://api.github.com/rate_limit",
32
+ headers=headers,
33
+ timeout=10)
34
+
31
35
  if response.status_code == 200:
32
36
  data = response.json()
33
37
  remaining = data["resources"]["core"]["remaining"]
@@ -35,18 +39,20 @@ def check_github_token_quick():
35
39
  elif response.status_code == 401:
36
40
  return False, "GitHub token is invalid or expired"
37
41
  else:
38
- return False, f"GitHub API returned status code: {response.status_code}"
39
-
42
+ return False, f"GitHub API returned status code: {
43
+ response.status_code}"
44
+
40
45
  except requests.exceptions.RequestException as e:
41
46
  return False, f"Network error checking token: {str(e)}"
42
47
  except Exception as e:
43
48
  return False, f"Error checking token: {str(e)}"
44
49
 
50
+
45
51
  def prompt_for_token_setup():
46
52
  """Prompt user to set up GitHub token"""
47
- print("\n" + "="*60)
53
+ print("\n" + "=" * 60)
48
54
  print("🔑 GitHub Token Required")
49
- print("="*60)
55
+ print("=" * 60)
50
56
  print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
51
57
  print("to access repository information and avoid rate limits.")
52
58
  print("\n📋 Quick Setup:")
@@ -57,11 +63,12 @@ def prompt_for_token_setup():
57
63
  print("\n🔧 Setup Scripts Available:")
58
64
  print(" ./setup_github_token.sh (Interactive setup)")
59
65
  print(" python3 check_github_token.py (Full verification)")
60
- print("\n" + "="*60)
61
-
66
+ print("\n" + "=" * 60)
67
+
62
68
  # Ask if user wants to continue without token (limited functionality)
63
- response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
64
-
69
+ response = input(
70
+ "\nContinue without GitHub token? (y/N): ").strip().lower()
71
+
65
72
  if response in ['y', 'yes']:
66
73
  print("⚠️ Running with limited GitHub API access (60 requests/hour)")
67
74
  print(" You may encounter rate limit warnings.")
@@ -70,33 +77,36 @@ def prompt_for_token_setup():
70
77
  print("\n🛑 Please set up your GitHub token and try again.")
71
78
  return False
72
79
 
80
+
73
81
  def check_and_setup_github_token():
74
82
  """
75
83
  Check GitHub token and prompt for setup if missing.
76
84
  Returns True if should continue, False if should exit.
77
85
  """
78
86
  is_valid, message = check_github_token_quick()
79
-
87
+
80
88
  if is_valid:
81
89
  logging.debug(f"GitHub token check: {message}")
82
90
  return True
83
-
91
+
84
92
  # Token is missing or invalid
85
93
  logging.warning(f"GitHub token issue: {message}")
86
-
94
+
87
95
  # Check if we're in a non-interactive environment
88
96
  if not sys.stdin.isatty():
89
- logging.error("GitHub token not configured and running in non-interactive mode")
97
+ logging.error(
98
+ "GitHub token not configured and running in non-interactive mode")
90
99
  logging.error("Set GITHUB_TOKEN environment variable before running")
91
100
  return False
92
-
101
+
93
102
  # Interactive prompt
94
103
  return prompt_for_token_setup()
95
104
 
105
+
96
106
  if __name__ == "__main__":
97
107
  # Allow running this module directly for testing
98
108
  is_valid, message = check_github_token_quick()
99
109
  print(f"Token check: {'✅' if is_valid else '❌'} {message}")
100
-
110
+
101
111
  if not is_valid:
102
112
  check_and_setup_github_token()
@@ -1,16 +1,15 @@
1
1
  # main.py
2
- import os
3
2
  import sys
4
3
  import time
5
4
  import logging
6
5
  import shutil
7
6
  import argparse
8
- from typing import Optional
9
7
  from .config import PipelineConfig
10
8
  from .pipeline import CrateDataPipeline
11
9
  from .production_config import setup_production_environment
12
10
  from .github_token_checker import check_and_setup_github_token
13
11
 
12
+
14
13
  def parse_arguments():
15
14
  """Parse command line arguments"""
16
15
  parser = argparse.ArgumentParser(
@@ -26,102 +25,134 @@ Examples:
26
25
  PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
27
26
  """
28
27
  )
29
-
28
+
30
29
  parser.add_argument(
31
30
  '--limit', '-l',
32
31
  type=int,
33
32
  default=None,
34
33
  help='Limit the number of crates to process (default: process all)'
35
34
  )
36
-
35
+
37
36
  parser.add_argument(
38
37
  '--batch-size', '-b',
39
38
  type=int,
40
39
  default=10,
41
40
  help='Number of crates to process in each batch (default: 10)'
42
41
  )
43
-
42
+
44
43
  parser.add_argument(
45
44
  '--workers', '-w',
46
45
  type=int,
47
46
  default=4,
48
47
  help='Number of parallel workers for API requests (default: 4)'
49
48
  )
50
-
49
+
51
50
  parser.add_argument(
52
51
  '--output-dir', '-o',
53
52
  type=str,
54
53
  default=None,
55
54
  help='Output directory for results (default: auto-generated timestamped directory)'
56
55
  )
57
-
56
+
58
57
  parser.add_argument(
59
58
  '--model-path', '-m',
60
59
  type=str,
61
60
  default=None,
62
61
  help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
63
62
  )
64
-
63
+
65
64
  parser.add_argument(
66
65
  '--max-tokens',
67
66
  type=int,
68
67
  default=256,
69
68
  help='Maximum tokens for LLM generation (default: 256)'
70
69
  )
71
-
70
+
72
71
  parser.add_argument(
73
72
  '--checkpoint-interval',
74
73
  type=int,
75
74
  default=10,
76
75
  help='Save checkpoint every N crates (default: 10)'
77
76
  )
78
-
79
- parser.add_argument(
80
- '--log-level',
81
- choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
82
- default='INFO',
83
- help='Logging level (default: INFO)'
84
- )
85
-
77
+
78
+ parser.add_argument('--log-level',
79
+ choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
80
+ default='INFO',
81
+ help='Logging level (default: INFO)'
82
+ )
83
+
86
84
  parser.add_argument(
87
85
  '--skip-ai',
88
86
  action='store_true',
89
87
  help='Skip AI enrichment (faster, metadata only)'
90
88
  )
91
-
89
+
92
90
  parser.add_argument(
93
91
  '--skip-source-analysis',
94
92
  action='store_true',
95
93
  help='Skip source code analysis'
96
94
  )
97
-
95
+
96
+ # Enhanced scraping with Crawl4AI
97
+ parser.add_argument(
98
+ '--enable-crawl4ai',
99
+ action='store_true',
100
+ default=True,
101
+ help='Enable enhanced web scraping with Crawl4AI (default: enabled)'
102
+ )
103
+
104
+ parser.add_argument(
105
+ '--disable-crawl4ai',
106
+ action='store_true',
107
+ help='Disable Crawl4AI enhanced scraping (use basic scraping only)' )
108
+
109
+ parser.add_argument(
110
+ '--crawl4ai-model',
111
+ type=str,
112
+ default='~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
113
+ help='GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
114
+ )
115
+
116
+ parser.add_argument(
117
+ '--enable-sigil-protocol',
118
+ action='store_true',
119
+ help='Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)')
120
+
121
+ parser.add_argument(
122
+ '--sigil-mode',
123
+ choices=['enhanced', 'direct-llm', 'hybrid'],
124
+ default='enhanced',
125
+ help='Sigil processing mode: enhanced (API-based), direct-llm (local), hybrid (both)'
126
+ )
127
+
98
128
  parser.add_argument(
99
129
  '--crate-list',
100
130
  type=str,
101
131
  nargs='+',
102
132
  help='Specific crates to process (space-separated list)'
103
133
  )
104
-
134
+
105
135
  parser.add_argument(
106
136
  '--config-file',
107
137
  type=str,
108
138
  help='JSON config file to override default settings'
109
139
  )
110
-
140
+
111
141
  return parser.parse_args()
112
142
 
143
+
113
144
  def configure_logging(log_level: str = 'INFO'):
114
145
  """Configure logging with both console and file output"""
115
146
  level = getattr(logging, log_level.upper())
116
-
147
+
117
148
  # Clear any existing handlers to avoid conflicts
118
149
  root_logger = logging.getLogger()
119
150
  for handler in root_logger.handlers[:]:
120
151
  root_logger.removeHandler(handler)
121
-
152
+
122
153
  # Set root logger level
123
154
  root_logger.setLevel(level)
124
-
155
+
125
156
  # Create formatters
126
157
  detailed_formatter = logging.Formatter(
127
158
  "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
@@ -130,55 +161,58 @@ def configure_logging(log_level: str = 'INFO'):
130
161
  simple_formatter = logging.Formatter(
131
162
  "%(asctime)s [%(levelname)s] %(message)s"
132
163
  )
133
-
164
+
134
165
  # Console handler
135
166
  console_handler = logging.StreamHandler()
136
167
  console_handler.setLevel(level)
137
168
  console_handler.setFormatter(simple_formatter)
138
169
  root_logger.addHandler(console_handler)
139
-
170
+
140
171
  # File handler with unique timestamp
141
172
  log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
142
173
  try:
143
- file_handler = logging.FileHandler(log_filename, mode='w', encoding='utf-8')
174
+ file_handler = logging.FileHandler(
175
+ log_filename, mode='w', encoding='utf-8')
144
176
  file_handler.setLevel(logging.DEBUG) # Always capture DEBUG+ to file
145
177
  file_handler.setFormatter(detailed_formatter)
146
178
  root_logger.addHandler(file_handler)
147
-
179
+
148
180
  # Log a test message to verify file handler works
149
181
  logging.info(f"Logging initialized - file: {log_filename}")
150
-
182
+
151
183
  except Exception as e:
152
184
  logging.error(f"Failed to create log file {log_filename}: {e}")
153
185
  print(f"Warning: Could not create log file: {e}")
154
-
186
+
155
187
  # Set library loggers to less verbose levels
156
188
  logging.getLogger('requests').setLevel(logging.WARNING)
157
189
  logging.getLogger('urllib3').setLevel(logging.WARNING)
158
190
  logging.getLogger('requests_cache').setLevel(logging.WARNING)
159
191
  logging.getLogger('llama_cpp').setLevel(logging.WARNING)
160
192
 
193
+
161
194
  def check_disk_space():
162
195
  if shutil.disk_usage(".").free < 1_000_000_000: # 1GB
163
196
  logging.warning("Low disk space! This may affect performance.")
164
197
 
198
+
165
199
  def main():
166
200
  # Setup production environment first for optimal logging
167
201
  prod_config = setup_production_environment()
168
-
202
+
169
203
  args = parse_arguments()
170
204
  configure_logging(args.log_level)
171
205
  check_disk_space()
172
-
206
+
173
207
  # Check GitHub token before proceeding
174
208
  if not check_and_setup_github_token():
175
209
  logging.error("GitHub token setup cancelled or failed. Exiting.")
176
210
  sys.exit(1)
177
-
211
+
178
212
  try:
179
213
  # Create config from command line arguments
180
214
  config_kwargs = {}
181
-
215
+
182
216
  # Apply production optimizations if available
183
217
  if prod_config:
184
218
  config_kwargs.update({
@@ -187,7 +221,7 @@ def main():
187
221
  'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
188
222
  'cache_ttl': prod_config.get('cache_ttl', 3600),
189
223
  })
190
-
224
+
191
225
  if args.batch_size:
192
226
  config_kwargs['batch_size'] = args.batch_size
193
227
  if args.workers:
@@ -198,16 +232,23 @@ def main():
198
232
  config_kwargs['max_tokens'] = args.max_tokens
199
233
  if args.checkpoint_interval:
200
234
  config_kwargs['checkpoint_interval'] = args.checkpoint_interval
201
-
202
- # Load config file if provided
235
+ # Load config file if provided
203
236
  if args.config_file:
204
237
  import json
205
238
  with open(args.config_file, 'r') as f:
206
239
  file_config = json.load(f)
207
240
  config_kwargs.update(file_config)
208
-
241
+
242
+ # Handle Crawl4AI configuration
243
+ enable_crawl4ai = args.enable_crawl4ai and not args.disable_crawl4ai if hasattr(
244
+ args, 'disable_crawl4ai') else True
245
+ config_kwargs.update({
246
+ 'enable_crawl4ai': enable_crawl4ai,
247
+ 'crawl4ai_model': getattr(args, 'crawl4ai_model', '~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf')
248
+ })
249
+
209
250
  config = PipelineConfig(**config_kwargs)
210
-
251
+
211
252
  # Pass additional arguments to pipeline
212
253
  pipeline_kwargs = {}
213
254
  if args.output_dir:
@@ -220,15 +261,36 @@ def main():
220
261
  pipeline_kwargs['skip_ai'] = True
221
262
  if args.skip_source_analysis:
222
263
  pipeline_kwargs['skip_source'] = True
223
-
224
- pipeline = CrateDataPipeline(config, **pipeline_kwargs)
225
-
264
+
265
+ # Sigil Protocol integration
266
+ if hasattr(
267
+ args,
268
+ 'enable_sigil_protocol') and args.enable_sigil_protocol:
269
+ # Import Sigil enhanced pipeline
270
+ try:
271
+ import sys
272
+ sys.path.append('.') # Add current directory to path
273
+ from sigil_enhanced_pipeline import SigilCompliantPipeline
274
+
275
+ pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
276
+ logging.info(
277
+ "Starting Sigil Protocol compliant pipeline with Sacred Chain processing")
278
+ except ImportError as e:
279
+ logging.warning(f"Sigil enhanced pipeline not available: {e}")
280
+ logging.info("Falling back to standard pipeline")
281
+ pipeline = CrateDataPipeline(config, **pipeline_kwargs)
282
+ else:
283
+ pipeline = CrateDataPipeline(config, **pipeline_kwargs)
226
284
  logging.info(f"Starting pipeline with {len(vars(args))} arguments")
227
- pipeline.run()
228
-
285
+
286
+ # Run the pipeline asynchronously
287
+ import asyncio
288
+ asyncio.run(pipeline.run())
289
+
229
290
  except Exception as e:
230
291
  logging.critical(f"Pipeline failed: {str(e)}")
231
292
  sys.exit(1)
232
293
 
294
+
233
295
  if __name__ == "__main__":
234
- main()
296
+ main()