rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +53 -2
- rust_crate_pipeline-1.3.1.dist-info/METADATA +357 -0
- rust_crate_pipeline-1.3.1.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/main.py
CHANGED
@@ -1,17 +1,36 @@
 # main.py
-import os
 import sys
 import time
 import logging
 import shutil
 import argparse
-
+import os
+import subprocess
+from typing import Any, TYPE_CHECKING
+
 from .config import PipelineConfig
 from .pipeline import CrateDataPipeline
 from .production_config import setup_production_environment
 from .github_token_checker import check_and_setup_github_token
 
-def parse_arguments():
+# Optional Sigil import with fallback
+_sigil_available = True
+SigilCompliantPipeline = None
+
+try:
+    sys.path.append(".")  # Add current directory to path
+    from sigil_enhanced_pipeline import SigilCompliantPipeline
+
+    _sigil_available = True
+except ImportError:
+    _sigil_available = False
+    if TYPE_CHECKING:
+        from sigil_enhanced_pipeline import SigilCompliantPipeline
+    else:
+        SigilCompliantPipeline = None  # type: ignore[assignment,misc]
+
+
+def parse_arguments() -> argparse.Namespace:
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(
         description="Rust Crate Data Processing Pipeline",
@@ -24,211 +43,476 @@ Examples:
   python -m rust_crate_pipeline --output-dir ./data  # Custom output directory
   python -m rust_crate_pipeline --log-level DEBUG  # Verbose logging
   PRODUCTION=true python -m rust_crate_pipeline  # Production mode (quieter)
-"""
+""",
     )
-
+
     parser.add_argument(
-
+        "--limit",
+        "-l",
         type=int,
         default=None,
-        help=
+        help="Limit the number of crates to process (default: process all)",
     )
-
+
     parser.add_argument(
-
+        "--batch-size",
+        "-b",
         type=int,
         default=10,
-        help=
+        help="Number of crates to process in each batch (default: 10)",
     )
-
+
     parser.add_argument(
-
+        "--workers",
+        "-w",
         type=int,
         default=4,
-        help=
+        help="Number of parallel workers for API requests (default: 4)",
     )
-
+
     parser.add_argument(
-
+        "--output-dir",
+        "-o",
         type=str,
         default=None,
-        help=
+        help=(
+            "Output directory for results (default: auto-generated timestamped "
+            "directory)"
+        ),
     )
-
+
     parser.add_argument(
-
+        "--model-path",
+        "-m",
         type=str,
         default=None,
-        help=
+        help=(
+            "Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-"
+            "6.7b-instruct.Q4_K_M.gguf)"
+        ),
     )
-
+
     parser.add_argument(
-
+        "--max-tokens",
         type=int,
         default=256,
-        help=
+        help="Maximum tokens for LLM generation (default: 256)",
     )
-
+
     parser.add_argument(
-
+        "--checkpoint-interval",
         type=int,
         default=10,
-        help=
+        help="Save checkpoint every N crates (default: 10)",
+    )
+
+    parser.add_argument(
+        "--log-level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        default="INFO",
+        help="Logging level (default: INFO)",
+    )
+
+    parser.add_argument(
+        "--skip-ai",
+        action="store_true",
+        help="Skip AI enrichment (faster, metadata only)",
+    )
+
+    parser.add_argument(
+        "--skip-source-analysis",
+        action="store_true",
+        help="Skip source code analysis",
     )
-
+
+    # Enhanced scraping with Crawl4AI
+    parser.add_argument(
+        "--enable-crawl4ai",
+        action="store_true",
+        default=True,
+        help="Enable enhanced web scraping with Crawl4AI (default: enabled)",
+    )
+
+    parser.add_argument(
+        "--disable-crawl4ai",
+        action="store_true",
+        help="Disable Crawl4AI enhanced scraping (use basic scraping only)",
+    )
+
     parser.add_argument(
-
-
-        default=
-        help=
+        "--crawl4ai-model",
+        type=str,
+        default="~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
+        help=(
+            "GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/"
+            "deepseek-coder-6.7b-instruct.Q4_K_M.gguf)"
+        ),
     )
-
+
     parser.add_argument(
-
-        action=
-        help=
+        "--enable-sigil-protocol",
+        action="store_true",
+        help="Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)",
     )
-
+
     parser.add_argument(
-
-
-
+        "--sigil-mode",
+        choices=["enhanced", "direct-llm", "hybrid"],
+        default="enhanced",
+        help=(
+            "Sigil processing mode: enhanced (API-based), direct-llm (local), "
+            "hybrid (both)"
+        ),
     )
-
+
     parser.add_argument(
-
+        "--crate-list",
         type=str,
-        nargs=
-        help=
+        nargs="+",
+        help="Specific crates to process (space-separated list)",
    )
-
+
     parser.add_argument(
-
+        "--config-file",
         type=str,
-        help=
+        help="JSON config file to override default settings",
     )
-
+
     return parser.parse_args()
 
-
+
+def configure_logging(log_level: str = "INFO") -> None:
     """Configure logging with both console and file output"""
     level = getattr(logging, log_level.upper())
-
+
     # Clear any existing handlers to avoid conflicts
     root_logger = logging.getLogger()
     for handler in root_logger.handlers[:]:
         root_logger.removeHandler(handler)
-
+
     # Set root logger level
     root_logger.setLevel(level)
-
+
     # Create formatters
     detailed_formatter = logging.Formatter(
         "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-        datefmt=
-    )
-    simple_formatter = logging.Formatter(
-        "%(asctime)s [%(levelname)s] %(message)s"
+        datefmt="%Y-%m-%d %H:%M:%S",
     )
-
+    simple_formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
+
     # Console handler
     console_handler = logging.StreamHandler()
     console_handler.setLevel(level)
     console_handler.setFormatter(simple_formatter)
     root_logger.addHandler(console_handler)
-
+
     # File handler with unique timestamp
     log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
     try:
-        file_handler = logging.FileHandler(log_filename, mode=
+        file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
         file_handler.setLevel(logging.DEBUG)  # Always capture DEBUG+ to file
         file_handler.setFormatter(detailed_formatter)
         root_logger.addHandler(file_handler)
-
+
         # Log a test message to verify file handler works
         logging.info(f"Logging initialized - file: {log_filename}")
-
+
     except Exception as e:
         logging.error(f"Failed to create log file {log_filename}: {e}")
         print(f"Warning: Could not create log file: {e}")
-
+
     # Set library loggers to less verbose levels
-    logging.getLogger(
-    logging.getLogger(
-    logging.getLogger(
-    logging.getLogger(
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests_cache").setLevel(logging.WARNING)
+    logging.getLogger("llama_cpp").setLevel(logging.WARNING)
 
-
+
+def check_disk_space() -> None:
+    """Check if there is at least 1GB of free disk space, log a warning if not."""
     if shutil.disk_usage(".").free < 1_000_000_000:  # 1GB
         logging.warning("Low disk space! This may affect performance.")
 
-
+
+def enforce_rule_zero_reinforcement() -> None:
+    """
+    Enforce Rule Zero rigor by validating the canonical DB hash/signature
+    before pipeline actions.
+
+    Allows override for local dev, but enforces in CI/prod. Logs all events
+    for traceability.
+    """
+    enforce: bool = (
+        os.environ.get("ENFORCE_RULE_ZERO", "false").lower() == "true"
+        or os.environ.get("CI", "false").lower() == "true"
+        or os.environ.get("PRODUCTION", "false").lower() == "true"
+    )
+    if not enforce:
+        logging.info("Rule Zero DB hash/signature check skipped (dev mode or override)")
+        return
+
+    # Detect project root robustly (works in subdirs, CI, etc.)
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        project_root: str = result.stdout.strip()
+    except Exception as e:
+        logging.critical(f"Failed to detect project root for Rule Zero validation: {e}")
+        sys.exit(1)
+
+    db_path: str = os.path.join(project_root, "sigil_rag_cache.db")
+    hash_path: str = os.path.join(project_root, "sigil_rag_cache.hash")
+
+    # Validate DB hash/signature using the provided script with explicit arguments
+    try:
+        logging.info("Validating Rule Zero DB hash/signature...")
+        result = subprocess.run(
+            [
+                sys.executable,
+                os.path.join(project_root, "audits", "validate_db_hash.py"),
+                "--db",
+                db_path,
+                "--expected-hash",
+                hash_path,
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.returncode != 0:
+            logging.error(
+                f"Rule Zero DB hash/signature validation failed: "
+                f"{result.stdout}\n{result.stderr}"
+            )
+            # Allow manual override with justification
+            override_justification = os.environ.get("RULE_ZERO_OVERRIDE", "")
+            if override_justification:
+                logging.warning(
+                    "Manual override of Rule Zero DB hash/signature validation enabled."
+                )
+                logging.warning(f"Override justification: {override_justification}")
+            else:
+                logging.critical(
+                    "Rule Zero DB hash/signature validation failed and no override "
+                    "provided. Exiting."
+                )
+                sys.exit(1)
+        else:
+            logging.info("Rule Zero DB hash/signature validation successful.")
+    except Exception as e:
+        logging.critical(
+            f"Exception during Rule Zero DB hash/signature validation: {e}"
+        )
+        sys.exit(1)
+
+    # Log environment metadata for traceability
+    try:
+        subprocess.run(
+            [
+                sys.executable,
+                os.path.join(project_root, "scripts", "cache_env_metadata.py"),
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except Exception as e:
+        logging.warning(f"Failed to cache environment metadata: {e}")
+
+
+def main() -> None:
+    # Enforce Rule Zero rigor before any pipeline action
+    enforce_rule_zero_reinforcement()
+
     # Setup production environment first for optimal logging
-
-
+    logging.debug("Starting main() function - setting up production environment")
+    prod_config: dict[str, Any] = setup_production_environment()
+    logging.debug(f"Production environment setup complete: {bool(prod_config)}")
+
+    logging.debug("Parsing command line arguments")
     args = parse_arguments()
+    logging.debug(f"Arguments parsed: {vars(args)}")
+
+    logging.debug(f"Configuring logging with level: {args.log_level}")
     configure_logging(args.log_level)
+    logging.info("Logging configuration complete")
+
+    logging.debug("Checking disk space")
     check_disk_space()
-
+    logging.debug("Disk space check complete")
+
     # Check GitHub token before proceeding
+    logging.debug("Checking GitHub token setup")
     if not check_and_setup_github_token():
         logging.error("GitHub token setup cancelled or failed. Exiting.")
         sys.exit(1)
-
+    logging.info("GitHub token validation successful")
+
     try:
         # Create config from command line arguments
-
-
+        logging.debug("Building configuration from arguments")
+        config_kwargs: dict[str, Any] = {}
+
         # Apply production optimizations if available
         if prod_config:
-
-
-
-
-
-
-
+            logging.debug(f"Applying production config: {prod_config}")
+            config_kwargs.update(
+                {
+                    "max_retries": prod_config.get("max_retries", 3),
+                    "batch_size": prod_config.get("batch_size", 10),
+                    "checkpoint_interval": prod_config.get("checkpoint_interval", 10),
+                }
+            )
+
         if args.batch_size:
-
+            logging.debug(f"Setting batch_size to {args.batch_size}")
+            config_kwargs["batch_size"] = args.batch_size
         if args.workers:
-
+            logging.debug(f"Setting n_workers to {args.workers}")
+            config_kwargs["n_workers"] = args.workers
         if args.model_path:
-
+            logging.debug(f"Setting model_path to {args.model_path}")
+            config_kwargs["model_path"] = args.model_path
         if args.max_tokens:
-
+            logging.debug(f"Setting max_tokens to {args.max_tokens}")
+            config_kwargs["max_tokens"] = args.max_tokens
         if args.checkpoint_interval:
-
-
+            logging.debug(f"Setting checkpoint_interval to {args.checkpoint_interval}")
+            config_kwargs["checkpoint_interval"] = args.checkpoint_interval
+
         # Load config file if provided
         if args.config_file:
+            logging.debug(f"Loading config file: {args.config_file}")
            import json
-
+
+            with open(args.config_file) as f:
                 file_config = json.load(f)
-
-
+            logging.debug(f"Config file loaded: {file_config}")
+            config_kwargs.update(file_config)  # type: ignore
+
+        # Handle Crawl4AI configuration
+        logging.debug("Configuring Crawl4AI settings")
+        enable_crawl4ai = (
+            args.enable_crawl4ai and not args.disable_crawl4ai
+            if hasattr(args, "disable_crawl4ai")
+            else True
+        )
+        logging.debug(f"Crawl4AI enabled: {enable_crawl4ai}")
+        config_kwargs.update(
+            {
+                "enable_crawl4ai": enable_crawl4ai,
+                "crawl4ai_model": getattr(
+                    args,
+                    "crawl4ai_model",
+                    "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
+                ),
+            }
+        )
+
+        logging.debug(f"Creating PipelineConfig with kwargs: {config_kwargs}")
         config = PipelineConfig(**config_kwargs)
-
+        logging.info("Pipeline configuration created successfully")
+
         # Pass additional arguments to pipeline
-
+        logging.debug("Building pipeline kwargs")
+        pipeline_kwargs: dict[str, Any] = {}
         if args.output_dir:
-
+            logging.debug(f"Setting output_dir to {args.output_dir}")
+            pipeline_kwargs["output_dir"] = args.output_dir
         if args.limit:
-
+            logging.debug(f"Setting limit to {args.limit}")
+            pipeline_kwargs["limit"] = args.limit
         if args.crate_list:
-
+            logging.debug(f"Setting crate_list to {args.crate_list}")
+            pipeline_kwargs["crate_list"] = args.crate_list
         if args.skip_ai:
-
+            logging.debug("Enabling skip_ai mode")
+            pipeline_kwargs["skip_ai"] = True
         if args.skip_source_analysis:
-
-
-
-
-
-            pipeline
-
+            logging.debug("Enabling skip_source mode")
+            pipeline_kwargs["skip_source"] = True
+
+        logging.debug(f"Pipeline kwargs: {pipeline_kwargs}")
+
+        # Sigil Protocol integration - handle pipeline creation properly
+        if hasattr(args, "enable_sigil_protocol") and args.enable_sigil_protocol:
+            logging.info("Sigil Protocol mode requested")
+            logging.debug(
+                f"Sigil available: {_sigil_available}, SigilCompliantPipeline: {
+                    SigilCompliantPipeline is not None}"
+            )
+
+            # Import Sigil enhanced pipeline
+            if _sigil_available and SigilCompliantPipeline is not None:
+                logging.info("Creating Sigil Protocol compliant pipeline")
+                sigil_pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
+                logging.info(
+                    "Starting Sigil Protocol compliant pipeline with "
+                    "Sacred Chain processing"
+                )
+
+                # Run Sigil pipeline (synchronous)
+                logging.debug("About to run Sigil pipeline - this is synchronous")
+                result = sigil_pipeline.run()  # type: ignore[misc]
+                logging.debug(f"Sigil pipeline run() returned: {result}")
+
+                if result:
+                    logging.info("Sigil pipeline completed successfully")
+                else:
+                    logging.warning("Sigil pipeline completed with no results")
+            else:
+                logging.warning("Sigil enhanced pipeline not available")
+                logging.info("Falling back to standard pipeline")
+
+                logging.debug("Creating standard pipeline as Sigil fallback")
+                standard_pipeline = CrateDataPipeline(config)
+                logging.debug("Standard pipeline created, about to run asynchronously")
+
+                # Run standard pipeline (asynchronous)
+                import asyncio
+
+                logging.debug("Starting asyncio.run() for standard pipeline")
+                result = asyncio.run(
+                    standard_pipeline.run()
+                )  # type: ignore[misc,assignment]
+                logging.debug(f"Standard pipeline asyncio.run() returned: {result}")
+
+                if result:
+                    logging.info("Standard pipeline completed successfully")
+                else:
+                    logging.warning("Standard pipeline completed with no results")
+        else:
+            logging.info("Standard pipeline mode")
+            logging.debug("Creating standard pipeline")
+            standard_pipeline = CrateDataPipeline(config)
+            logging.info(f"Starting pipeline with {len(vars(args))} arguments")
+            logging.debug("Standard pipeline created, about to run asynchronously")
+
+            # Run standard pipeline (asynchronous)
+            import asyncio
+
+            logging.debug("Starting asyncio.run() for standard pipeline")
+            result = asyncio.run(
+                standard_pipeline.run()
+            )  # type: ignore[misc,assignment]
+            logging.debug(f"Standard pipeline asyncio.run() returned: {result}")
+
+            if result:
+                logging.info("Standard pipeline completed successfully")
+            else:
+                logging.warning("Standard pipeline completed with no results")
+
+        logging.info("Main function execution completed successfully")
+
     except Exception as e:
         logging.critical(f"Pipeline failed: {str(e)}")
+        logging.debug(f"Exception details: {type(e).__name__}: {str(e)}", exc_info=True)
         sys.exit(1)
 
+
 if __name__ == "__main__":
-    main()
+    main()