rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/main.py
CHANGED
@@ -4,13 +4,33 @@ import time
 import logging
 import shutil
 import argparse
+import os
+import subprocess
+from typing import Any, TYPE_CHECKING
+
 from .config import PipelineConfig
 from .pipeline import CrateDataPipeline
 from .production_config import setup_production_environment
 from .github_token_checker import check_and_setup_github_token
 
+# Optional Sigil import with fallback
+_sigil_available = True
+SigilCompliantPipeline = None
+
+try:
+    sys.path.append(".")  # Add current directory to path
+    from sigil_enhanced_pipeline import SigilCompliantPipeline
+
+    _sigil_available = True
+except ImportError:
+    _sigil_available = False
+    if TYPE_CHECKING:
+        from sigil_enhanced_pipeline import SigilCompliantPipeline
+    else:
+        SigilCompliantPipeline = None  # type: ignore[assignment,misc]
 
-def parse_arguments():
+
+def parse_arguments() -> argparse.Namespace:
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(
         description="Rust Crate Data Processing Pipeline",
@@ -23,126 +43,145 @@ Examples:
   python -m rust_crate_pipeline --output-dir ./data # Custom output directory
   python -m rust_crate_pipeline --log-level DEBUG # Verbose logging
   PRODUCTION=true python -m rust_crate_pipeline # Production mode (quieter)
-    """
+    """,
     )
 
     parser.add_argument(
-
+        "--limit",
+        "-l",
         type=int,
         default=None,
-        help=
+        help="Limit the number of crates to process (default: process all)",
     )
 
     parser.add_argument(
-
+        "--batch-size",
+        "-b",
         type=int,
         default=10,
-        help=
+        help="Number of crates to process in each batch (default: 10)",
     )
 
     parser.add_argument(
-
+        "--workers",
+        "-w",
         type=int,
         default=4,
-        help=
+        help="Number of parallel workers for API requests (default: 4)",
     )
 
     parser.add_argument(
-
+        "--output-dir",
+        "-o",
         type=str,
         default=None,
-        help=
+        help=(
+            "Output directory for results (default: auto-generated timestamped "
+            "directory)"
+        ),
     )
 
     parser.add_argument(
-
+        "--model-path",
+        "-m",
         type=str,
         default=None,
-        help=
+        help=(
+            "Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-"
+            "6.7b-instruct.Q4_K_M.gguf)"
+        ),
     )
 
     parser.add_argument(
-
+        "--max-tokens",
         type=int,
         default=256,
-        help=
+        help="Maximum tokens for LLM generation (default: 256)",
     )
 
     parser.add_argument(
-
+        "--checkpoint-interval",
         type=int,
         default=10,
-        help=
+        help="Save checkpoint every N crates (default: 10)",
     )
 
-    parser.add_argument(
-
-
-
-
+    parser.add_argument(
+        "--log-level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        default="INFO",
+        help="Logging level (default: INFO)",
+    )
 
     parser.add_argument(
-
-        action=
-        help=
+        "--skip-ai",
+        action="store_true",
+        help="Skip AI enrichment (faster, metadata only)",
    )
 
     parser.add_argument(
-
-        action=
-        help=
+        "--skip-source-analysis",
+        action="store_true",
+        help="Skip source code analysis",
    )
 
     # Enhanced scraping with Crawl4AI
     parser.add_argument(
-
-        action=
+        "--enable-crawl4ai",
+        action="store_true",
         default=True,
-        help=
+        help="Enable enhanced web scraping with Crawl4AI (default: enabled)",
    )
 
     parser.add_argument(
-
-        action=
-        help=
+        "--disable-crawl4ai",
+        action="store_true",
+        help="Disable Crawl4AI enhanced scraping (use basic scraping only)",
    )
 
     parser.add_argument(
-
+        "--crawl4ai-model",
         type=str,
-        default=
-        help=
+        default="~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
+        help=(
+            "GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/"
+            "deepseek-coder-6.7b-instruct.Q4_K_M.gguf)"
+        ),
    )
 
     parser.add_argument(
-
-        action=
-        help=
+        "--enable-sigil-protocol",
+        action="store_true",
+        help="Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)",
+    )
 
     parser.add_argument(
-
-        choices=[
-        default=
-        help=
+        "--sigil-mode",
+        choices=["enhanced", "direct-llm", "hybrid"],
+        default="enhanced",
+        help=(
+            "Sigil processing mode: enhanced (API-based), direct-llm (local), "
+            "hybrid (both)"
+        ),
    )
 
     parser.add_argument(
-
+        "--crate-list",
         type=str,
-        nargs=
-        help=
+        nargs="+",
+        help="Specific crates to process (space-separated list)",
    )
 
     parser.add_argument(
-
+        "--config-file",
         type=str,
-        help=
+        help="JSON config file to override default settings",
    )
 
     return parser.parse_args()
 
 
-def configure_logging(log_level: str = 'INFO'):
+def configure_logging(log_level: str = "INFO") -> None:
     """Configure logging with both console and file output"""
     level = getattr(logging, log_level.upper())
 
@@ -157,11 +196,9 @@ def configure_logging(log_level: str = 'INFO'):
     # Create formatters
     detailed_formatter = logging.Formatter(
         "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-        datefmt=
-    )
-    simple_formatter = logging.Formatter(
-        "%(asctime)s [%(levelname)s] %(message)s"
+        datefmt="%Y-%m-%d %H:%M:%S",
     )
+    simple_formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
 
     # Console handler
     console_handler = logging.StreamHandler()
@@ -172,8 +209,7 @@ def configure_logging(log_level: str = 'INFO'):
     # File handler with unique timestamp
     log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
     try:
-        file_handler = logging.FileHandler(
-            log_filename, mode='w', encoding='utf-8')
+        file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
         file_handler.setLevel(logging.DEBUG)  # Always capture DEBUG+ to file
         file_handler.setFormatter(detailed_formatter)
         root_logger.addHandler(file_handler)
@@ -186,110 +222,294 @@ def configure_logging(log_level: str = 'INFO'):
         print(f"Warning: Could not create log file: {e}")
 
     # Set library loggers to less verbose levels
-    logging.getLogger(
-    logging.getLogger(
-    logging.getLogger(
-    logging.getLogger(
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests_cache").setLevel(logging.WARNING)
+    logging.getLogger("llama_cpp").setLevel(logging.WARNING)
 
 
-def check_disk_space():
+def check_disk_space() -> None:
+    """Check if there is at least 1GB of free disk space, log a warning if not."""
     if shutil.disk_usage(".").free < 1_000_000_000:  # 1GB
         logging.warning("Low disk space! This may affect performance.")
 
 
-def main():
+def enforce_rule_zero_reinforcement() -> None:
+    """
+    Enforce Rule Zero rigor by validating the canonical DB hash/signature
+    before pipeline actions.
+
+    Allows override for local dev, but enforces in CI/prod. Logs all events
+    for traceability.
+    """
+    enforce: bool = (
+        os.environ.get("ENFORCE_RULE_ZERO", "false").lower() == "true"
+        or os.environ.get("CI", "false").lower() == "true"
+        or os.environ.get("PRODUCTION", "false").lower() == "true"
+    )
+    if not enforce:
+        logging.info("Rule Zero DB hash/signature check skipped (dev mode or override)")
+        return
+
+    # Detect project root robustly (works in subdirs, CI, etc.)
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        project_root: str = result.stdout.strip()
+    except Exception as e:
+        logging.critical(f"Failed to detect project root for Rule Zero validation: {e}")
+        sys.exit(1)
+
+    db_path: str = os.path.join(project_root, "sigil_rag_cache.db")
+    hash_path: str = os.path.join(project_root, "sigil_rag_cache.hash")
+
+    # Validate DB hash/signature using the provided script with explicit arguments
+    try:
+        logging.info("Validating Rule Zero DB hash/signature...")
+        result = subprocess.run(
+            [
+                sys.executable,
+                os.path.join(project_root, "audits", "validate_db_hash.py"),
+                "--db",
+                db_path,
+                "--expected-hash",
+                hash_path,
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.returncode != 0:
+            logging.error(
+                f"Rule Zero DB hash/signature validation failed: "
+                f"{result.stdout}\n{result.stderr}"
+            )
+            # Allow manual override with justification
+            override_justification = os.environ.get("RULE_ZERO_OVERRIDE", "")
+            if override_justification:
+                logging.warning(
+                    "Manual override of Rule Zero DB hash/signature validation enabled."
+                )
+                logging.warning(f"Override justification: {override_justification}")
+            else:
+                logging.critical(
+                    "Rule Zero DB hash/signature validation failed and no override "
+                    "provided. Exiting."
+                )
+                sys.exit(1)
+        else:
+            logging.info("Rule Zero DB hash/signature validation successful.")
+    except Exception as e:
+        logging.critical(
+            f"Exception during Rule Zero DB hash/signature validation: {e}"
+        )
+        sys.exit(1)
+
+    # Log environment metadata for traceability
+    try:
+        subprocess.run(
+            [
+                sys.executable,
+                os.path.join(project_root, "scripts", "cache_env_metadata.py"),
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except Exception as e:
+        logging.warning(f"Failed to cache environment metadata: {e}")
+
+
+def main() -> None:
+    # Enforce Rule Zero rigor before any pipeline action
+    enforce_rule_zero_reinforcement()
+
     # Setup production environment first for optimal logging
-
+    logging.debug("Starting main() function - setting up production environment")
+    prod_config: dict[str, Any] = setup_production_environment()
+    logging.debug(f"Production environment setup complete: {bool(prod_config)}")
 
+    logging.debug("Parsing command line arguments")
     args = parse_arguments()
+    logging.debug(f"Arguments parsed: {vars(args)}")
+
+    logging.debug(f"Configuring logging with level: {args.log_level}")
     configure_logging(args.log_level)
+    logging.info("Logging configuration complete")
+
+    logging.debug("Checking disk space")
     check_disk_space()
+    logging.debug("Disk space check complete")
 
     # Check GitHub token before proceeding
+    logging.debug("Checking GitHub token setup")
     if not check_and_setup_github_token():
         logging.error("GitHub token setup cancelled or failed. Exiting.")
         sys.exit(1)
+    logging.info("GitHub token validation successful")
 
     try:
         # Create config from command line arguments
-
+        logging.debug("Building configuration from arguments")
+        config_kwargs: dict[str, Any] = {}
 
         # Apply production optimizations if available
         if prod_config:
-
-
-
-
-
-
+            logging.debug(f"Applying production config: {prod_config}")
+            config_kwargs.update(
+                {
+                    "max_retries": prod_config.get("max_retries", 3),
+                    "batch_size": prod_config.get("batch_size", 10),
+                    "checkpoint_interval": prod_config.get("checkpoint_interval", 10),
+                }
+            )
 
         if args.batch_size:
-
+            logging.debug(f"Setting batch_size to {args.batch_size}")
+            config_kwargs["batch_size"] = args.batch_size
         if args.workers:
-
+            logging.debug(f"Setting n_workers to {args.workers}")
+            config_kwargs["n_workers"] = args.workers
         if args.model_path:
-
+            logging.debug(f"Setting model_path to {args.model_path}")
+            config_kwargs["model_path"] = args.model_path
         if args.max_tokens:
-
+            logging.debug(f"Setting max_tokens to {args.max_tokens}")
+            config_kwargs["max_tokens"] = args.max_tokens
        if args.checkpoint_interval:
-
-
+            logging.debug(f"Setting checkpoint_interval to {args.checkpoint_interval}")
+            config_kwargs["checkpoint_interval"] = args.checkpoint_interval
+
+        # Load config file if provided
         if args.config_file:
+            logging.debug(f"Loading config file: {args.config_file}")
             import json
-
+
+            with open(args.config_file) as f:
                 file_config = json.load(f)
-
+            logging.debug(f"Config file loaded: {file_config}")
+            config_kwargs.update(file_config)  # type: ignore
 
         # Handle Crawl4AI configuration
-
-
-
-
-
-
-
+        logging.debug("Configuring Crawl4AI settings")
+        enable_crawl4ai = (
+            args.enable_crawl4ai and not args.disable_crawl4ai
+            if hasattr(args, "disable_crawl4ai")
+            else True
+        )
+        logging.debug(f"Crawl4AI enabled: {enable_crawl4ai}")
+        config_kwargs.update(
+            {
+                "enable_crawl4ai": enable_crawl4ai,
+                "crawl4ai_model": getattr(
+                    args,
+                    "crawl4ai_model",
+                    "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
+                ),
+            }
+        )
+
+        logging.debug(f"Creating PipelineConfig with kwargs: {config_kwargs}")
         config = PipelineConfig(**config_kwargs)
+        logging.info("Pipeline configuration created successfully")
 
         # Pass additional arguments to pipeline
-
+        logging.debug("Building pipeline kwargs")
+        pipeline_kwargs: dict[str, Any] = {}
         if args.output_dir:
-
+            logging.debug(f"Setting output_dir to {args.output_dir}")
+            pipeline_kwargs["output_dir"] = args.output_dir
         if args.limit:
-
+            logging.debug(f"Setting limit to {args.limit}")
+            pipeline_kwargs["limit"] = args.limit
        if args.crate_list:
-
+            logging.debug(f"Setting crate_list to {args.crate_list}")
+            pipeline_kwargs["crate_list"] = args.crate_list
        if args.skip_ai:
-
+            logging.debug("Enabling skip_ai mode")
+            pipeline_kwargs["skip_ai"] = True
        if args.skip_source_analysis:
-
+            logging.debug("Enabling skip_source mode")
+            pipeline_kwargs["skip_source"] = True
 
-
-
-
-
-
-
-
-
-            from sigil_enhanced_pipeline import SigilCompliantPipeline
+        logging.debug(f"Pipeline kwargs: {pipeline_kwargs}")
+
+        # Sigil Protocol integration - handle pipeline creation properly
+        if hasattr(args, "enable_sigil_protocol") and args.enable_sigil_protocol:
+            logging.info("Sigil Protocol mode requested")
+            logging.debug(
+                f"Sigil available: {_sigil_available}, SigilCompliantPipeline: {SigilCompliantPipeline is not None}"
+            )
 
-
+            # Import Sigil enhanced pipeline
+            if _sigil_available and SigilCompliantPipeline is not None:
+                logging.info("Creating Sigil Protocol compliant pipeline")
+                sigil_pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
                 logging.info(
-                    "Starting Sigil Protocol compliant pipeline with
-
-
+                    "Starting Sigil Protocol compliant pipeline with "
+                    "Sacred Chain processing"
+                )
+
+                # Run Sigil pipeline (synchronous)
+                logging.debug("About to run Sigil pipeline - this is synchronous")
+                result = sigil_pipeline.run()  # type: ignore[misc]
+                logging.debug(f"Sigil pipeline run() returned: {result}")
+
+                if result:
+                    logging.info("Sigil pipeline completed successfully")
+                else:
+                    logging.warning("Sigil pipeline completed with no results")
+            else:
+                logging.warning("Sigil enhanced pipeline not available")
                 logging.info("Falling back to standard pipeline")
-
+
+                logging.debug("Creating standard pipeline as Sigil fallback")
+                standard_pipeline = CrateDataPipeline(config)
+                logging.debug("Standard pipeline created, about to run asynchronously")
+
+                # Run standard pipeline (asynchronous)
+                import asyncio
+
+                logging.debug("Starting asyncio.run() for standard pipeline")
+                result = asyncio.run(
+                    standard_pipeline.run()
+                )  # type: ignore[misc,assignment]
+                logging.debug(f"Standard pipeline asyncio.run() returned: {result}")
+
+                if result:
+                    logging.info("Standard pipeline completed successfully")
+                else:
+                    logging.warning("Standard pipeline completed with no results")
         else:
-            pipeline
-
+            logging.info("Standard pipeline mode")
+            logging.debug("Creating standard pipeline")
+            standard_pipeline = CrateDataPipeline(config)
+            logging.info(f"Starting pipeline with {len(vars(args))} arguments")
+            logging.debug("Standard pipeline created, about to run asynchronously")
+
+            # Run standard pipeline (asynchronous)
+            import asyncio
+
+            logging.debug("Starting asyncio.run() for standard pipeline")
+            result = asyncio.run(
+                standard_pipeline.run()
+            )  # type: ignore[misc,assignment]
+            logging.debug(f"Standard pipeline asyncio.run() returned: {result}")
+
+            if result:
+                logging.info("Standard pipeline completed successfully")
+            else:
+                logging.warning("Standard pipeline completed with no results")
 
-
-        import asyncio
-        asyncio.run(pipeline.run())
+        logging.info("Main function execution completed successfully")
 
     except Exception as e:
         logging.critical(f"Pipeline failed: {str(e)}")
+        logging.debug(f"Exception details: {type(e).__name__}: {str(e)}", exc_info=True)
         sys.exit(1)
 
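
The headline structural change in this file is the module-level optional import of sigil_enhanced_pipeline with a TYPE_CHECKING fallback, which main() later double-checks before constructing the Sigil pipeline. A minimal, self-contained sketch of that pattern follows; optional_dep and Helper are hypothetical stand-ins for sigil_enhanced_pipeline and SigilCompliantPipeline, not names from this package:

# Sketch of the optional-import-with-fallback pattern added in main.py.
# "optional_dep" and "Helper" are hypothetical stand-ins.
from typing import TYPE_CHECKING

_dep_available = True
try:
    from optional_dep import Helper
except ImportError:
    _dep_available = False
    if TYPE_CHECKING:
        # Seen only by static type checkers; never executed at runtime.
        from optional_dep import Helper
    else:
        Helper = None  # type: ignore[assignment,misc]


def run() -> None:
    # Same double guard main() uses before SigilCompliantPipeline(...).
    if _dep_available and Helper is not None:
        Helper()
    else:
        print("optional dependency missing; using the fallback path")


if __name__ == "__main__":
    run()

Because TYPE_CHECKING is False at runtime, the re-import inside the except branch is evaluated only by static type checkers, so the name stays typed without requiring the dependency to be installed; the runtime guard then degrades gracefully, mirroring how main() falls back to CrateDataPipeline when the Sigil pipeline is unavailable.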