rust_crate_pipeline-1.1.0-py3-none-any.whl

This diff represents the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -0,0 +1,435 @@
+ # analysis.py
+ import os
+ import re
+ import io
+ import json
+ import time
+ import tarfile
+ import tempfile
+ import subprocess
+ import requests
+ from datetime import datetime
+ from bs4 import BeautifulSoup
+ from typing import Dict, Optional, List
+ from .config import EnrichedCrate
+
+ class SourceAnalyzer:
+     @staticmethod
+     def analyze_crate_source(crate: EnrichedCrate) -> Dict:
+         """Orchestrate source analysis from multiple sources"""
+         crate_name = crate.name
+         version = crate.version
+         repo_url = crate.repository
+
+         # Method 1: Try to download from crates.io
+         try:
+             url = f"https://crates.io/api/v1/crates/{crate_name}/{version}/download"
+             response = requests.get(url, stream=True)
+
+             if response.ok:
+                 # We got the tarball, analyze it
+                 return SourceAnalyzer.analyze_crate_tarball(response.content)
+         except Exception as e:
+             print(f"Failed to download from crates.io: {str(e)}")
+
+         # Method 2: Try GitHub if we have a GitHub URL
+         if repo_url and "github.com" in repo_url:
+             try:
+                 # Extract owner/repo from URL
+                 match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+                 if match:
+                     owner, repo_name = match.groups()
+                     repo_name = re.sub(r'\.git$', '', repo_name)  # Remove .git suffix
+
+                     # Try to download tarball from GitHub
+                     github_url = f"https://api.github.com/repos/{owner}/{repo_name}/tarball"
+                     response = requests.get(github_url)
+
+                     if response.ok:
+                         return SourceAnalyzer.analyze_github_tarball(response.content)
+             except Exception as e:
+                 print(f"Failed to analyze from GitHub: {str(e)}")
+
+         # Method 3: Try lib.rs
+         try:
+             # lib.rs doesn't have a direct download API, but its pages link to crates.io or GitHub
+             url = f"https://lib.rs/crates/{crate_name}"
+             response = requests.get(url)
+
+             if response.ok:
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 # Look for repository links
+                 repo_links = soup.select('a[href*="github.com"]')
+                 if repo_links:
+                     repo_url = repo_links[0]['href']
+
+                     # We found a GitHub link, now analyze it
+                     return SourceAnalyzer.analyze_crate_source_from_repo(crate_name, version, repo_url)
+         except Exception as e:
+             print(f"Failed to analyze from lib.rs: {str(e)}")
+
+         # If we get here, we failed to analyze from any source
+         return {
+             "error": "Could not analyze crate from any source",
+             "attempted_sources": ["crates.io", "github", "lib.rs"],
+             "file_count": 0,
+             "loc": 0
+         }
+
+     @staticmethod
+     def analyze_crate_tarball(content: bytes) -> Dict:
+         """Analyze a .crate tarball from crates.io"""
+         metrics = {
+             "file_count": 0,
+             "loc": 0,
+             "complexity": [],
+             "types": [],
+             "traits": [],
+             "functions": [],
+             "has_tests": False,
+             "has_examples": False,
+             "has_benchmarks": False
+         }
+
+         try:
+             # Open the tar file from the content
+             tar_content = io.BytesIO(content)
+             with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
+                 # Get list of Rust files
+                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
+                 metrics["file_count"] = len(rust_files)
+
+                 # Check for test/example/bench directories
+                 all_files = tar.getnames()
+                 metrics["has_tests"] = any('test' in f.lower() for f in all_files)
+                 metrics["has_examples"] = any('example' in f.lower() for f in all_files)
+                 metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
+
+                 # Analyze each Rust file
+                 for filename in rust_files:
+                     try:
+                         member = tar.getmember(filename)
+                         if member.isfile():
+                             file_content = tar.extractfile(member)
+                             if file_content:
+                                 content_str = file_content.read().decode('utf-8', errors='ignore')
+
+                                 # Count lines of code
+                                 metrics["loc"] += len(content_str.splitlines())
+
+                                 # Extract code elements
+                                 fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
+                                 struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
+                                 trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
+
+                                 metrics["functions"].extend(fn_matches)
+                                 metrics["types"].extend(struct_matches)
+                                 metrics["traits"].extend(trait_matches)
+                     except Exception as e:
+                         print(f"Error analyzing file {filename}: {str(e)}")
+
+         except Exception as e:
+             metrics["error"] = str(e)
+
+         return metrics
+
+     @staticmethod
+     def analyze_github_tarball(content: bytes) -> Dict:
+         """Analyze a GitHub tarball (which has a different structure)"""
+         metrics = {
+             "file_count": 0,
+             "loc": 0,
+             "complexity": [],
+             "types": [],
+             "traits": [],
+             "functions": [],
+             "has_tests": False,
+             "has_examples": False,
+             "has_benchmarks": False
+         }
+
+         try:
+             # GitHub tarballs are typically gzipped tar files
+             tar_content = io.BytesIO(content)
+             with tarfile.open(fileobj=tar_content, mode='r:gz') as tar:
+                 # GitHub tarballs include the repo name and commit as the top dir,
+                 # so we need to handle the different structure
+                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
+                 metrics["file_count"] = len(rust_files)
+
+                 # Check for test/example/bench directories
+                 all_files = tar.getnames()
+                 metrics["has_tests"] = any('test' in f.lower() for f in all_files)
+                 metrics["has_examples"] = any('example' in f.lower() for f in all_files)
+                 metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
+
+                 # Analyze each Rust file (same as crate tarball)
+                 for filename in rust_files:
+                     try:
+                         member = tar.getmember(filename)
+                         if member.isfile():
+                             file_content = tar.extractfile(member)
+                             if file_content:
+                                 content_str = file_content.read().decode('utf-8', errors='ignore')
+
+                                 # Count lines of code
+                                 metrics["loc"] += len(content_str.splitlines())
+
+                                 # Extract code elements
+                                 fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
+                                 struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
+                                 trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
+
+                                 metrics["functions"].extend(fn_matches)
+                                 metrics["types"].extend(struct_matches)
+                                 metrics["traits"].extend(trait_matches)
+                     except Exception as e:
+                         print(f"Error analyzing file {filename}: {str(e)}")
+
+         except Exception as e:
+             metrics["error"] = str(e)
+
+         return metrics
+
+     @staticmethod
+     def analyze_local_directory(directory: str) -> Dict:
+         """Analyze source code from a local directory"""
+         metrics = {
+             "file_count": 0,
+             "loc": 0,
+             "complexity": [],
+             "types": [],
+             "traits": [],
+             "functions": [],
+             "has_tests": False,
+             "has_examples": False,
+             "has_benchmarks": False
+         }
+
+         try:
+             # Find all Rust files
+             rust_files = []
+             for root, _, files in os.walk(directory):
+                 if "target" in root or ".git" in root:  # Skip build dirs and git
+                     continue
+                 rust_files.extend([os.path.join(root, f) for f in files if f.endswith(".rs")])
+
+             metrics["file_count"] = len(rust_files)
+
+             # Check if the crate has tests/examples/benchmarks
+             metrics["has_tests"] = any(os.path.exists(os.path.join(directory, d))
+                                        for d in ["tests", "test"])
+             metrics["has_examples"] = os.path.exists(os.path.join(directory, "examples"))
+             metrics["has_benchmarks"] = os.path.exists(os.path.join(directory, "benches"))
+
+             # Analyze each Rust file
+             for file_path in rust_files:
+                 try:
+                     with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                         content = f.read()
+
+                     # Count lines of code
+                     metrics["loc"] += len(content.splitlines())
+
+                     # Extract code elements
+                     fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content)
+                     struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content)
+                     trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content)
+
+                     metrics["functions"].extend(fn_matches)
+                     metrics["types"].extend(struct_matches)
+                     metrics["traits"].extend(trait_matches)
+
+                 except Exception as e:
+                     print(f"Error analyzing file {file_path}: {str(e)}")
+
+         except Exception as e:
+             metrics["error"] = str(e)
+
+         return metrics
+
+     @staticmethod
+     def analyze_crate_source_from_repo(crate_name: str, version: str, repo_url: str) -> Dict:
+         """Clone and analyze a crate's source code from repository"""
+         temp_dir = f"/tmp/rust_analysis/{crate_name}"
+         os.makedirs(temp_dir, exist_ok=True)
+
+         try:
+             # Clone repository
+             if not os.path.exists(f"{temp_dir}/.git"):
+                 subprocess.run(["git", "clone", "--depth=1", repo_url, temp_dir],
+                                capture_output=True, text=True, check=True)
+
+             return SourceAnalyzer.analyze_local_directory(temp_dir)
+
+         except Exception as e:
+             return {
+                 "error": f"Failed to clone and analyze repository: {str(e)}",
+                 "file_count": 0,
+                 "loc": 0
+             }
+         finally:
+             # Clean up (optional)
+             # subprocess.run(["rm", "-rf", temp_dir], capture_output=True)
+             pass
+
+ class SecurityAnalyzer:
+     @staticmethod
+     def check_security_metrics(crate: EnrichedCrate) -> Dict:
+         """Check security metrics for a crate"""
+         security_data = {
+             "advisories": [],
+             "vulnerability_count": 0,
+             "cargo_audit": None,
+             "clippy_warnings": 0,
+             "test_coverage": None
+         }
+
+         crate_name = crate.name
+         version = crate.version
+
+         # Check RustSec Advisory Database
+         try:
+             # This would require the RustSec advisory database;
+             # for now, just return placeholder data
+             advisories_url = f"https://rustsec.org/advisories/{crate_name}.json"
+             response = requests.get(advisories_url)
+             if response.ok:
+                 advisories = response.json()
+                 security_data["advisories"] = advisories
+                 security_data["vulnerability_count"] = len(advisories)
+         except Exception:
+             pass
+
+         # Check for common security patterns in code
+         try:
+             # This would analyze the source code for unsafe blocks, etc.
+             # Placeholder for now
+             security_data["unsafe_blocks"] = 0
+             security_data["security_patterns"] = []
+         except Exception:
+             pass
+
+         return security_data
+
+ class UserBehaviorAnalyzer:
+     @staticmethod
+     def fetch_user_behavior_data(crate: EnrichedCrate) -> Dict:
+         """Fetch user behavior data from GitHub and crates.io"""
+         result = {
+             "issues": [],
+             "pull_requests": [],
+             "version_adoption": {},
+             "community_metrics": {}
+         }
+
+         crate_name = crate.name
+         repo_url = crate.repository
+
+         # Extract owner/repo from URL
+         if not repo_url or "github.com" not in repo_url:
+             return result
+
+         parts = repo_url.rstrip('/').split('/')
+         if len(parts) < 2:
+             return result
+         owner, repo = parts[-2], re.sub(r'\.git$', '', parts[-1])
+
+         # Set up GitHub API access - use a token if available
+         headers = {"Accept": "application/vnd.github.v3+json"}
+         if os.environ.get("GITHUB_TOKEN"):
+             headers["Authorization"] = f"token {os.environ.get('GITHUB_TOKEN')}"
+
+         # Fetch recent issues and PRs
+         try:
+             # Get issues (last 30)
+             issues_url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=30"
+             issues_resp = requests.get(issues_url, headers=headers)
+             if issues_resp.ok:
+                 issues_data = issues_resp.json()
+
+                 # Process issue data
+                 for issue in issues_data:
+                     if "pull_request" in issue:
+                         # This is a PR, not an issue
+                         result["pull_requests"].append({
+                             "number": issue["number"],
+                             "title": issue["title"],
+                             "state": issue["state"],
+                             "created_at": issue["created_at"],
+                             "closed_at": issue["closed_at"],
+                             "url": issue["html_url"]
+                         })
+                     else:
+                         # Regular issue
+                         result["issues"].append({
+                             "number": issue["number"],
+                             "title": issue["title"],
+                             "state": issue["state"],
+                             "created_at": issue["created_at"],
+                             "closed_at": issue["closed_at"],
+                             "url": issue["html_url"]
+                         })
+
+             # Fetch commit activity for the past year
+             commits_url = f"https://api.github.com/repos/{owner}/{repo}/stats/commit_activity"
+             commits_resp = requests.get(commits_url, headers=headers)
+             if commits_resp.ok:
+                 result["community_metrics"]["commit_activity"] = commits_resp.json()
+
+             # Rate limiting - be nice to the GitHub API
+             time.sleep(1)
+         except Exception as e:
+             print(f"Error fetching GitHub data: {str(e)}")
+
+         # Get version adoption data from crates.io
+         try:
+             versions_url = f"https://crates.io/api/v1/crates/{crate_name}/versions"
+             versions_resp = requests.get(versions_url)
+             if versions_resp.ok:
+                 versions_data = versions_resp.json()
+                 versions = versions_data.get("versions", [])
+
+                 # Process the ten most recent versions
+                 for version in versions[:10]:
+                     version_num = version["num"]
+                     downloads = version["downloads"]
+                     created_at = version["created_at"]
+
+                     result["version_adoption"][version_num] = {
+                         "downloads": downloads,
+                         "created_at": created_at
+                     }
+         except Exception as e:
+             print(f"Error fetching crates.io version data: {str(e)}")
+
+         return result
+
+ class DependencyAnalyzer:
+     @staticmethod
+     def analyze_dependencies(crates: List[EnrichedCrate]) -> Dict:
+         """Analyze dependencies between crates"""
+         dependency_graph = {}
+         crate_names = {crate.name for crate in crates}
+
+         for crate in crates:
+             deps = []
+             for dep in crate.dependencies:
+                 if dep.get("crate_id") in crate_names:
+                     deps.append(dep.get("crate_id"))
+             dependency_graph[crate.name] = deps
+
+         # Find most depended-upon crates
+         reverse_deps = {}
+         for crate_name, deps in dependency_graph.items():
+             for dep in deps:
+                 if dep not in reverse_deps:
+                     reverse_deps[dep] = []
+                 reverse_deps[dep].append(crate_name)
+
+         return {
+             "dependency_graph": dependency_graph,
+             "reverse_dependencies": reverse_deps,
+             "most_depended": sorted(reverse_deps.items(), key=lambda x: len(x[1]), reverse=True)[:10]
+         }
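
A quick usage sketch for the fallback chain above. This is illustrative only: the fully qualified import path and the crate values are assumptions, and EnrichedCrate is the dataclass defined in config.py below.

    from rust_crate_pipeline.analysis import SourceAnalyzer
    from rust_crate_pipeline.config import EnrichedCrate

    # Hypothetical crate record; analyze_crate_source only reads name,
    # version, and repository, but the dataclass requires the other fields.
    crate = EnrichedCrate(
        name="serde", version="1.0.0", description="",
        repository="https://github.com/serde-rs/serde",
        keywords=[], categories=[], readme="", downloads=0,
    )
    metrics = SourceAnalyzer.analyze_crate_source(crate)
    print(metrics.get("file_count"), metrics.get("loc"))
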
@@ -0,0 +1,46 @@
+ # config.py
+ import os
+ from dataclasses import dataclass, field
+ from typing import Optional, Dict, Any, List
+
+ @dataclass
+ class PipelineConfig:
+     model_path: str = os.path.expanduser("~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
+     max_tokens: int = 256
+     model_token_limit: int = 4096
+     prompt_token_margin: int = 3000
+     checkpoint_interval: int = 10
+     max_retries: int = 3
+     github_token: str = os.getenv("GITHUB_TOKEN", "")
+     cache_ttl: int = 3600  # 1 hour
+     batch_size: int = 10
+     n_workers: int = 4
+
+ @dataclass
+ class CrateMetadata:
+     name: str
+     version: str
+     description: str
+     repository: str
+     keywords: List[str]
+     categories: List[str]
+     readme: str
+     downloads: int
+     github_stars: int = 0
+     dependencies: List[Dict[str, Any]] = field(default_factory=list)
+     features: List[Dict[str, Any]] = field(default_factory=list)
+     code_snippets: List[str] = field(default_factory=list)
+     readme_sections: Dict[str, str] = field(default_factory=dict)
+     librs_downloads: Optional[int] = None
+     source: str = "crates.io"
+
+ @dataclass
+ class EnrichedCrate(CrateMetadata):
+     readme_summary: Optional[str] = None
+     feature_summary: Optional[str] = None
+     use_case: Optional[str] = None
+     score: Optional[float] = None
+     factual_counterfactual: Optional[str] = None
+     source_analysis: Optional[Dict[str, Any]] = None
+     user_behavior: Optional[Dict[str, Any]] = None
+     security: Optional[Dict[str, Any]] = None
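
A minimal sketch of how these dataclasses are used (the import path is an assumption): PipelineConfig fields left unspecified keep their declared defaults, and EnrichedCrate inherits every CrateMetadata field plus the enrichment slots.

    from rust_crate_pipeline.config import PipelineConfig

    # Override only what differs from the defaults; the model path is illustrative.
    config = PipelineConfig(
        batch_size=5,
        n_workers=2,
        model_path="/opt/models/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    )
    assert config.max_retries == 3  # unspecified fields keep their defaults
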
@@ -0,0 +1,177 @@
+ # main.py
+ import os
+ import sys
+ import time
+ import logging
+ import shutil
+ import argparse
+ from typing import Optional
+ from .config import PipelineConfig
+ from .pipeline import CrateDataPipeline
+
+ def parse_arguments():
+     """Parse command line arguments"""
+     parser = argparse.ArgumentParser(
+         description="Rust Crate Data Processing Pipeline",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   python -m rust_crate_pipeline                      # Run with defaults
+   python -m rust_crate_pipeline --limit 50           # Process only 50 crates
+   python -m rust_crate_pipeline --batch-size 5       # Smaller batches
+   python -m rust_crate_pipeline --output-dir ./data  # Custom output directory
+   python -m rust_crate_pipeline --log-level DEBUG    # Verbose logging
+         """
+     )
+
+     parser.add_argument(
+         '--limit', '-l',
+         type=int,
+         default=None,
+         help='Limit the number of crates to process (default: process all)'
+     )
+
+     parser.add_argument(
+         '--batch-size', '-b',
+         type=int,
+         default=10,
+         help='Number of crates to process in each batch (default: 10)'
+     )
+
+     parser.add_argument(
+         '--workers', '-w',
+         type=int,
+         default=4,
+         help='Number of parallel workers for API requests (default: 4)'
+     )
+
+     parser.add_argument(
+         '--output-dir', '-o',
+         type=str,
+         default=None,
+         help='Output directory for results (default: auto-generated timestamped directory)'
+     )
+
+     parser.add_argument(
+         '--model-path', '-m',
+         type=str,
+         default=None,
+         help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
+     )
+
+     parser.add_argument(
+         '--max-tokens',
+         type=int,
+         default=256,
+         help='Maximum tokens for LLM generation (default: 256)'
+     )
+
+     parser.add_argument(
+         '--checkpoint-interval',
+         type=int,
+         default=10,
+         help='Save checkpoint every N crates (default: 10)'
+     )
+
+     parser.add_argument(
+         '--log-level',
+         choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
+         default='INFO',
+         help='Logging level (default: INFO)'
+     )
+
+     parser.add_argument(
+         '--skip-ai',
+         action='store_true',
+         help='Skip AI enrichment (faster, metadata only)'
+     )
+
+     parser.add_argument(
+         '--skip-source-analysis',
+         action='store_true',
+         help='Skip source code analysis'
+     )
+
+     parser.add_argument(
+         '--crate-list',
+         type=str,
+         nargs='+',
+         help='Specific crates to process (space-separated list)'
+     )
+
+     parser.add_argument(
+         '--config-file',
+         type=str,
+         help='JSON config file to override default settings'
+     )
+
+     return parser.parse_args()
+
+ def configure_logging(log_level: str = 'INFO'):
+     level = getattr(logging, log_level.upper())
+     logging.basicConfig(
+         level=level,
+         format="%(asctime)s [%(levelname)s] %(message)s",
+         handlers=[
+             logging.StreamHandler(),
+             logging.FileHandler(f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log")
+         ]
+     )
+
+ def check_disk_space():
+     if shutil.disk_usage(".").free < 1_000_000_000:  # 1 GB
+         logging.warning("Low disk space! This may affect performance.")
+
+ def main():
+     args = parse_arguments()
+     configure_logging(args.log_level)
+     check_disk_space()
+
+     try:
+         # Create config from command line arguments
+         config_kwargs = {}
+
+         if args.batch_size:
+             config_kwargs['batch_size'] = args.batch_size
+         if args.workers:
+             config_kwargs['n_workers'] = args.workers
+         if args.model_path:
+             config_kwargs['model_path'] = args.model_path
+         if args.max_tokens:
+             config_kwargs['max_tokens'] = args.max_tokens
+         if args.checkpoint_interval:
+             config_kwargs['checkpoint_interval'] = args.checkpoint_interval
+
+         # Load config file if provided; its values override CLI-derived ones
+         if args.config_file:
+             import json
+             with open(args.config_file, 'r') as f:
+                 file_config = json.load(f)
+             config_kwargs.update(file_config)
+
+         config = PipelineConfig(**config_kwargs)
+
+         # Pass additional arguments to the pipeline
+         pipeline_kwargs = {}
+         if args.output_dir:
+             pipeline_kwargs['output_dir'] = args.output_dir
+         if args.limit:
+             pipeline_kwargs['limit'] = args.limit
+         if args.crate_list:
+             pipeline_kwargs['crate_list'] = args.crate_list
+         if args.skip_ai:
+             pipeline_kwargs['skip_ai'] = True
+         if args.skip_source_analysis:
+             pipeline_kwargs['skip_source'] = True
+
+         pipeline = CrateDataPipeline(config, **pipeline_kwargs)
+
+         logging.info(f"Starting pipeline with {len(vars(args))} arguments")
+         pipeline.run()
+
+     except Exception as e:
+         logging.critical(f"Pipeline failed: {str(e)}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
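
Because main() merges the --config-file contents over the CLI-derived kwargs and then splats the result into PipelineConfig(**config_kwargs), the file's keys must be valid PipelineConfig field names. A sketch of producing such a file (filename and values illustrative):

    import json

    overrides = {"batch_size": 5, "n_workers": 2, "max_tokens": 512}
    with open("pipeline_config.json", "w") as f:
        json.dump(overrides, f, indent=2)
    # Then: python -m rust_crate_pipeline --config-file pipeline_config.json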