rust-crate-pipeline 1.2.5-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +47 -2
- rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
- rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.5.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
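The pipeline.py diff below constructs CrateDataPipeline from a PipelineConfig and reads four attributes off it: batch_size, output_path, use_azure_openai, and enable_crawl4ai. The real class lives in rust_crate_pipeline/config.py (+57 -19 above, not shown in this diff); the sketch here only names the fields pipeline.py visibly depends on, and the defaults are assumptions, not the package's actual values.

    from dataclasses import dataclass

    @dataclass
    class PipelineConfigSketch:
        # Hypothetical stand-in for rust_crate_pipeline.config.PipelineConfig;
        # only the attributes read by pipeline.py in this diff, defaults assumed.
        batch_size: int = 10            # crates fetched and enriched per batch in run()
        output_path: str = "."          # parent of the crate_data_<timestamp> output dir
        use_azure_openai: bool = False  # prefer AzureOpenAIEnricher over LLMEnricher
        enable_crawl4ai: bool = True    # gates _initialize_enhanced_scraper()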
rust_crate_pipeline/pipeline.py
CHANGED
@@ -3,319 +3,395 @@ import os
 import time
 import logging
 import json
-
-from ...
-
+import asyncio
+from typing import Any, Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Dict, List, Optional
+
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 from .network import CrateAPIClient, GitHubBatchClient
 from .ai_processing import LLMEnricher
-from .analysis import ...
+from .analysis import DependencyAnalyzer
+from .crate_analysis import CrateAnalyzer
+
+# Import Azure OpenAI enricher
+try:
+    from .azure_ai_processing import AzureOpenAIEnricher
+    AZURE_OPENAI_AVAILABLE = True
+except ImportError:
+    AZURE_OPENAI_AVAILABLE = False
+    AzureOpenAIEnricher = None
+
+# Import enhanced scraping capabilities
+try:
+    import sys
+
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    from enhanced_scraping import (
+        CrateDocumentationScraper,
+        EnhancedScrapingResult,
+    )
+
+    ENHANCED_SCRAPING_AVAILABLE = True
+except ImportError:
+    ENHANCED_SCRAPING_AVAILABLE = False
+    CrateDocumentationScraper = None  # type: ignore[assignment,misc]
+    EnhancedScrapingResult = None  # type: ignore[assignment,misc]
+    logging.warning("Enhanced scraping not available - using basic methods")
+
 
 class CrateDataPipeline:
-    def __init__(self, config):
+    """Orchestrates the entire data collection, enrichment, and analysis pipeline."""
+
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
         self.api_client = CrateAPIClient(config)
         self.github_client = GitHubBatchClient(config)
-        self.enricher = LLMEnricher(config)
-        self.crates = self.get_crate_list()
-        self.output_dir = self._create_output_dir()
 
+        # Initialize the appropriate AI enricher based on configuration
+        if config.use_azure_openai and AZURE_OPENAI_AVAILABLE and AzureOpenAIEnricher is not None:
+            try:
+                self.enricher = AzureOpenAIEnricher(config)
+                logging.info("✅ Using Azure OpenAI enricher")
+            except Exception as e:
+                logging.warning(f"⚠️ Failed to initialize Azure OpenAI enricher: {e}")
+                logging.info("🔄 Falling back to local LLM enricher")
+                self.enricher = LLMEnricher(config)
+        else:
+            if config.use_azure_openai and not AZURE_OPENAI_AVAILABLE:
+                logging.warning("⚠️ Azure OpenAI requested but not available")
+            self.enricher = LLMEnricher(config)
+            logging.info("✅ Using local LLM enricher")
+
+        # Initialize cargo analyzer
+        self.cargo_analyzer = CrateAnalyzer(".")
+
+        self.crates = self._get_crate_list()
+        self.output_dir = self._create_output_dir()
+        self.enhanced_scraper: Union[CrateDocumentationScraper, None] = (
+            self._initialize_enhanced_scraper()
+        )
+
+    def _initialize_enhanced_scraper(self) -> Union[CrateDocumentationScraper, None]:
+        """Initializes the CrateDocumentationScraper if available and enabled."""
+        if (
+            not ENHANCED_SCRAPING_AVAILABLE
+            or not self.config.enable_crawl4ai
+            or CrateDocumentationScraper is None
+        ):
+            return None
+        try:
+            scraper = CrateDocumentationScraper()
+            logging.info("✅ Enhanced scraping with Crawl4AI enabled")
+            return scraper
+        except Exception as e:
+            logging.warning(f"❌ Failed to initialize enhanced scraping: {e}")
+            return None
+
     def _create_output_dir(self) -> str:
+        """Creates a timestamped output directory for pipeline results."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
-        output_dir = f"crate_data_{timestamp}"
+        output_dir = os.path.join(self.config.output_path, f"crate_data_{timestamp}")
         os.makedirs(output_dir, exist_ok=True)
         return output_dir
 
-    def get_crate_list(self) ...
-        """..."""
-        crates = [
-            ...
-            "criterion", "proptest", "quickcheck", "rstest", "serial_test", "mockall",
-            "httpmock", "assert_cmd", "assert_fs", "predicates", "tempfile",
-            "insta", "goldenfile", "similar", "difference", "pretty_assertions",
-
-            # Configuration and environment
-            "config", "figment", "envy", "dotenv", "confy", "directories", "app_dirs",
-            "etcetera", "platform-dirs", "home", "which", "dunce", "normpath",
-
-            # Logging and observability
-            "log", "env_logger", "tracing", "tracing-subscriber", "tracing-futures",
-            "tracing-actix-web", "tracing-log", "slog", "fern", "flexi_logger",
-            "log4rs", "simplelog", "stderrlog", "pretty_env_logger", "fast_log",
-
-            # Time and date
-            "chrono", "time", "humantime", "chrono-tz", "chrono-english", "ical",
-            "cron", "tokio-cron-scheduler", "job_scheduler", "delay_timer",
-
-            # Machine Learning & AI
-            "tokenizers", "safetensors", "linfa", "ndarray", "smartcore", "burn",
-            "tract-core", "tract-onnx", "tract-hir", "tract-linalg", "tract-data",
-            "tract-nnef", "tract-onnx-opl", "tract-pulse", "tract-pulse-opl",
-            "tract-nnef-resources", "tch", "torch-sys", "ort", "ort-sys", "candle-core",
-            "candle-nn", "candle-transformers", "candle-kernels", "candle-onnx",
-            "candle-metal-kernels", "tiktoken-rs", "tensorflow", "tensorflow-sys",
-            "onnxruntime", "onnxruntime-sys", "onnx-protobuf", "llama-cpp-2",
-            "llama-cpp-sys-2", "llm", "llm-samplers", "llm-chain", "llm-chain-openai",
-            "llama-core", "llamaedge", "openai", "openai-api-rs", "openai_dive",
-            "genai", "aleph-alpha-client", "llm_api_access", "ollama-rs",
-            "rust-bert", "fastembed", "hf-hub", "whisper-rs-sys", "toktrie",
-            "toktrie_hf_tokenizers", "toktrie_hf_downloader", "rust_tokenizers",
+    def _get_crate_list(self) -> "List[str]":
+        """
+        Loads the list of crates to process from an external file.
+        This approach is more modular and easier to maintain than a hardcoded list.
+        """
+        crate_list_path = os.path.join(os.path.dirname(__file__), "crate_list.txt")
+        try:
+            with open(crate_list_path) as f:
+                crates = [line.strip() for line in f if line.strip()]
+            logging.info(f"Loaded {len(crates)} crates from {crate_list_path}")
+            if not crates:
+                logging.warning(f"Crate list at {crate_list_path} is empty.")
+            return crates
+        except FileNotFoundError:
+            logging.error(f"Crate list file not found at: {crate_list_path}")
+            return []
+
+    def get_crate_list(self) -> "List[str]":
+        """
+        Public method to get the list of crates.
+        Returns the already loaded crate list or loads it if not available.
+        """
+        if hasattr(self, "crates") and self.crates:
+            return self.crates
+        else:
+            return self._get_crate_list()
+
+    async def fetch_metadata_batch(self, crate_names: "List[str]") -> "List[CrateMetadata]":
+        """
+        Fetches metadata for a batch of crates using asyncio-based parallel processing.
+        """
+
+        async def fetch_single_crate_safe(
+            crate_name: str,
+        ) -> Union[CrateMetadata, None]:
+            try:
+                loop = asyncio.get_running_loop()
+                data = await loop.run_in_executor(
+                    None, self.api_client.fetch_crate_metadata, crate_name
+                )
+                if not data:
+                    return None
+
+                return CrateMetadata(
+                    name=data.get("name", ""),
+                    version=data.get("version", ""),
+                    description=data.get("description", ""),
+                    repository=data.get("repository", ""),
+                    keywords=data.get("keywords", []),
+                    categories=data.get("categories", []),
+                    readme=data.get("readme", ""),
+                    downloads=data.get("downloads", 0),
+                    github_stars=data.get("github_stars", 0),
+                    dependencies=data.get("dependencies", []),
+                    features=data.get("features", {}),
+                    code_snippets=data.get("code_snippets", []),
+                    readme_sections=data.get("readme_sections", {}),
+                    librs_downloads=data.get("librs_downloads"),
+                    source=data.get("source", "crates.io"),
+                )
+
+            except Exception as e:
+                logging.error(f"Error fetching metadata for {crate_name}: {e}")
+                return None
+
+        tasks = [fetch_single_crate_safe(name) for name in crate_names]
+        results_raw = await asyncio.gather(*tasks)
+        results = [r for r in results_raw if r]
+        logging.info(
+            f"Fetched metadata for {len(results)} out of "
+            f"{len(crate_names)} requested crates."
+        )
+        return results
+
+    async def enrich_batch(self, batch: "List[CrateMetadata]") -> "List[EnrichedCrate]":
+        """Enriches a batch of crates with GitHub stats, enhanced scraping, and AI."""
+        # Update GitHub stats
+        github_repos = [
+            c.repository for c in batch if c.repository and "github.com" in c.repository
         ]
-        ...
-        with ...
-            ...
+        if github_repos:
+            repo_stats = self.github_client.batch_get_repo_stats(github_repos)
+            for crate in batch:
+                if crate.repository in repo_stats:
+                    stats = repo_stats[crate.repository]
+                    crate.github_stars = stats.get("stargazers_count", 0)
+
+        # Asynchronously enhance with scraping and AI
+        enrichment_tasks = [self._enrich_single_crate(crate) for crate in batch]
+        enriched_results = await asyncio.gather(*enrichment_tasks)
+        return [result for result in enriched_results if result]
+
+    async def _enrich_single_crate(self, crate: CrateMetadata) -> Union[EnrichedCrate, None]:
+        """Helper to enrich a single crate with scraping, AI analysis, and cargo analysis."""
+        try:
+            # Enhanced scraping if available
+            if self.enhanced_scraper:
+                await self._enhance_with_scraping(crate)
+
+            # Now enrich with AI
+            enriched = self.enricher.enrich_crate(crate)
 
-            ...
-            crate_metadata = CrateMetadata(
-                name=data.get("name", ""),
-                version=data.get("version", ""),
-                description=data.get("description", ""),
-                repository=data.get("repository", ""),
-                keywords=data.get("keywords", []),
-                categories=data.get("categories", []),
-                readme=data.get("readme", ""),
-                downloads=data.get("downloads", 0),
-                github_stars=data.get("github_stars", 0),
-                dependencies=data.get("dependencies", []),
-                features=data.get("features", []),
-                code_snippets=data.get("code_snippets", []),
-                readme_sections=data.get("readme_sections", {}),
-                librs_downloads=data.get("librs_downloads"),
-                source=data.get("source", "crates.io")
-            )
-            results.append(crate_metadata)
-            logging.info(f"Fetched metadata for {crate_name}")
-        except Exception as e:
-            logging.error(f"Failed to fetch metadata for {crate_name}: {str(e)}")
+            # Add cargo analysis if we have a local crate directory
+            # Note: This would require downloading/cloning the crate first
+            # For now, we'll add a placeholder for cargo analysis
+            enriched.source_analysis = {
+                "cargo_analysis_available": False,
+                "note": "Cargo analysis requires local crate source code"
+            }
 
-
+            logging.info(f"Enriched {crate.name}")
+            return enriched
+        except Exception as e:
+            logging.error(f"Failed to enrich {crate.name}: {e}")
+            # Return a partially enriched crate to avoid data loss
+            enriched_dict = crate.to_dict()
+            return EnrichedCrate(**enriched_dict)
 
-    def ...
-        """..."""
-        ...
+    async def _enhance_with_scraping(self, crate: CrateMetadata) -> None:
+        """
+        Enhances a single crate with advanced web scraping data.
+        Modifies the crate object in place.
+        """
+        if not self.enhanced_scraper:
+            return
+
+        try:
+            scraping_results = await self.enhanced_scraper.scrape_crate_info(crate.name)
+            if scraping_results:
+                self._integrate_scraping_results(crate, scraping_results)
+                logging.info(
+                    f"Enhanced scraping for {crate.name}: "
+                    f"{len(scraping_results)} sources"
+                )
+        except Exception as e:
+            logging.warning(f"Enhanced scraping failed for {crate.name}: {e}")
+
+    def _integrate_scraping_results(
+        self,
+        crate: CrateMetadata,
+        scraping_results: "Dict[str, EnhancedScrapingResult]",
+    ) -> None:
+        """
+        Integrates enhanced scraping results into the crate metadata.
+        Modifies the crate object in place.
+        """
+        crate.enhanced_scraping = {}
+
+        for source, result in scraping_results.items():
+            if not result or result.error:
+                continue
+
+            crate.enhanced_scraping[source] = {
+                "title": result.title,
+                "quality_score": result.quality_score,
+                "extraction_method": result.extraction_method,
+                "structured_data": result.structured_data,
+                "content_length": len(result.content),
+            }
+            # Update README if we got better content
+            if source == "docs_rs" and result.quality_score > 0.7:
+                if not crate.readme or len(result.content) > len(crate.readme):
+                    crate.readme = result.content
+                    logging.info(f"Updated README for {crate.name} from {source}")
+
+            # Extract additional metadata from structured data
+            structured_data = result.structured_data or {}
+            if "features" in structured_data and isinstance(
+                structured_data["features"], list
+            ):
+                crate.enhanced_features = structured_data["features"]
+            if "dependencies" in structured_data and isinstance(
+                structured_data["dependencies"], list
+            ):
+                crate.enhanced_dependencies = structured_data["dependencies"]
+            if "examples" in structured_data and isinstance(
+                structured_data["examples"], list
+            ):
+                crate.code_snippets.extend(structured_data["examples"])
+
+    def analyze_dependencies(self, crates: "List[EnrichedCrate]") -> "Dict[str, Any]":
+        """Analyze dependencies between crates."""
         return DependencyAnalyzer.analyze_dependencies(crates)
 
-    def save_checkpoint(self, data: List[EnrichedCrate], prefix: str):
-        """..."""
+    def save_checkpoint(self, data: "List[EnrichedCrate]", prefix: str) -> str:
+        """Saves a processing checkpoint to a file."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         filename = os.path.join(self.output_dir, f"{prefix}_{timestamp}.jsonl")
 
         with open(filename, "w") as f:
             for item in data:
-                ...
-                f.write(json.dumps(item_dict) + "\n")
-
-        # Save status metadata
-        status = {
-            "timestamp": timestamp,
-            "total_crates": len(data),
-            "processed_crates": sum(1 for c in data if c.use_case is not None),
-            "advanced_analysis": sum(1 for c in data if c.source_analysis is not None),
-            "checkpoint_file": filename
-        }
-
-        status_file = os.path.join(self.output_dir, f"{prefix}_status_{timestamp}.json")
-        with open(status_file, "w") as f:
-            json.dump(status, f, indent=2)
-
+                f.write(json.dumps(item.to_dict()) + "\n")
+
         logging.info(f"Saved checkpoint to {filename}")
         return filename
 
-    def save_final_output(...):
-        """..."""
+    def save_final_output(
+        self, data: "List[EnrichedCrate]", dependency_data: "Dict[str, Any]"
+    ) -> None:
+        """Saves the final enriched data and analysis reports."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
 
         # Save main enriched data
-        ...
+        final_output_path = os.path.join(
+            self.output_dir, f"enriched_crate_metadata_{timestamp}.jsonl"
+        )
+        with open(final_output_path, "w") as f:
             for item in data:
-                ...
+                f.write(json.dumps(item.to_dict()) + "\n")
+
         # Save dependency analysis
-        ...
+        dep_file_path = os.path.join(
+            self.output_dir, f"dependency_analysis_{timestamp}.json"
+        )
+        with open(dep_file_path, "w") as f:
             json.dump(dependency_data, f, indent=2)
 
-        # Generate summary report
+        # Generate and save summary report
+        self._generate_summary_report(data, dependency_data, timestamp)
+
+        logging.info(f"Results saved to {self.output_dir}/")
+
+    def _generate_summary_report(
+        self,
+        data: "List[EnrichedCrate]",
+        dependency_data: "Dict[str, Any]",
+        timestamp: str,
+    ) -> None:
+        """Generates a summary report of the pipeline run."""
         summary = {
             "total_crates": len(data),
             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "most_popular": sorted(
-                ...
+            "most_popular": sorted(
+                [
+                    {
+                        "name": c.name,
+                        "score": c.score or 0,
+                        "downloads": c.downloads,
+                        "github_stars": c.github_stars,
+                    }
+                    for c in data
+                ],
+                key=lambda x: x.get("score", 0),
+                reverse=True,
+            )[:10],
+            "most_depended_upon": dependency_data.get("most_depended", [])[:10],
         }
-
-        summary_file = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
-        with open(summary_file, "w") as f:
-            json.dump(summary, f, indent=2)
 
+        summary_path = os.path.join(self.output_dir, f"summary_report_{timestamp}.json")
+        with open(summary_path, "w") as f:
+            json.dump(summary, f, indent=2)
 
-    def run(self):
-        """Main pipeline execution flow"""
+    async def run(self) -> Union["tuple[List[EnrichedCrate], Dict[str, Any]]", None]:
+        """Main pipeline execution flow."""
         start_time = time.time()
+        if not self.crates:
+            logging.error("No crates to process. Exiting.")
+            return None
+
         logging.info(f"Processing {len(self.crates)} crates...")
-
-        # Process in batches
-        all_enriched = []
-        crate_batches = [self.crates[i:i+self.config.batch_size]
-                        for i in range(0, len(self.crates), self.config.batch_size)]
 
-        ...
+        all_enriched: "List[EnrichedCrate]" = []
+        batch_size = self.config.batch_size
+        crate_batches = [
+            self.crates[i : i + batch_size]
+            for i in range(0, len(self.crates), batch_size)
+        ]
+
+        for i, batch_names in enumerate(crate_batches):
+            logging.info(
+                f"Processing batch {i + 1}/{len(crate_batches)} "
+                f"({len(batch_names)} crates)"
+            )
+
             # Fetch metadata
-            ...
+            metadata_batch = await self.fetch_metadata_batch(batch_names)
+            if not metadata_batch:
+                logging.warning(f"Batch {i+1} yielded no metadata. Skipping.")
+                continue
+
             # Enrich the batch
-            enriched_batch = self.enrich_batch(...)
+            enriched_batch = await self.enrich_batch(metadata_batch)
             all_enriched.extend(enriched_batch)
-
-            # Save checkpoint
-            self.save_checkpoint(all_enriched, "...")
-            logging.info(...)
-            ...
-                crate.source_analysis = SourceAnalyzer.analyze_crate_source(crate)
-                crate.security = SecurityAnalyzer.check_security_metrics(crate)
-                crate.user_behavior = UserBehaviorAnalyzer.fetch_user_behavior_data(crate)
-                logging.info(f"Advanced analysis completed for {crate.name}")
-            except Exception as e:
-                logging.warning(f"Advanced analysis failed for {crate.name}: {str(e)}")
-
-        # Step 3: Perform dependency analysis
+
+            # Save checkpoint
+            self.save_checkpoint(all_enriched, f"checkpoint_batch_{i + 1}")
+            logging.info(
+                f"Completed batch {i + 1}, "
+                f"processed {len(all_enriched)}/{len(self.crates)} crates"
+            )
+
+        # Final analysis and saving
        logging.info("Analyzing crate dependencies...")
         dependency_analysis = self.analyze_dependencies(all_enriched)
-
-        # Save final results
         self.save_final_output(all_enriched, dependency_analysis)
 
-        # Final summary
         duration = time.time() - start_time
         logging.info(f"✅ Done. Enriched {len(all_enriched)} crates in {duration:.2f}s")
-
         return all_enriched, dependency_analysis
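As of this version, run() is a coroutine that returns (all_enriched, dependency_analysis) or None, so callers now need an event loop. A minimal driver sketch, assuming PipelineConfig can be constructed with defaults; the package's actual entry point lives in rust_crate_pipeline/main.py (+386 -102 above) and may wire this up differently:

    import asyncio

    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.pipeline import CrateDataPipeline

    def main() -> None:
        # Assumption: a default-constructed config is valid for a local run.
        pipeline = CrateDataPipeline(PipelineConfig())
        # run() is async in 1.3.0, so it must be awaited or driven by asyncio.run().
        result = asyncio.run(pipeline.run())
        if result is not None:
            enriched, dependency_analysis = result
            print(f"Enriched {len(enriched)} crates")

    if __name__ == "__main__":
        main()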