rust-crate-pipeline 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,548 @@
|
|
1
|
+
import asyncio
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import time
|
5
|
+
import argparse
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Dict, List, Optional, Any, Union, TYPE_CHECKING
|
8
|
+
|
9
|
+
from .config import PipelineConfig
|
10
|
+
from .core import IRLEngine, CanonRegistry, SacredChainTrace, TrustVerdict
|
11
|
+
from .scraping import UnifiedScraper, ScrapingResult
|
12
|
+
from .crate_analysis import CrateAnalyzer
|
13
|
+
|
14
|
+
# Import Azure OpenAI enricher if available
|
15
|
+
try:
|
16
|
+
from .azure_ai_processing import AzureOpenAIEnricher
|
17
|
+
AZURE_OPENAI_AVAILABLE = True
|
18
|
+
except ImportError:
|
19
|
+
AZURE_OPENAI_AVAILABLE = False
|
20
|
+
AzureOpenAIEnricher = None # type: ignore # Fallback for type checkers; see below
|
21
|
+
|
22
|
+
# Import unified LLM processor
|
23
|
+
try:
|
24
|
+
from .unified_llm_processor import UnifiedLLMProcessor, create_llm_processor_from_args, LLMConfig
|
25
|
+
UNIFIED_LLM_AVAILABLE = True
|
26
|
+
except ImportError:
|
27
|
+
UNIFIED_LLM_AVAILABLE = False
|
28
|
+
UnifiedLLMProcessor = None # type: ignore
|
29
|
+
create_llm_processor_from_args = None # type: ignore
|
30
|
+
LLMConfig = None # type: ignore
|
31
|
+
|
32
|
+
if TYPE_CHECKING:
|
33
|
+
from .azure_ai_processing import AzureOpenAIEnricher # type: ignore[import]
|
34
|
+
from .unified_llm_processor import UnifiedLLMProcessor, LLMConfig # type: ignore[import]
|
35
|
+
|
36
|
+
|
37
|
+
class UnifiedSigilPipeline:
    """Run documentation scraping, Sacred Chain analysis and optional AI enrichment.

    The pipeline owns an ``IRLEngine``, a ``UnifiedScraper`` and a
    ``CanonRegistry``.  When an ``llm_config`` is supplied and the unified LLM
    processor could be imported, that processor is used for enrichment;
    otherwise, if the pipeline config enables Azure OpenAI and its enricher
    could be imported, the Azure enricher is used.  Use the instance as an
    async context manager so the engine and scraper are entered and exited.
    """

    def __init__(self, config: PipelineConfig, llm_config: Optional[Any] = None) -> None:
        """Store the configuration and build all components.

        Args:
            config: Pipeline configuration passed to the engine and scraper.
            llm_config: Optional configuration for the unified LLM processor.

        Raises:
            Exception: Whatever the engine or scraper constructors raise.
        """
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.irl_engine: Optional[IRLEngine] = None
        self.scraper: Optional[UnifiedScraper] = None
        self.canon_registry: CanonRegistry = CanonRegistry()

        # AI components stay None until _initialize_components succeeds.
        self.ai_enricher: Optional[Any] = None
        self.unified_llm_processor: Optional[Any] = None
        self.crate_analyzer: Optional[CrateAnalyzer] = None

        # Kept so enrichment and the summary can report provider/model.
        self.llm_config = llm_config

        self._initialize_components()

    def _initialize_components(self) -> None:
        """Create the engine, the scraper and at most one AI enrichment backend.

        Failures of the AI backends are logged as warnings and tolerated;
        failures of the engine or scraper are logged and re-raised.
        """
        try:
            self.irl_engine = IRLEngine(self.config, self.canon_registry)
            self.logger.info("✅ IRL Engine initialized successfully")

            scraper_config = {
                "verbose": False,
                "word_count_threshold": 10,
                "crawl_config": {
                    "max_retries": self.config.max_retries,
                    "timeout": self.config.crawl4ai_timeout,
                },
            }
            self.scraper = UnifiedScraper(scraper_config)
            self.logger.info("✅ Unified Scraper initialized successfully")

            # Preferred backend: the unified LLM processor.
            if UNIFIED_LLM_AVAILABLE and self.llm_config:
                try:
                    if UnifiedLLMProcessor is not None:
                        self.unified_llm_processor = UnifiedLLMProcessor(self.llm_config)
                        self.logger.info(f"✅ Unified LLM Processor initialized with provider: {self.llm_config.provider}")
                    else:
                        self.logger.warning("⚠️ UnifiedLLMProcessor is None at runtime; skipping initialization.")
                except Exception as e:
                    self.logger.warning(f"⚠️ Failed to initialize Unified LLM Processor: {e}")

            # Fallback backend: Azure OpenAI, only when enabled in the config.
            elif AZURE_OPENAI_AVAILABLE and self.config.use_azure_openai:
                try:
                    if AzureOpenAIEnricher is not None:
                        self.ai_enricher = AzureOpenAIEnricher(self.config)  # type: ignore
                        self.logger.info("✅ Azure OpenAI Enricher initialized successfully")
                    else:
                        self.logger.warning("⚠️ AzureOpenAIEnricher is None at runtime; skipping initialization.")
                except Exception as e:
                    self.logger.warning(f"⚠️ Failed to initialize Azure OpenAI Enricher: {e}")

        except Exception as e:
            self.logger.error(f"❌ Failed to initialize pipeline components: {e}")
            raise

    async def __aenter__(self) -> "UnifiedSigilPipeline":
        """Enter the engine and then the scraper; return the pipeline itself."""
        if self.irl_engine:
            await self.irl_engine.__aenter__()
        if self.scraper:
            try:
                await self.scraper.__aenter__()
            except BaseException as exc:
                # The engine was already entered; release it before propagating
                # so a failing scraper does not leave it open.
                if self.irl_engine:
                    await self.irl_engine.__aexit__(type(exc), exc, exc.__traceback__)
                raise
        return self

    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
        """Exit the engine and the scraper, forwarding the exception details."""
        try:
            if self.irl_engine:
                await self.irl_engine.__aexit__(exc_type, exc_val, exc_tb)
        finally:
            # Always attempted, even if the engine's exit raised.
            if self.scraper:
                await self.scraper.__aexit__(exc_type, exc_val, exc_tb)

    async def analyze_crate(self, crate_name: str) -> SacredChainTrace:
        """Scrape documentation, analyze the crate and write a JSON report.

        Args:
            crate_name: Name of the crate; must be a non-empty string.

        Returns:
            The trace produced by the Sacred Chain analysis.

        Raises:
            ValueError: If ``crate_name`` is empty or not a string.
            RuntimeError: If any stage fails; the original error is chained
                as ``__cause__``.
        """
        if not crate_name or not isinstance(crate_name, str):
            raise ValueError("crate_name must be a non-empty string")

        self.logger.info(f"🔍 Starting analysis of crate: {crate_name}")

        try:
            documentation_results = await self._gather_documentation(crate_name)

            sacred_chain_trace = await self._perform_sacred_chain_analysis(
                crate_name, documentation_results
            )

            await self._generate_analysis_report(crate_name, sacred_chain_trace)

            self.logger.info(f"✅ Analysis completed for {crate_name}")
            return sacred_chain_trace

        except Exception as e:
            self.logger.error(f"❌ Analysis failed for {crate_name}: {e}")
            raise RuntimeError(f"Analysis failed for {crate_name}: {str(e)}") from e

    async def _gather_documentation(self, crate_name: str) -> Dict[str, ScrapingResult]:
        """Scrape the crate's documentation sources and log which ones failed.

        Returns:
            The scraper's mapping of source name to scraping result, including
            entries whose ``error`` is set.

        Raises:
            RuntimeError: If the scraper was never initialized.
        """
        if not self.scraper:
            raise RuntimeError("Scraper not initialized")

        self.logger.info(f"📚 Gathering documentation for {crate_name}")

        try:
            results = await self.scraper.scrape_crate_documentation(crate_name)

            successful_sources = [source for source, result in results.items()
                                  if result.error is None]
            failed_sources = [source for source, result in results.items()
                              if result.error is not None]

            self.logger.info(f"✅ Successfully scraped {len(successful_sources)} sources: {successful_sources}")
            if failed_sources:
                self.logger.warning(f"⚠️ Failed to scrape {len(failed_sources)} sources: {failed_sources}")

            return results

        except Exception as e:
            self.logger.error(f"❌ Documentation gathering failed: {e}")
            raise

    async def _perform_sacred_chain_analysis(
        self, crate_name: str, documentation_results: Dict[str, ScrapingResult]
    ) -> SacredChainTrace:
        """Run the engine's analysis and attach documentation and AI details.

        The mean ``quality_score`` of the error-free scraping results and the
        list of all source names are stored in ``audit_info`` when at least one
        source succeeded.

        Raises:
            RuntimeError: If the IRL engine was never initialized.
        """
        if not self.irl_engine:
            raise RuntimeError("IRL Engine not initialized")

        self.logger.info(f"🔗 Performing Sacred Chain analysis for {crate_name}")

        try:
            sacred_chain_trace = await self.irl_engine.analyze_with_sacred_chain(crate_name)

            successful_docs = [result for result in documentation_results.values()
                               if result.error is None]
            if successful_docs:
                avg_quality = sum(doc.quality_score for doc in successful_docs) / len(successful_docs)
                sacred_chain_trace.audit_info["documentation_quality"] = avg_quality
                sacred_chain_trace.audit_info["documentation_sources"] = list(documentation_results.keys())

            # Currently records a placeholder entry only.
            await self._add_crate_analysis_results(crate_name, sacred_chain_trace)

            # No-op when no AI backend was initialized.
            await self._add_ai_enrichment(crate_name, sacred_chain_trace)

            return sacred_chain_trace

        except Exception as e:
            self.logger.error(f"❌ Sacred Chain analysis failed: {e}")
            raise

    async def _add_crate_analysis_results(self, crate_name: str, trace: SacredChainTrace) -> None:
        """Record a "not_implemented" placeholder for cargo analysis on the trace."""
        try:
            self.logger.info(f"🔍 Adding crate analysis results for {crate_name}")

            # Real analysis needs the crate source on disk, which this
            # pipeline does not download yet.
            trace.audit_info["crate_analysis"] = {
                "status": "not_implemented",
                "note": "Crate analysis requires downloading/extracting the crate source"
            }

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to add crate analysis results: {e}")

    async def _add_ai_enrichment(self, crate_name: str, trace: SacredChainTrace) -> None:
        """Dispatch to the unified LLM processor, else the Azure enricher, else skip."""
        if self.unified_llm_processor:
            await self._add_unified_llm_enrichment(crate_name, trace)
        elif self.ai_enricher:
            await self._add_azure_openai_enrichment(crate_name, trace)
        else:
            self.logger.info("ℹ️ No AI enricher available, skipping AI enrichment")

    @staticmethod
    def _build_placeholder_crate(crate_name: str, trace: SacredChainTrace) -> Any:
        """Build the stand-in ``CrateMetadata`` handed to the AI backends.

        Only the name and a description (the trace's suggestion, if any) are
        real; every other field is an empty or zero placeholder.
        """
        # Imported lazily, as in the callers' error-tolerant path.
        from .config import CrateMetadata

        return CrateMetadata(
            name=crate_name,
            version="unknown",
            description=trace.suggestion or "No description available",
            repository="",
            keywords=[],
            categories=[],
            readme="",
            downloads=0,
            github_stars=0,
            dependencies=[],
            features={},
            code_snippets=[],
            readme_sections={},
            librs_downloads=None,
            source="crates.io",
            enhanced_scraping={},
            enhanced_features=[],
            enhanced_dependencies=[]
        )

    @staticmethod
    def _enrichment_payload(provider: Any, model: Any, enriched_crate: Any) -> Dict[str, Any]:
        """Assemble the ``ai_enrichment`` audit entry from an enriched crate."""
        return {
            "provider": provider,
            "model": model,
            "readme_summary": enriched_crate.readme_summary,
            "use_case": enriched_crate.use_case,
            "score": enriched_crate.score,
            "factual_counterfactual": enriched_crate.factual_counterfactual
        }

    async def _add_unified_llm_enrichment(self, crate_name: str, trace: SacredChainTrace) -> None:
        """Enrich via the unified LLM processor; failures are logged, not raised."""
        if not self.unified_llm_processor:
            return

        try:
            self.logger.info(f"🤖 Adding unified LLM enrichment for {crate_name}")

            mock_crate = self._build_placeholder_crate(crate_name, trace)

            # NOTE(review): enrich_crate is called synchronously here; if it
            # performs blocking I/O it will stall the event loop - confirm.
            enriched_crate = self.unified_llm_processor.enrich_crate(mock_crate)

            trace.audit_info["ai_enrichment"] = self._enrichment_payload(
                self.llm_config.provider if self.llm_config else "unknown",
                self.llm_config.model if self.llm_config else "unknown",
                enriched_crate,
            )

            self.logger.info(f"✅ Unified LLM enrichment completed for {crate_name}")

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to add unified LLM enrichment: {e}")

    async def _add_azure_openai_enrichment(self, crate_name: str, trace: SacredChainTrace) -> None:
        """Enrich via the Azure OpenAI enricher; failures are logged, not raised."""
        if not self.ai_enricher:
            return

        try:
            self.logger.info(f"🤖 Adding Azure OpenAI enrichment for {crate_name}")

            mock_crate = self._build_placeholder_crate(crate_name, trace)

            # NOTE(review): synchronous call inside a coroutine - see the
            # unified LLM variant; confirm whether it blocks.
            enriched_crate = self.ai_enricher.enrich_crate(mock_crate)

            trace.audit_info["ai_enrichment"] = self._enrichment_payload(
                "azure_openai",
                self.config.azure_openai_deployment_name,
                enriched_crate,
            )

            self.logger.info(f"✅ Azure OpenAI enrichment completed for {crate_name}")

        except Exception as e:
            self.logger.warning(f"⚠️ Failed to add Azure OpenAI enrichment: {e}")

    async def _generate_analysis_report(self, crate_name: str, trace: SacredChainTrace) -> None:
        """Write the trace as a JSON report in the current working directory.

        Saving is best-effort: serialization and write errors are logged and
        do not fail the analysis.
        """
        report_data = {
            "crate_name": crate_name,
            "analysis_timestamp": trace.timestamp,
            "execution_id": trace.execution_id,
            "verdict": trace.verdict.value,
            "irl_score": trace.irl_score,
            "suggestion": trace.suggestion,
            "context_sources": trace.context_sources,
            "reasoning_steps": trace.reasoning_steps,
            "audit_info": trace.audit_info,
            "canon_version": trace.canon_version,
        }

        # Keep path separators and other odd characters out of the filename so
        # the report always lands in the working directory.
        safe_name = "".join(
            ch if (ch.isalnum() or ch in "-_.") else "_" for ch in crate_name
        )
        report_file = Path(f"analysis_report_{safe_name}_{int(time.time())}.json")

        # Serialize before opening the file so a non-serializable value in
        # audit_info cannot leave a truncated report behind.
        try:
            serialized = json.dumps(report_data, indent=2)
        except (TypeError, ValueError) as e:
            self.logger.error(f"❌ Failed to save analysis report: {e}")
            return

        try:
            with open(report_file, "w", encoding="utf-8") as f:
                f.write(serialized)
            self.logger.info(f"📄 Analysis report saved: {report_file}")
        except OSError as e:
            self.logger.error(f"❌ Failed to save analysis report: {e}")

    async def analyze_multiple_crates(self, crate_names: List[str]) -> Dict[str, SacredChainTrace]:
        """Analyze several crates concurrently, bounded by ``config.n_workers``.

        A crate whose analysis fails is represented by a ``DEFER`` trace that
        carries the error message, so one failure does not abort the batch.

        Returns:
            Mapping of crate name to its trace; empty for an empty input.
        """
        if not crate_names:
            return {}

        self.logger.info(f"🚀 Starting concurrent analysis of {len(crate_names)} crates")

        # At least one slot: a zero-valued semaphore would block every task.
        semaphore = asyncio.Semaphore(max(1, self.config.n_workers))

        async def analyze_single_crate(crate_name: str) -> "tuple[str, SacredChainTrace]":
            async with semaphore:
                try:
                    trace = await self.analyze_crate(crate_name)
                    return crate_name, trace
                except Exception as e:
                    self.logger.error(f"❌ Analysis failed for {crate_name}: {e}")
                    error_trace = SacredChainTrace(
                        input_data=crate_name,
                        context_sources=[],
                        reasoning_steps=[f"Analysis failed: {str(e)}"],
                        suggestion="DEFER: Analysis failed",
                        verdict=TrustVerdict.DEFER,
                        audit_info={"error": str(e)},
                        irl_score=0.0,
                        execution_id=f"error-{int(time.time())}",
                        # The "Z" suffix denotes UTC, so format UTC time.
                        timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                        canon_version="1.3.0",
                    )
                    return crate_name, error_trace

        tasks = [analyze_single_crate(name) for name in crate_names]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        analysis_results: Dict[str, SacredChainTrace] = {}
        for result in results:
            if isinstance(result, tuple):
                crate_name, trace = result
                analysis_results[crate_name] = trace
            else:
                # Only reached for exceptions the worker does not catch.
                self.logger.error(f"❌ Unexpected result type: {type(result)}")

        self.logger.info(f"✅ Completed analysis of {len(analysis_results)} crates")
        return analysis_results

    def get_pipeline_summary(self) -> Dict[str, Any]:
        """Return which components exist plus selected configuration values.

        ``llm_configuration`` is included when an ``llm_config`` was supplied,
        or otherwise when the pipeline config enables Azure OpenAI.
        """
        summary: Dict[str, Any] = {
            "pipeline_version": "1.3.0",
            "components": {
                "irl_engine": self.irl_engine is not None,
                "scraper": self.scraper is not None,
                "canon_registry": self.canon_registry is not None,
            },
            "ai_components": {
                "unified_llm_processor": self.unified_llm_processor is not None,
                "azure_openai_enricher": self.ai_enricher is not None,
                "crate_analyzer": self.crate_analyzer is not None,
            },
            "configuration": {
                "max_tokens": self.config.max_tokens,
                "checkpoint_interval": self.config.checkpoint_interval,
                "batch_size": self.config.batch_size,
                "enable_crawl4ai": self.config.enable_crawl4ai,
            }
        }

        if self.llm_config:
            summary["llm_configuration"] = {
                "provider": self.llm_config.provider,
                "model": self.llm_config.model,
                "temperature": self.llm_config.temperature,
                "max_tokens": self.llm_config.max_tokens,
                "timeout": self.llm_config.timeout,
                "max_retries": self.llm_config.max_retries
            }
        elif self.config.use_azure_openai:
            summary["llm_configuration"] = {
                "provider": "azure_openai",
                "model": self.config.azure_openai_deployment_name,
                "endpoint": self.config.azure_openai_endpoint,
                "max_tokens": self.config.max_tokens
            }

        return summary
|
422
|
+
|
423
|
+
|
424
|
+
def create_pipeline_from_args(args: argparse.Namespace) -> UnifiedSigilPipeline:
    """Build a pipeline from parsed command line arguments.

    An LLM configuration is created only when ``args`` carries a truthy
    ``llm_provider`` and the unified LLM processor module was importable.
    Every option is read with ``getattr`` so a namespace that lacks some of
    the LLM options (for example one built without ``add_llm_arguments``)
    still works.

    Args:
        args: Parsed arguments, normally from a parser extended by
            ``add_llm_arguments``.

    Returns:
        A pipeline built from a default ``PipelineConfig`` and the optional
        LLM configuration.
    """
    config = PipelineConfig()

    llm_config = None
    provider = getattr(args, 'llm_provider', None)
    if provider:
        if UNIFIED_LLM_AVAILABLE and LLMConfig is not None:
            llm_config = LLMConfig(
                provider=provider,
                # Missing or empty model falls back to "gpt-4".
                model=getattr(args, 'llm_model', None) or "gpt-4",
                api_base=getattr(args, 'llm_api_base', None),
                api_key=getattr(args, 'llm_api_key', None),
                temperature=getattr(args, 'llm_temperature', 0.2),
                max_tokens=getattr(args, 'llm_max_tokens', 256),
                timeout=getattr(args, 'llm_timeout', 30),
                max_retries=getattr(args, 'llm_max_retries', 3),
                # Provider-specific settings
                azure_deployment=getattr(args, 'azure_deployment', None),
                azure_api_version=getattr(args, 'azure_api_version', None),
                ollama_host=getattr(args, 'ollama_host', None),
                lmstudio_host=getattr(args, 'lmstudio_host', None)
            )
        else:
            logging.warning("Unified LLM processor not available, falling back to Azure OpenAI")

    return UnifiedSigilPipeline(config, llm_config)
|
452
|
+
|
453
|
+
|
454
|
+
def add_llm_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the LLM-related command line options on ``parser``.

    Options are added in four argument groups: general LLM settings, Azure
    OpenAI, Ollama and LM Studio.  ``--llm-provider`` has no default, so it
    parses to ``None`` unless given.

    Args:
        parser: The parser to extend in place.
    """
    llm_group = parser.add_argument_group('LLM Configuration')

    llm_group.add_argument(
        '--llm-provider',
        choices=['azure', 'ollama', 'lmstudio', 'openai', 'anthropic', 'google', 'cohere', 'huggingface'],
        # No default is configured, so the help must not advertise one.
        help='LLM provider to use (optional; when omitted, no unified LLM processor is configured)'
    )

    llm_group.add_argument(
        '--llm-model',
        help='Model name/identifier (e.g., gpt-4, llama2, claude-3)'
    )

    llm_group.add_argument(
        '--llm-api-base',
        help='API base URL (for local providers or custom endpoints)'
    )

    # NOTE(review): a key passed on the command line can show up in shell
    # history and process listings; consider an environment variable.
    llm_group.add_argument(
        '--llm-api-key',
        help='API key (if required by provider)'
    )

    llm_group.add_argument(
        '--llm-temperature',
        type=float,
        default=0.2,
        help='Temperature for LLM generation (default: 0.2)'
    )

    llm_group.add_argument(
        '--llm-max-tokens',
        type=int,
        default=256,
        help='Maximum tokens for LLM generation (default: 256)'
    )

    llm_group.add_argument(
        '--llm-timeout',
        type=int,
        default=30,
        help='Timeout for LLM API calls in seconds (default: 30)'
    )

    llm_group.add_argument(
        '--llm-max-retries',
        type=int,
        default=3,
        help='Maximum retries for LLM API calls (default: 3)'
    )

    # Provider-specific arguments
    azure_group = parser.add_argument_group('Azure OpenAI Configuration')
    azure_group.add_argument(
        '--azure-deployment',
        help='Azure OpenAI deployment name'
    )
    azure_group.add_argument(
        '--azure-api-version',
        help='Azure OpenAI API version'
    )

    ollama_group = parser.add_argument_group('Ollama Configuration')
    ollama_group.add_argument(
        '--ollama-host',
        default='http://localhost:11434',
        help='Ollama host URL (default: http://localhost:11434)'
    )

    lmstudio_group = parser.add_argument_group('LM Studio Configuration')
    lmstudio_group.add_argument(
        '--lmstudio-host',
        default='http://localhost:1234/v1',
        help='LM Studio host URL (default: http://localhost:1234/v1)'
    )
|
531
|
+
|
532
|
+
|
533
|
+
async def quick_analyze_crate(crate_name: str, config: Optional[PipelineConfig] = None, llm_config: Optional[Any] = None) -> SacredChainTrace:
    """Analyze one crate with a pipeline that lives only for this call.

    Args:
        crate_name: Name of the crate to analyze.
        config: Pipeline configuration; a default ``PipelineConfig`` is
            created when this is ``None``.
        llm_config: Optional LLM configuration forwarded to the pipeline.

    Returns:
        The trace returned by the pipeline's ``analyze_crate``.
    """
    effective_config = PipelineConfig() if config is None else config
    pipeline = UnifiedSigilPipeline(effective_config, llm_config)
    async with pipeline as active_pipeline:
        trace = await active_pipeline.analyze_crate(crate_name)
        return trace
|
540
|
+
|
541
|
+
|
542
|
+
async def batch_analyze_crates(crate_names: List[str], config: Optional[PipelineConfig] = None, llm_config: Optional[Any] = None) -> Dict[str, SacredChainTrace]:
    """Analyze several crates with a pipeline that lives only for this call.

    Args:
        crate_names: Names of the crates to analyze.
        config: Pipeline configuration; a default ``PipelineConfig`` is
            created when this is ``None``.
        llm_config: Optional LLM configuration forwarded to the pipeline.

    Returns:
        The mapping returned by the pipeline's ``analyze_multiple_crates``.
    """
    effective_config = PipelineConfig() if config is None else config
    pipeline = UnifiedSigilPipeline(effective_config, llm_config)
    async with pipeline as active_pipeline:
        traces = await active_pipeline.analyze_multiple_crates(crate_names)
        return traces
|
@@ -3,7 +3,7 @@ import json
|
|
3
3
|
import os
|
4
4
|
import shutil
|
5
5
|
from datetime import datetime
|
6
|
-
from typing import
|
6
|
+
from typing import Any, Union
|
7
7
|
|
8
8
|
|
9
9
|
def create_output_dir(base_name: str = "crate_data") -> str:
|
@@ -22,7 +22,7 @@ def create_output_dir(base_name: str = "crate_data") -> str:
|
|
22
22
|
return output_dir
|
23
23
|
|
24
24
|
|
25
|
-
def save_checkpoint(data:
|
25
|
+
def save_checkpoint(data: list[dict], prefix: str, output_dir: str) -> str:
|
26
26
|
"""
|
27
27
|
Save processing checkpoint with status metadata
|
28
28
|
|
@@ -45,7 +45,7 @@ def save_checkpoint(data: List[Dict], prefix: str, output_dir: str) -> str:
|
|
45
45
|
status = {
|
46
46
|
"timestamp": timestamp,
|
47
47
|
"total_items": len(data),
|
48
|
-
"checkpoint_file": filename
|
48
|
+
"checkpoint_file": filename,
|
49
49
|
}
|
50
50
|
|
51
51
|
status_file = os.path.join(output_dir, f"{prefix}_status_{timestamp}.json")
|
@@ -55,7 +55,7 @@ def save_checkpoint(data: List[Dict], prefix: str, output_dir: str) -> str:
|
|
55
55
|
return filename
|
56
56
|
|
57
57
|
|
58
|
-
def safe_file_cleanup(path: str):
|
58
|
+
def safe_file_cleanup(path: str) -> None:
|
59
59
|
"""Safely remove files or directories"""
|
60
60
|
try:
|
61
61
|
if os.path.isfile(path):
|
@@ -70,7 +70,34 @@ def disk_space_check(min_free_gb: float = 1.0) -> bool:
|
|
70
70
|
"""Check if sufficient disk space is available"""
|
71
71
|
try:
|
72
72
|
free_bytes = shutil.disk_usage(".").free
|
73
|
-
free_gb = free_bytes / (1024
|
73
|
+
free_gb = free_bytes / (1024**3)
|
74
74
|
return free_gb >= min_free_gb
|
75
75
|
except Exception:
|
76
76
|
return True # Assume OK if check fails
|
77
|
+
|
78
|
+
|
79
|
+
def load_rule_zero_typing_quick_lookup(
    path: Union[str, None] = None,
) -> dict[str, Any]:
    """
    Load the Rule Zero Python Typing & PEP8 Quick Lookup Table as a dict.

    Loading is best-effort: an unreadable file, invalid JSON, or JSON whose
    top-level value is not an object all yield an empty dict, with a message
    printed to stdout.

    Args:
        path: Optional path to the quick lookup JSON file. When omitted or
            empty, ``rule_zero_typing_quick_lookup.json`` two directories
            above this module is used.

    Returns:
        Dictionary with quick lookup entries, or ``{}`` on any failure.
    """
    if not path:
        path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__),
                "../../rule_zero_typing_quick_lookup.json",
            )
        )
    try:
        with open(path, encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load Rule Zero typing quick lookup: {e}")
        return {}
    if not isinstance(loaded, dict):
        # Honor the declared return type instead of leaking a list/str/number.
        print(
            "Failed to load Rule Zero typing quick lookup: "
            f"expected a JSON object, got {type(loaded).__name__}"
        )
        return {}
    return loaded
|