rust-crate-pipeline 1.4.1__tar.gz → 1.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline-1.4.3/CHANGELOG_v1.4.2.md +10 -0
- rust_crate_pipeline-1.4.3/CHANGELOG_v1.4.3.md +13 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/PKG-INFO +1 -1
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/README_LLM_PROVIDERS.md +19 -2
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/pyproject.toml +1 -1
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/config.py +11 -4
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/core/irl_engine.py +3 -23
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/main.py +2 -2
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/pipeline.py +21 -4
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/unified_llm_processor.py +80 -66
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/unified_pipeline.py +153 -17
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/version.py +7 -2
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/PKG-INFO +1 -1
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/SOURCES.txt +2 -1
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/setup.py +1 -1
- rust_crate_pipeline-1.4.1/.aider.chat.history.md +0 -11533
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.0.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.1.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.2.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.3.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.4.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.5.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.3.6.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CHANGELOG_v1.4.0.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/CRAWL4AI_TYPE_ANALYSIS.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/LICENSE +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/MANIFEST.in +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/README.md +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/requirements-crawl4ai.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/requirements-dev.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/requirements.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/__init__.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/__main__.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/ai_processing.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/analysis.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/azure_ai_processing.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/core/__init__.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/core/canon_registry.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/core/sacred_chain.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/crate_analysis.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/crate_list.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/github_token_checker.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/network.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/production_config.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/progress_monitor.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/scraping/__init__.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/scraping/unified_scraper.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/utils/file_utils.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/utils/logging_utils.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/dependency_links.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/not-zip-safe +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/requires.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline.egg-info/top_level.txt +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/setup.cfg +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/tests/test_analysis.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/tests/test_config.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/tests/test_core_modules.py +0 -0
- {rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/tests/test_github_token_checker.py +0 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
# Changelog v1.4.3
|
2
|
+
|
3
|
+
## [1.4.3] - {today}
|
4
|
+
|
5
|
+
### Added
|
6
|
+
- Implemented full crate analysis, including `cargo check`, `cargo clippy`, and `cargo audit`, to adhere to "Rule Zero".
|
7
|
+
- Added `--crates-file` argument to `run_pipeline_with_llm.py` to allow processing a large number of crates from a file.
|
8
|
+
|
9
|
+
### Changed
|
10
|
+
- Incremented project version to 1.4.3.
|
11
|
+
- Fixed various bugs in the pipeline execution and data handling.
|
12
|
+
|
13
|
+
---
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: rust-crate-pipeline
|
3
|
-
Version: 1.4.1
|
3
|
+
Version: 1.4.3
|
4
4
|
Summary: A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping
|
5
5
|
Home-page: https://github.com/SigilDERG/rust-crate-pipeline
|
6
6
|
Author: SigilDERG Team
|
@@ -51,12 +51,29 @@ python run_pipeline_with_llm.py --llm-provider <provider> --llm-model <model> --
|
|
51
51
|
### Provider-Specific Configuration
|
52
52
|
|
53
53
|
#### Azure OpenAI
|
54
|
+
|
55
|
+
Set the following environment variables:
|
56
|
+
```bash
|
57
|
+
export AZURE_OPENAI_ENDPOINT="<your_endpoint>"
|
58
|
+
export AZURE_OPENAI_API_KEY="<your_api_key>"
|
59
|
+
export AZURE_OPENAI_DEPLOYMENT_NAME="<your_deployment_name>"
|
60
|
+
```
|
61
|
+
|
62
|
+
Then, run the pipeline:
|
54
63
|
```bash
|
55
64
|
python run_pipeline_with_llm.py \
|
56
65
|
--llm-provider azure \
|
57
66
|
--llm-model gpt-4o \
|
58
|
-
--
|
59
|
-
|
67
|
+
--crates tokio serde
|
68
|
+
```
|
69
|
+
|
70
|
+
You can still override these with command-line arguments if needed:
|
71
|
+
```bash
|
72
|
+
python run_pipeline_with_llm.py \
|
73
|
+
--llm-provider azure \
|
74
|
+
--llm-model gpt-4o \
|
75
|
+
--llm-api-key YOUR_AZURE_API_KEY \
|
76
|
+
--azure-deployment YOUR_AZURE_DEPLOYMENT \
|
60
77
|
--crates tokio serde
|
61
78
|
```
|
62
79
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
import os
|
3
3
|
import warnings
|
4
4
|
from dataclasses import dataclass, field, asdict
|
5
|
-
from typing import Any, Union, TYPE_CHECKING
|
5
|
+
from typing import Any, Union, TYPE_CHECKING, Optional
|
6
6
|
|
7
7
|
if TYPE_CHECKING:
|
8
8
|
from typing import Dict, List
|
@@ -37,14 +37,21 @@ class PipelineConfig:
|
|
37
37
|
)
|
38
38
|
crawl4ai_timeout: int = 30
|
39
39
|
output_path: str = "output"
|
40
|
+
llm_max_retries: int = 3
|
41
|
+
output_dir: str = "output"
|
42
|
+
verbose: bool = False
|
43
|
+
budget: Optional[float] = None
|
40
44
|
|
41
45
|
# Azure OpenAI Configuration
|
42
46
|
use_azure_openai: bool = True
|
43
|
-
azure_openai_endpoint: str = "https://david-mc08tirc-eastus2.services.ai.azure.com/"
|
44
|
-
azure_openai_api_key: str = "2hw0jjqwjtKke7DMGiJSPtlj6GhuLCNdQWPXoDGN2I3JMvzp4PmGJQQJ99BFACHYHv6XJ3w3AAAAACOGFPYA"
|
45
|
-
azure_openai_deployment_name: str = "gpt-4o"
|
47
|
+
azure_openai_endpoint: str = os.getenv("AZURE_OPENAI_ENDPOINT", "https://david-mc08tirc-eastus2.services.ai.azure.com/")
|
48
|
+
azure_openai_api_key: str = os.getenv("AZURE_OPENAI_API_KEY", "2hw0jjqwjtKke7DMGiJSPtlj6GhuLCNdQWPXoDGN2I3JMvzp4PmGJQQJ99BFACHYHv6XJ3w3AAAAACOGFPYA")
|
49
|
+
azure_openai_deployment_name: str = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "gpt-4o")
|
46
50
|
azure_openai_api_version: str = "2024-02-15-preview"
|
47
51
|
|
52
|
+
class Config:
|
53
|
+
validate_assignment = True
|
54
|
+
|
48
55
|
|
49
56
|
@dataclass
|
50
57
|
class CrateMetadata:
|
{rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/core/irl_engine.py
RENAMED
@@ -14,29 +14,13 @@ class IRLEngine(SacredChainBase):
|
|
14
14
|
super().__init__()
|
15
15
|
self.config = config
|
16
16
|
self.canon_registry = canon_registry or CanonRegistry()
|
17
|
-
self.crawler: Optional[Any] = None
|
18
17
|
self.logger = logging.getLogger(__name__)
|
19
18
|
|
20
19
|
async def __aenter__(self) -> "IRLEngine":
|
21
|
-
|
22
|
-
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
23
|
-
browser_config = BrowserConfig(headless=True, browser_type="chromium")
|
24
|
-
self.crawler = AsyncWebCrawler(config=browser_config)
|
25
|
-
await self.crawler.start()
|
26
|
-
self.logger.info("IRL Engine initialized with full traceability")
|
27
|
-
except ImportError:
|
28
|
-
self.logger.warning("Crawl4AI not available - IRL Engine running in limited mode")
|
29
|
-
except Exception as e:
|
30
|
-
self.logger.warning(f"Failed to initialize crawler: {e}")
|
31
|
-
|
20
|
+
self.logger.info("IRL Engine initialized with full traceability")
|
32
21
|
return self
|
33
22
|
|
34
23
|
async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
|
35
|
-
if self.crawler:
|
36
|
-
try:
|
37
|
-
await self.crawler.stop()
|
38
|
-
except Exception as e:
|
39
|
-
self.logger.warning(f"Error stopping crawler: {e}")
|
40
24
|
self._finalize_audit_log()
|
41
25
|
|
42
26
|
def _finalize_audit_log(self) -> None:
|
@@ -120,9 +104,8 @@ class IRLEngine(SacredChainBase):
|
|
120
104
|
reasoning_steps.append(f"Metadata extracted: {len(metadata)} fields")
|
121
105
|
|
122
106
|
docs = {}
|
123
|
-
|
124
|
-
|
125
|
-
reasoning_steps.append(f"Documentation analyzed: quality {docs.get('quality_score', 0):.1f}")
|
107
|
+
docs = await self._analyze_documentation(input_data)
|
108
|
+
reasoning_steps.append(f"Documentation analyzed: quality {docs.get('quality_score', 0):.1f}")
|
126
109
|
|
127
110
|
sentiment = await self._analyze_community_sentiment(input_data)
|
128
111
|
reasoning_steps.append(f"Sentiment analyzed: {sentiment.get('overall', 'unknown')}")
|
@@ -144,9 +127,6 @@ class IRLEngine(SacredChainBase):
|
|
144
127
|
}
|
145
128
|
|
146
129
|
async def _analyze_documentation(self, input_data: str) -> Dict[str, Any]:
|
147
|
-
if not self.crawler:
|
148
|
-
return {"quality_score": 5.0, "error": "No crawler available"}
|
149
|
-
|
150
130
|
try:
|
151
131
|
return {
|
152
132
|
"quality_score": 7.0,
|
@@ -468,7 +468,7 @@ def main() -> None:
|
|
468
468
|
logging.info("Falling back to standard pipeline")
|
469
469
|
|
470
470
|
logging.debug("Creating standard pipeline as Sigil fallback")
|
471
|
-
standard_pipeline = CrateDataPipeline(config)
|
471
|
+
standard_pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
472
472
|
logging.debug("Standard pipeline created, about to run asynchronously")
|
473
473
|
|
474
474
|
# Run standard pipeline (asynchronous)
|
@@ -487,7 +487,7 @@ def main() -> None:
|
|
487
487
|
else:
|
488
488
|
logging.info("Standard pipeline mode")
|
489
489
|
logging.debug("Creating standard pipeline")
|
490
|
-
standard_pipeline = CrateDataPipeline(config)
|
490
|
+
standard_pipeline = CrateDataPipeline(config, **pipeline_kwargs)
|
491
491
|
logging.info(f"Starting pipeline with {len(vars(args))} arguments")
|
492
492
|
logging.debug("Standard pipeline created, about to run asynchronously")
|
493
493
|
|
@@ -34,10 +34,21 @@ except ImportError:
|
|
34
34
|
logging.warning("Enhanced scraping not available - using basic methods")
|
35
35
|
|
36
36
|
|
37
|
+
class CustomJSONEncoder(json.JSONEncoder):
|
38
|
+
"""Custom JSON encoder to handle non-serializable objects"""
|
39
|
+
def default(self, obj):
|
40
|
+
if hasattr(obj, 'to_dict'):
|
41
|
+
return obj.to_dict()
|
42
|
+
elif hasattr(obj, '__dict__'):
|
43
|
+
return obj.__dict__
|
44
|
+
else:
|
45
|
+
return str(obj)
|
46
|
+
|
47
|
+
|
37
48
|
class CrateDataPipeline:
|
38
49
|
"""Orchestrates the entire data collection, enrichment, and analysis pipeline."""
|
39
50
|
|
40
|
-
def __init__(self, config: PipelineConfig) -> None:
|
51
|
+
def __init__(self, config: PipelineConfig, crate_list: "List[str] | None" = None, **kwargs) -> None:
|
41
52
|
self.config = config
|
42
53
|
self.api_client = CrateAPIClient(config)
|
43
54
|
self.github_client = GitHubBatchClient(config)
|
@@ -60,7 +71,13 @@ class CrateDataPipeline:
|
|
60
71
|
# Initialize cargo analyzer
|
61
72
|
self.cargo_analyzer = CrateAnalyzer(".")
|
62
73
|
|
63
|
-
|
74
|
+
# Use provided crate_list or load from file
|
75
|
+
if crate_list:
|
76
|
+
self.crates = crate_list
|
77
|
+
logging.info(f"Using provided crate list: {len(crate_list)} crates")
|
78
|
+
else:
|
79
|
+
self.crates = self._get_crate_list()
|
80
|
+
|
64
81
|
self.output_dir = self._create_output_dir()
|
65
82
|
self.enhanced_scraper: Any = (
|
66
83
|
self._initialize_enhanced_scraper()
|
@@ -280,7 +297,7 @@ class CrateDataPipeline:
|
|
280
297
|
|
281
298
|
with open(filename, "w") as f:
|
282
299
|
for item in data:
|
283
|
-
f.write(json.dumps(item.to_dict()) + "\n")
|
300
|
+
f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
|
284
301
|
|
285
302
|
logging.info(f"Saved checkpoint to {filename}")
|
286
303
|
return filename
|
@@ -297,7 +314,7 @@ class CrateDataPipeline:
|
|
297
314
|
)
|
298
315
|
with open(final_output_path, "w") as f:
|
299
316
|
for item in data:
|
300
|
-
f.write(json.dumps(item.to_dict()) + "\n")
|
317
|
+
f.write(json.dumps(item.to_dict(), cls=CustomJSONEncoder) + "\n")
|
301
318
|
|
302
319
|
# Save dependency analysis
|
303
320
|
dep_file_path = os.path.join(
|
{rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/unified_llm_processor.py
RENAMED
@@ -13,6 +13,7 @@ if TYPE_CHECKING:
|
|
13
13
|
try:
|
14
14
|
import litellm
|
15
15
|
from litellm import completion
|
16
|
+
from litellm.cost_calculator import cost_per_token
|
16
17
|
LITELLM_AVAILABLE = True
|
17
18
|
except ImportError:
|
18
19
|
LITELLM_AVAILABLE = False
|
@@ -44,6 +45,35 @@ class LLMConfig:
|
|
44
45
|
lmstudio_host: Optional[str] = None
|
45
46
|
|
46
47
|
|
48
|
+
class BudgetManager:
|
49
|
+
"""Monitors and enforces spending limits for LLM calls."""
|
50
|
+
|
51
|
+
def __init__(self, budget: float = 90.0):
|
52
|
+
self.budget = budget
|
53
|
+
self.total_cost = 0.0
|
54
|
+
|
55
|
+
def update_cost(self, model: str, completion_tokens: int, prompt_tokens: int) -> None:
|
56
|
+
"""Update the total cost with the latest API call."""
|
57
|
+
try:
|
58
|
+
cost, _ = cost_per_token(
|
59
|
+
model=model,
|
60
|
+
completion_tokens=completion_tokens,
|
61
|
+
prompt_tokens=prompt_tokens,
|
62
|
+
)
|
63
|
+
self.total_cost += cost
|
64
|
+
except Exception:
|
65
|
+
# If cost cannot be determined, do not track.
|
66
|
+
pass
|
67
|
+
|
68
|
+
def is_over_budget(self) -> bool:
|
69
|
+
"""Check if the cumulative cost has exceeded the budget."""
|
70
|
+
return self.total_cost > self.budget
|
71
|
+
|
72
|
+
def get_total_cost(self) -> float:
|
73
|
+
"""Return the current total cost."""
|
74
|
+
return self.total_cost
|
75
|
+
|
76
|
+
|
47
77
|
class Section(TypedDict, total=True):
|
48
78
|
heading: str
|
49
79
|
content: str
|
@@ -62,9 +92,10 @@ class UnifiedLLMProcessor:
|
|
62
92
|
- And all other LiteLLM providers
|
63
93
|
"""
|
64
94
|
|
65
|
-
def __init__(self, config: LLMConfig) -> None:
|
95
|
+
def __init__(self, config: LLMConfig, budget_manager: Optional[BudgetManager] = None) -> None:
|
66
96
|
self.config = config
|
67
97
|
self.logger = logging.getLogger(__name__)
|
98
|
+
self.budget_manager = budget_manager or BudgetManager()
|
68
99
|
|
69
100
|
if not LITELLM_AVAILABLE:
|
70
101
|
raise ImportError("LiteLLM is required. Install with: pip install litellm")
|
@@ -275,72 +306,50 @@ class UnifiedLLMProcessor:
|
|
275
306
|
max_tokens: Optional[int] = None,
|
276
307
|
system_message: str = "You are a helpful AI assistant that analyzes Rust crates and provides insights."
|
277
308
|
) -> Optional[str]:
|
278
|
-
"""Call LLM
|
309
|
+
"""Call the LLM with the given prompt and parameters."""
|
310
|
+
|
311
|
+
if self.budget_manager and self.budget_manager.is_over_budget():
|
312
|
+
self.logger.warning("Budget exceeded. Skipping LLM call.")
|
313
|
+
return None
|
314
|
+
|
315
|
+
model_name = self._get_model_name()
|
316
|
+
|
317
|
+
# Prepare arguments for the completion call
|
318
|
+
args: Dict[str, Any] = {
|
319
|
+
"model": model_name,
|
320
|
+
"messages": [
|
321
|
+
{"role": "system", "content": system_message},
|
322
|
+
{"role": "user", "content": prompt}
|
323
|
+
],
|
324
|
+
"temperature": temperature if temperature is not None else self.config.temperature,
|
325
|
+
"max_tokens": max_tokens if max_tokens is not None else self.config.max_tokens,
|
326
|
+
"timeout": self.config.timeout
|
327
|
+
}
|
328
|
+
|
329
|
+
# Provider-specific arguments
|
330
|
+
if self.config.provider == "azure":
|
331
|
+
args["api_base"] = self.config.api_base
|
332
|
+
args["api_key"] = self.config.api_key
|
333
|
+
args["api_version"] = self.config.azure_api_version
|
334
|
+
# For Azure, model can be just the deployment name
|
335
|
+
args["model"] = self.config.azure_deployment or self.config.model
|
336
|
+
else:
|
337
|
+
args["api_base"] = self._get_api_base()
|
338
|
+
args["api_key"] = self.config.api_key
|
339
|
+
|
279
340
|
try:
|
280
|
-
|
281
|
-
temp = temperature if temperature is not None else self.config.temperature
|
282
|
-
tokens = max_tokens if max_tokens is not None else self.config.max_tokens
|
283
|
-
|
284
|
-
# Prepare the completion call parameters
|
285
|
-
completion_params: Dict[str, Any] = {
|
286
|
-
"model": self._get_model_name(),
|
287
|
-
"messages": [
|
288
|
-
{"role": "system", "content": system_message},
|
289
|
-
{"role": "user", "content": prompt}
|
290
|
-
],
|
291
|
-
"temperature": temp,
|
292
|
-
"max_tokens": tokens,
|
293
|
-
"timeout": self.config.timeout
|
294
|
-
}
|
295
|
-
|
296
|
-
# Add provider-specific parameters
|
297
|
-
if self.config.provider == "azure":
|
298
|
-
if self.config.api_base:
|
299
|
-
completion_params["api_base"] = self.config.api_base
|
300
|
-
if self.config.api_key:
|
301
|
-
completion_params["api_key"] = self.config.api_key
|
302
|
-
if self.config.azure_deployment:
|
303
|
-
completion_params["deployment_id"] = self.config.azure_deployment
|
304
|
-
if self.config.azure_api_version:
|
305
|
-
completion_params["api_version"] = self.config.azure_api_version
|
306
|
-
|
307
|
-
elif self.config.provider in ["ollama", "lmstudio"]:
|
308
|
-
# Local providers don't need API keys
|
309
|
-
pass
|
310
|
-
|
311
|
-
else:
|
312
|
-
# Other providers (OpenAI, Anthropic, etc.)
|
313
|
-
if self.config.api_key:
|
314
|
-
completion_params["api_key"] = self.config.api_key
|
315
|
-
if self.config.api_base:
|
316
|
-
completion_params["api_base"] = self.config.api_base
|
317
|
-
|
318
|
-
self.logger.debug(f"Calling LLM with provider: {self.config.provider}, model: {self.config.model}")
|
341
|
+
response = completion(**args)
|
319
342
|
|
320
|
-
|
343
|
+
# Update budget
|
344
|
+
if self.budget_manager:
|
345
|
+
completion_tokens = response.usage.completion_tokens # type: ignore
|
346
|
+
prompt_tokens = response.usage.prompt_tokens # type: ignore
|
347
|
+
self.budget_manager.update_cost(model=model_name, completion_tokens=completion_tokens, prompt_tokens=prompt_tokens)
|
348
|
+
|
349
|
+
return response.choices[0].message.content # type: ignore
|
321
350
|
|
322
|
-
# Handle different response formats from LiteLLM
|
323
|
-
# LiteLLM has complex response objects that vary by provider
|
324
|
-
try:
|
325
|
-
if hasattr(response, 'choices') and response.choices: # type: ignore[attr-defined]
|
326
|
-
choice = response.choices[0] # type: ignore[attr-defined]
|
327
|
-
if hasattr(choice, 'message') and hasattr(choice.message, 'content'): # type: ignore[attr-defined]
|
328
|
-
return choice.message.content # type: ignore[attr-defined]
|
329
|
-
elif hasattr(choice, 'content'): # type: ignore[attr-defined]
|
330
|
-
return choice.content # type: ignore[attr-defined]
|
331
|
-
elif hasattr(response, 'content'): # type: ignore[attr-defined]
|
332
|
-
return response.content # type: ignore[attr-defined]
|
333
|
-
elif isinstance(response, str):
|
334
|
-
return response
|
335
|
-
else:
|
336
|
-
self.logger.error(f"Unexpected response format: {response}")
|
337
|
-
return None
|
338
|
-
except Exception as e:
|
339
|
-
self.logger.error(f"Error parsing LLM response: {e}")
|
340
|
-
return None
|
341
|
-
|
342
351
|
except Exception as e:
|
343
|
-
self.logger.error(f"
|
352
|
+
self.logger.error(f"LLM call failed: {e}")
|
344
353
|
return None
|
345
354
|
|
346
355
|
def validate_and_retry(
|
@@ -610,7 +619,9 @@ def create_llm_processor_from_config(pipeline_config: PipelineConfig) -> Unified
|
|
610
619
|
max_retries=pipeline_config.max_retries
|
611
620
|
)
|
612
621
|
|
613
|
-
|
622
|
+
budget_manager = BudgetManager(budget=pipeline_config.budget) if pipeline_config.budget is not None else None
|
623
|
+
|
624
|
+
return UnifiedLLMProcessor(llm_config, budget_manager=budget_manager)
|
614
625
|
|
615
626
|
|
616
627
|
def create_llm_processor_from_args(
|
@@ -620,9 +631,10 @@ def create_llm_processor_from_args(
|
|
620
631
|
api_key: Optional[str] = None,
|
621
632
|
temperature: float = 0.2,
|
622
633
|
max_tokens: int = 256,
|
634
|
+
budget: Optional[float] = None,
|
623
635
|
**kwargs
|
624
636
|
) -> UnifiedLLMProcessor:
|
625
|
-
"""Create
|
637
|
+
"""Create a UnifiedLLMProcessor from command-line arguments."""
|
626
638
|
|
627
639
|
llm_config = LLMConfig(
|
628
640
|
provider=provider,
|
@@ -634,4 +646,6 @@ def create_llm_processor_from_args(
|
|
634
646
|
**kwargs
|
635
647
|
)
|
636
648
|
|
637
|
-
|
649
|
+
budget_manager = BudgetManager(budget=budget) if budget is not None else None
|
650
|
+
|
651
|
+
return UnifiedLLMProcessor(llm_config, budget_manager=budget_manager)
|
{rust_crate_pipeline-1.4.1 → rust_crate_pipeline-1.4.3}/rust_crate_pipeline/unified_pipeline.py
RENAMED
@@ -3,6 +3,11 @@ import json
|
|
3
3
|
import logging
|
4
4
|
import time
|
5
5
|
import argparse
|
6
|
+
import os
|
7
|
+
import tempfile
|
8
|
+
import aiohttp
|
9
|
+
import tarfile
|
10
|
+
import gzip
|
6
11
|
from pathlib import Path
|
7
12
|
from typing import Dict, List, Optional, Any, Union, TYPE_CHECKING
|
8
13
|
|
@@ -62,8 +67,6 @@ class UnifiedSigilPipeline:
|
|
62
67
|
"verbose": False,
|
63
68
|
"word_count_threshold": 10,
|
64
69
|
"crawl_config": {
|
65
|
-
"max_retries": self.config.max_retries,
|
66
|
-
"timeout": self.config.crawl4ai_timeout,
|
67
70
|
}
|
68
71
|
}
|
69
72
|
self.scraper = UnifiedScraper(scraper_config)
|
@@ -108,17 +111,22 @@ class UnifiedSigilPipeline:
|
|
108
111
|
if self.scraper:
|
109
112
|
await self.scraper.__aexit__(exc_type, exc_val, exc_tb)
|
110
113
|
|
111
|
-
async def analyze_crate(self, crate_name: str) -> SacredChainTrace:
|
114
|
+
async def analyze_crate(self, crate_name: str, crate_version: Optional[str] = None) -> SacredChainTrace:
|
112
115
|
if not crate_name or not isinstance(crate_name, str):
|
113
116
|
raise ValueError("crate_name must be a non-empty string")
|
114
117
|
|
115
118
|
self.logger.info(f"🔍 Starting analysis of crate: {crate_name}")
|
116
119
|
|
117
120
|
try:
|
121
|
+
if crate_version is None:
|
122
|
+
crate_version = await self._get_latest_crate_version(crate_name)
|
123
|
+
if not crate_version:
|
124
|
+
raise RuntimeError(f"Could not determine latest version for {crate_name}")
|
125
|
+
|
118
126
|
documentation_results = await self._gather_documentation(crate_name)
|
119
127
|
|
120
128
|
sacred_chain_trace = await self._perform_sacred_chain_analysis(
|
121
|
-
crate_name, documentation_results
|
129
|
+
crate_name, crate_version, documentation_results
|
122
130
|
)
|
123
131
|
|
124
132
|
await self._generate_analysis_report(crate_name, sacred_chain_trace)
|
@@ -155,7 +163,7 @@ class UnifiedSigilPipeline:
|
|
155
163
|
raise
|
156
164
|
|
157
165
|
async def _perform_sacred_chain_analysis(
|
158
|
-
self, crate_name: str, documentation_results: Dict[str, ScrapingResult]
|
166
|
+
self, crate_name: str, crate_version: str, documentation_results: Dict[str, ScrapingResult]
|
159
167
|
) -> SacredChainTrace:
|
160
168
|
if not self.irl_engine:
|
161
169
|
raise RuntimeError("IRL Engine not initialized")
|
@@ -173,7 +181,7 @@ class UnifiedSigilPipeline:
|
|
173
181
|
sacred_chain_trace.audit_info["documentation_sources"] = list(documentation_results.keys())
|
174
182
|
|
175
183
|
# Add crate analysis results if available
|
176
|
-
await self._add_crate_analysis_results(crate_name, sacred_chain_trace)
|
184
|
+
await self._add_crate_analysis_results(crate_name, crate_version, sacred_chain_trace)
|
177
185
|
|
178
186
|
# Add AI enrichment if available
|
179
187
|
await self._add_ai_enrichment(crate_name, sacred_chain_trace)
|
@@ -184,22 +192,144 @@ class UnifiedSigilPipeline:
|
|
184
192
|
self.logger.error(f"❌ Sacred Chain analysis failed: {e}")
|
185
193
|
raise
|
186
194
|
|
187
|
-
async def _add_crate_analysis_results(self, crate_name: str, trace: SacredChainTrace) -> None:
|
195
|
+
async def _add_crate_analysis_results(self, crate_name: str, crate_version: str, trace: SacredChainTrace) -> None:
|
188
196
|
"""Add cargo analysis results to the sacred chain trace"""
|
189
197
|
try:
|
190
|
-
|
191
|
-
# In a real implementation, you'd download/extract the crate first
|
192
|
-
self.logger.info(f"🔍 Adding crate analysis results for {crate_name}")
|
193
|
-
|
194
|
-
# This would be implemented based on your crate source strategy
|
195
|
-
# For now, we'll add a placeholder
|
196
|
-
trace.audit_info["crate_analysis"] = {
|
197
|
-
"status": "not_implemented",
|
198
|
-
"note": "Crate analysis requires downloading/extracting the crate source"
|
199
|
-
}
|
198
|
+
self.logger.info(f"🔍 Adding crate analysis results for {crate_name} v{crate_version}")
|
200
199
|
|
200
|
+
with tempfile.TemporaryDirectory() as temp_dir_str:
|
201
|
+
temp_dir = Path(temp_dir_str)
|
202
|
+
crate_source_path = await self._download_and_extract_crate(crate_name, crate_version, temp_dir)
|
203
|
+
|
204
|
+
if not crate_source_path:
|
205
|
+
trace.audit_info["crate_analysis"] = {"status": "error", "note": "Failed to download or extract crate."}
|
206
|
+
return
|
207
|
+
|
208
|
+
check_results = await self._run_cargo_command(
|
209
|
+
["cargo", "check", "--message-format=json"],
|
210
|
+
cwd=crate_source_path
|
211
|
+
)
|
212
|
+
|
213
|
+
clippy_results = await self._run_cargo_command(
|
214
|
+
["cargo", "clippy", "--message-format=json"],
|
215
|
+
cwd=crate_source_path
|
216
|
+
)
|
217
|
+
|
218
|
+
audit_results = await self._run_cargo_audit(crate_source_path)
|
219
|
+
|
220
|
+
trace.audit_info["crate_analysis"] = {
|
221
|
+
"status": "completed",
|
222
|
+
"check": check_results,
|
223
|
+
"clippy": clippy_results,
|
224
|
+
"audit": audit_results,
|
225
|
+
"note": "Crate analysis performed."
|
226
|
+
}
|
227
|
+
|
201
228
|
except Exception as e:
|
202
229
|
self.logger.warning(f"⚠️ Failed to add crate analysis results: {e}")
|
230
|
+
trace.audit_info["crate_analysis"] = {"status": "error", "note": str(e)}
|
231
|
+
|
232
|
+
async def _download_and_extract_crate(self, crate_name: str, crate_version: str, target_dir: Path) -> Optional[Path]:
|
233
|
+
"""Downloads and extracts a crate from crates.io."""
|
234
|
+
crate_url = f"https://static.crates.io/crates/{crate_name}/{crate_name}-{crate_version}.crate"
|
235
|
+
try:
|
236
|
+
async with aiohttp.ClientSession() as session:
|
237
|
+
async with session.get(crate_url) as response:
|
238
|
+
if response.status != 200:
|
239
|
+
self.logger.error(f"Failed to download {crate_url}: HTTP {response.status}")
|
240
|
+
return None
|
241
|
+
|
242
|
+
# Save the .crate file
|
243
|
+
crate_file_path = target_dir / f"{crate_name}-{crate_version}.crate"
|
244
|
+
with open(crate_file_path, "wb") as f:
|
245
|
+
f.write(await response.read())
|
246
|
+
|
247
|
+
# Extract the tarball
|
248
|
+
with gzip.open(crate_file_path, 'rb') as gz_file:
|
249
|
+
with tarfile.open(fileobj=gz_file, mode='r') as tar_file:
|
250
|
+
tar_file.extractall(path=target_dir)
|
251
|
+
|
252
|
+
# The crate is usually extracted into a directory named `{crate_name}-{crate_version}`
|
253
|
+
crate_source_dir = target_dir / f"{crate_name}-{crate_version}"
|
254
|
+
if crate_source_dir.is_dir():
|
255
|
+
return crate_source_dir
|
256
|
+
else:
|
257
|
+
self.logger.error(f"Could not find extracted directory: {crate_source_dir}")
|
258
|
+
return None
|
259
|
+
|
260
|
+
except Exception as e:
|
261
|
+
self.logger.error(f"Error downloading or extracting crate {crate_name}: {e}")
|
262
|
+
return None
|
263
|
+
|
264
|
+
async def _get_latest_crate_version(self, crate_name: str) -> Optional[str]:
|
265
|
+
"""Fetches the latest version of a crate from crates.io API."""
|
266
|
+
api_url = f"https://crates.io/api/v1/crates/{crate_name}"
|
267
|
+
try:
|
268
|
+
async with aiohttp.ClientSession() as session:
|
269
|
+
async with session.get(api_url) as response:
|
270
|
+
if response.status != 200:
|
271
|
+
self.logger.error(f"Failed to fetch crate info from {api_url}: HTTP {response.status}")
|
272
|
+
return None
|
273
|
+
data = await response.json()
|
274
|
+
return data.get("crate", {}).get("max_version")
|
275
|
+
except Exception as e:
|
276
|
+
self.logger.error(f"Error fetching latest crate version for {crate_name}: {e}")
|
277
|
+
return None
|
278
|
+
|
279
|
+
async def _run_cargo_command(self, command: List[str], cwd: Path) -> List[Dict[str, Any]]:
|
280
|
+
"""Runs a cargo command and returns the parsed JSON output."""
|
281
|
+
self.logger.info(f"Running command: {' '.join(command)} in {cwd}")
|
282
|
+
process = await asyncio.create_subprocess_exec(
|
283
|
+
*command,
|
284
|
+
cwd=cwd,
|
285
|
+
stdout=asyncio.subprocess.PIPE,
|
286
|
+
stderr=asyncio.subprocess.PIPE
|
287
|
+
)
|
288
|
+
|
289
|
+
stdout, stderr = await process.communicate()
|
290
|
+
|
291
|
+
if process.returncode != 0:
|
292
|
+
self.logger.warning(f"Cargo command failed with exit code {process.returncode}")
|
293
|
+
self.logger.warning(f"Stderr: {stderr.decode(errors='ignore')}")
|
294
|
+
|
295
|
+
results = []
|
296
|
+
if stdout:
|
297
|
+
for line in stdout.decode(errors='ignore').splitlines():
|
298
|
+
if line.strip():
|
299
|
+
try:
|
300
|
+
results.append(json.loads(line))
|
301
|
+
except json.JSONDecodeError:
|
302
|
+
self.logger.warning(f"Could not parse JSON line: {line}")
|
303
|
+
return results
|
304
|
+
|
305
|
+
async def _run_cargo_audit(self, cwd: Path) -> Optional[Dict[str, Any]]:
|
306
|
+
"""Runs cargo audit and returns the parsed JSON output."""
|
307
|
+
command = ["cargo", "audit", "--json"]
|
308
|
+
self.logger.info(f"Running command: {' '.join(command)} in {cwd}")
|
309
|
+
process = await asyncio.create_subprocess_exec(
|
310
|
+
*command,
|
311
|
+
cwd=cwd,
|
312
|
+
stdout=asyncio.subprocess.PIPE,
|
313
|
+
stderr=asyncio.subprocess.PIPE
|
314
|
+
)
|
315
|
+
|
316
|
+
stdout, stderr = await process.communicate()
|
317
|
+
|
318
|
+
if process.returncode != 0:
|
319
|
+
# cargo-audit exits with a non-zero status code if vulnerabilities are found.
|
320
|
+
# We still want to parse the output.
|
321
|
+
self.logger.info(f"Cargo audit finished with exit code {process.returncode}")
|
322
|
+
|
323
|
+
if stdout:
|
324
|
+
try:
|
325
|
+
return json.loads(stdout)
|
326
|
+
except json.JSONDecodeError:
|
327
|
+
self.logger.warning(f"Could not parse cargo audit JSON output: {stdout.decode(errors='ignore')}")
|
328
|
+
|
329
|
+
if stderr:
|
330
|
+
self.logger.warning(f"Stderr from cargo audit: {stderr.decode(errors='ignore')}")
|
331
|
+
|
332
|
+
return None
|
203
333
|
|
204
334
|
async def _add_ai_enrichment(self, crate_name: str, trace: SacredChainTrace) -> None:
|
205
335
|
"""Add AI enrichment results to the sacred chain trace"""
|
@@ -244,6 +374,9 @@ class UnifiedSigilPipeline:
|
|
244
374
|
enhanced_dependencies=[]
|
245
375
|
)
|
246
376
|
|
377
|
+
# Store the metadata used for enrichment
|
378
|
+
trace.audit_info["crate_metadata"] = mock_crate.to_dict()
|
379
|
+
|
247
380
|
# Enrich the crate using unified LLM processor
|
248
381
|
enriched_crate = self.unified_llm_processor.enrich_crate(mock_crate)
|
249
382
|
|
@@ -295,6 +428,9 @@ class UnifiedSigilPipeline:
|
|
295
428
|
enhanced_dependencies=[]
|
296
429
|
)
|
297
430
|
|
431
|
+
# Store the metadata used for enrichment
|
432
|
+
trace.audit_info["crate_metadata"] = mock_crate.to_dict()
|
433
|
+
|
298
434
|
# Enrich the crate using Azure OpenAI
|
299
435
|
enriched_crate = self.ai_enricher.enrich_crate(mock_crate)
|
300
436
|
|