rust-crate-pipeline 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +52 -0
- rust_crate_pipeline/__main__.py +6 -0
- rust_crate_pipeline/ai_processing.py +396 -0
- rust_crate_pipeline/analysis.py +435 -0
- rust_crate_pipeline/config.py +46 -0
- rust_crate_pipeline/main.py +177 -0
- rust_crate_pipeline/network.py +307 -0
- rust_crate_pipeline/pipeline.py +260 -0
- rust_crate_pipeline/utils/file_utils.py +72 -0
- rust_crate_pipeline/utils/logging_utils.py +66 -0
- rust_crate_pipeline/version.py +13 -0
- rust_crate_pipeline-1.1.0.dist-info/METADATA +473 -0
- rust_crate_pipeline-1.1.0.dist-info/RECORD +17 -0
- rust_crate_pipeline-1.1.0.dist-info/WHEEL +5 -0
- rust_crate_pipeline-1.1.0.dist-info/entry_points.txt +2 -0
- rust_crate_pipeline-1.1.0.dist-info/licenses/LICENSE +21 -0
- rust_crate_pipeline-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
# rust_crate_pipeline/__init__.py
"""
Rust Crate Data Processing Pipeline

A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates.
Includes AI-powered enrichment using local LLMs and dependency analysis.

Example usage:
    from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
    from rust_crate_pipeline.main import main

    # Run the main pipeline
    main()

    # Or use the pipeline class directly
    config = PipelineConfig()
    pipeline = CrateDataPipeline(config)
    pipeline.run()

Components:
    - CrateDataPipeline: Main orchestration class
    - PipelineConfig: Configuration management
    - Various analyzers for AI, security, and dependency analysis
"""

from .version import __version__

__author__ = "SuperUser666-Sigil"
__email__ = "miragemodularframework@gmail.com"
__license__ = "MIT"

# Package metadata is always exported, whether or not the heavy imports work.
__all__ = [
    "__version__",
    "__author__",
    "__email__",
    "__license__"
]

# Import main components for easy access (only if dependencies are available)
try:
    from .pipeline import CrateDataPipeline
    from .config import PipelineConfig
except ImportError:
    # Deliberate best-effort: the package stays importable (for its metadata)
    # when the pipeline's dependencies aren't installed yet.
    pass
else:
    __all__ = ["CrateDataPipeline", "PipelineConfig"] + __all__
|
@@ -0,0 +1,396 @@
|
|
1
|
+
# ai_processing.py
|
2
|
+
import re
|
3
|
+
import time
|
4
|
+
import logging
|
5
|
+
import tiktoken
|
6
|
+
from typing import Callable, Optional
|
7
|
+
from llama_cpp import Llama
|
8
|
+
from .config import PipelineConfig, CrateMetadata, EnrichedCrate
|
9
|
+
|
10
|
+
class LLMEnricher:
|
11
|
+
def __init__(self, config: PipelineConfig):
    """Keep the pipeline configuration and prepare the tokenizer and model.

    Args:
        config: Pipeline settings; ``_load_model`` reads ``model_path`` from
            it, and other methods read further fields from it.
    """
    # Must be assigned first: _load_model() below reads self.config.
    self.config = config
    # Tokenizer used by the token counting / truncation helpers.
    # NOTE(review): cl100k_base is presumably not the vocabulary of the model
    # loaded below, so token counts would be approximations - confirm.
    self.tokenizer = tiktoken.get_encoding("cl100k_base")
    self.model = self._load_model()
|
15
|
+
|
16
|
+
def _load_model(self):
    """Build the ``Llama`` instance for the model file named in the config.

    Only ``model_path`` comes from the configuration; the remaining
    llama.cpp options are fixed here.

    Returns:
        The constructed ``Llama`` object.
    """
    # NOTE(review): enrich_crate() requests up to 2000 README tokens, which is
    # more than the n_ctx value below - confirm the intended context size.
    llama_options = {
        "model_path": self.config.model_path,
        "n_ctx": 1024,
        "n_batch": 512,
        "n_gpu_layers": 32,
    }
    return Llama(**llama_options)
|
23
|
+
|
24
|
+
def estimate_tokens(self, text: str) -> int:
    """Return the number of tokens ``self.tokenizer`` produces for ``text``."""
    token_ids = self.tokenizer.encode(text)
    return len(token_ids)
|
26
|
+
|
27
|
+
def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
    """Truncate content to fit within token limit.

    Whole paragraphs (separated by a blank line, i.e. ``"\\n\\n"``) are kept
    in order until the next one would exceed ``max_tokens``. If even the
    first paragraph is too large, it is cut at the token level instead of
    returning nothing, so an over-long single-paragraph prompt is shortened
    rather than erased.

    Args:
        content: Text to shorten. ``None`` or an empty string yields ``""``.
        max_tokens: Token budget, measured with ``self.tokenizer``; the
            paragraph separators themselves are not counted.

    Returns:
        The truncated text with surrounding whitespace stripped.
    """
    if not content:
        return ""

    budget = max(max_tokens, 0)
    kept = []
    used = 0

    for para in content.split("\n\n"):
        cost = len(self.tokenizer.encode(para))
        if used + cost > budget:
            break
        kept.append(para)
        used += cost

    if kept:
        return "\n\n".join(kept).strip()

    if budget == 0:
        return ""

    # Nothing fit: the first paragraph alone is over budget. Fall back to a
    # hard cut of that paragraph at the token boundary.
    first_para_ids = self.tokenizer.encode(content.split("\n\n", 1)[0])
    return self.tokenizer.decode(first_para_ids[:budget]).strip()
|
40
|
+
|
41
|
+
def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
    """Intelligently truncate content to preserve the most important parts.

    Content that already fits is returned unchanged. Otherwise the text is
    split into sections at markdown headers, each section is given a
    priority from its heading (usage/examples highest), and sections are
    emitted highest priority first until the token budget is used up.

    Header detection is skipped inside fenced code blocks, so lines such as
    ``# use foo;`` in a code sample stay with their section. Each section's
    own header line is emitted once; only the leading header-less text gets
    a synthetic ``## Introduction`` heading.

    Args:
        content: Markdown text to shorten; empty/``None`` yields ``""``.
        max_tokens: Token budget, measured with ``self.tokenizer``.

    Returns:
        The selected sections concatenated in priority order.
    """
    if not content:
        return ""

    # If content is short enough, return it all
    if len(self.tokenizer.encode(content)) <= max_tokens:
        return content

    header_re = re.compile(r'^#+\s+')
    # First matching rule wins; headings matching none get priority 5.
    priority_rules = (
        (r'\b(usage|example|getting started)\b', 10),
        (r'\b(feature|overview|about)\b', 9),
        (r'\b(install|setup|config)\b', 8),
        (r'\b(api|interface)\b', 7),
    )

    sections = []
    current = {"heading": "Introduction", "content": "", "priority": 10, "has_header": False}
    in_fence = False

    for line in content.splitlines():
        is_fence_marker = line.lstrip().startswith("```")

        if not in_fence and not is_fence_marker and header_re.match(line):
            # Save previous section if not empty
            if current["content"].strip():
                sections.append(current)

            heading = header_re.sub('', line)
            priority = next(
                (value for pattern, value in priority_rules if re.search(pattern, heading, re.I)),
                5,
            )
            # The header line itself is stored as the first line of the content.
            current = {"heading": heading, "content": line + "\n", "priority": priority, "has_header": True}
        else:
            current["content"] += line + "\n"

            # Boost priority if code block is found
            if "```rust" in line or "```no_run" in line:
                current["priority"] = max(current["priority"], 8)

        if is_fence_marker:
            in_fence = not in_fence

    # Add the last section
    if current["content"].strip():
        sections.append(current)

    # Highest priority first; the sort is stable, so ties keep document order.
    sections.sort(key=lambda section: section["priority"], reverse=True)

    pieces = []
    tokens_used = 0

    for section in sections:
        if section["has_header"]:
            # Content already starts with the original header line.
            section_text = f"{section['content']}\n"
        else:
            section_text = f"## {section['heading']}\n{section['content']}\n"
        section_ids = self.tokenizer.encode(section_text)

        if tokens_used + len(section_ids) <= max_tokens:
            pieces.append(section_text)
            tokens_used += len(section_ids)
        elif tokens_used < max_tokens - 100:
            # More than 100 tokens are still free: take a truncated version
            # of this section and stop.
            remaining_tokens = max_tokens - tokens_used
            pieces.append(self.tokenizer.decode(section_ids[:remaining_tokens]))
            break

    return "".join(pieces)
|
108
|
+
|
109
|
+
def clean_output(self, output: str, task: str = "general") -> str:
    """Tidy raw model text according to the task that produced it.

    Args:
        output: Raw text from the model; empty/``None`` yields ``""``.
        task: ``"classification"`` returns the first known category found
            in the text (or ``"Unknown"``); ``"factual_pairs"`` re-formats
            the matched factual/counterfactual statements into pairs; any
            other value strips each line and drops blank ones.

    Returns:
        The cleaned text.
    """
    if not output:
        return ""

    # Everything from the end marker onwards is discarded.
    text = output.split("<|end|>")[0].strip()

    if task == "classification":
        known_categories = ("AI", "Database", "Web Framework", "Networking", "Serialization",
                            "Utilities", "DevTools", "ML", "Cryptography", "Unknown")
        # Categories are tried in this order; the first whole-word,
        # case-insensitive hit wins.
        return next(
            (
                name
                for name in known_categories
                if re.search(r'\b' + re.escape(name) + r'\b', text, re.IGNORECASE)
            ),
            "Unknown",
        )

    if task == "factual_pairs":
        true_statements = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', text, re.DOTALL)
        false_statements = re.findall(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', text, re.DOTALL)
        # zip() stops at the shorter list, so unmatched statements are dropped.
        return "\n\n".join(
            f"✅ Factual: {fact.strip()}\n❌ Counterfactual: {counter.strip()}"
            for fact, counter in zip(true_statements, false_statements)
        )

    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)
|
142
|
+
|
143
|
+
def run_llama(self, prompt: str, temp: float = 0.2, max_tokens: int = 256) -> Optional[str]:
    """Send ``prompt`` to the model and return its cleaned completion.

    A prompt longer than ``config.prompt_token_margin`` tokens is first
    shortened with ``truncate_content`` to 100 tokens below that margin.

    Args:
        prompt: Full prompt text.
        temp: Sampling temperature passed to the model.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The first choice's text after ``clean_output``.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    stop_markers = ["<|end|>", "<|user|>", "<|system|>"]  # Stop at these tokens
    try:
        prompt_tokens = self.estimate_tokens(prompt)
        margin = self.config.prompt_token_margin
        if prompt_tokens > margin:
            logging.warning(f"Prompt too long ({prompt_tokens} tokens). Truncating.")
            prompt = self.truncate_content(prompt, margin - 100)

        completion = self.model(
            prompt,
            max_tokens=max_tokens,
            temperature=temp,
            stop=stop_markers,
        )
        first_choice = completion["choices"][0]
        return self.clean_output(first_choice["text"])
    except Exception as e:
        logging.error(f"Model inference failed: {str(e)}")
        raise
|
163
|
+
|
164
|
+
def validate_and_retry(
    self,
    prompt: str,
    validation_func: Callable[[str], bool],
    temp: float = 0.2,
    max_tokens: int = 256,
    retries: int = 3,
    backoff_base: float = 1.5
) -> Optional[str]:
    """Run LLM with validation and automatic retry on failure.

    Each attempt raises the temperature by 10% of ``temp`` per prior attempt
    to encourage a different answer. After the second-to-last attempt fails
    validation the prompt is simplified, so the final attempt runs on the
    simplified prompt. Attempts are separated by an exponential backoff; no
    delay is added after the final attempt.

    Args:
        prompt: Prompt text for the model.
        validation_func: Returns ``True`` when a non-empty result is acceptable.
        temp: Base sampling temperature.
        max_tokens: Maximum number of tokens to generate per attempt.
        retries: Total number of attempts.
        backoff_base: Seconds slept after the first failed attempt; the
            delay doubles after every further failed attempt.

    Returns:
        The first result that passes validation, or ``None`` when every
        attempt failed (generation errors are logged, not raised).
    """
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            # Adjust temperature slightly upward on retries to get different results
            adjusted_temp = temp * (1 + (attempt * 0.1))
            result = self.run_llama(prompt, temp=adjusted_temp, max_tokens=max_tokens)

            # Validate the result
            if result and validation_func(result):
                return result

            # If we get here, validation failed
            if is_last_attempt:
                logging.warning(f"Validation failed on attempt {attempt+1}/{retries}. No retries left.")
            else:
                logging.warning(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with modified parameters.")

            # For the last attempt, simplify the prompt
            if attempt == retries - 2:
                prompt = self.simplify_prompt(prompt)

        except Exception as e:
            logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")

        # Backoff before retry; pointless once there is no retry left.
        if not is_last_attempt:
            time.sleep(backoff_base * (2 ** attempt))

    # If we exhaust all retries, return None
    return None
|
198
|
+
|
199
|
+
def simplify_prompt(self, prompt: str) -> str:
    """Return a leaner version of ``prompt`` for a final retry.

    Two rewrites are applied in order: the few-shot ``# Example N`` blocks
    are removed (up to the ``# Crate to Classify`` marker, or to the end of
    the text when that marker is absent), then the system message is
    replaced with a short "Be concise." instruction.
    """
    few_shot_examples = re.compile(r'# Example [0-9].*?(?=# Crate to Classify|\Z)', re.DOTALL)
    system_preamble = re.compile(r'<\|system\|>.*?<\|user\|>', re.DOTALL)

    without_examples = few_shot_examples.sub('', prompt)
    return system_preamble.sub('<|system|>Be concise.\n<|user|>', without_examples)
|
208
|
+
|
209
|
+
def validate_classification(self, result: str) -> bool:
    """Tell whether ``result`` is exactly one of the known category names.

    Surrounding whitespace and letter case are ignored; any additional text
    makes the result invalid.
    """
    if not result:
        return False

    allowed = ("AI", "Database", "Web Framework", "Networking", "Serialization",
               "Utilities", "DevTools", "ML", "Cryptography", "Unknown")
    candidate = result.strip().lower()
    return candidate in {name.lower() for name in allowed}
|
216
|
+
|
217
|
+
def validate_factual_pairs(self, result: str) -> bool:
    """Check that ``result`` holds enough factual/counterfactual statements.

    The output is accepted when it contains at least three factual and at
    least three counterfactual statements.
    """
    if not result:
        return False

    fact_pattern = r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)'
    counterfact_pattern = r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)'

    fact_count = len(re.findall(fact_pattern, result, re.DOTALL))
    counterfact_count = len(re.findall(counterfact_pattern, result, re.DOTALL))

    # Both kinds must reach the minimum of three.
    return min(fact_count, counterfact_count) >= 3
|
226
|
+
|
227
|
+
def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
    """Apply all AI enrichments to a crate.

    Enrichment is best-effort: if any step raises, the error is logged and
    the crate is returned with whatever fields were filled in so far.

    Args:
        crate: Base metadata for the crate.

    Returns:
        An ``EnrichedCrate`` built from the crate's attributes, with the
        README summary (when a README exists), feature summary, use case,
        score and factual/counterfactual pairs set.
    """
    # Convert CrateMetadata to EnrichedCrate
    enriched = EnrichedCrate(**crate.__dict__.copy())

    try:
        # Generate README summary first
        if crate.readme:
            readme_content = self.smart_truncate(crate.readme, 2000)
            prompt = (
                f"<|system|>Extract key features from README.\n"
                f"<|user|>Summarize key aspects of this Rust crate from its README:\n{readme_content}\n"
                f"<|end|>"
            )
            enriched.readme_summary = self.validate_and_retry(
                prompt,
                lambda x: len(x) > 50,
                temp=0.3,
                max_tokens=300
            )

        # Generate other enrichments
        enriched.feature_summary = self.summarize_features(crate)
        enriched.use_case = self.classify_use_case(
            crate,
            enriched.readme_summary or ""
        )
        enriched.score = self.score_crate(crate)
        # Pass the enriched object, not the raw crate: generate_factual_pairs
        # looks up ``readme_summary`` on its argument, and only ``enriched``
        # has the summary produced above (it was built from every attribute
        # of ``crate``, so the other fields read there are present too).
        enriched.factual_counterfactual = self.generate_factual_pairs(enriched)
    except Exception as e:
        logging.error(f"Failed to enrich {crate.name}: {str(e)}")

    return enriched
|
265
|
+
|
266
|
+
def summarize_features(self, crate: CrateMetadata) -> str:
    """Ask the model to explain the crate's features.

    At most the first eight features are listed in the prompt, each with the
    dependencies it names. Any failure is logged as a warning and replaced
    by a fixed fallback sentence.

    Returns:
        The model's explanation, a notice when the crate has no features,
        or the fallback sentence.
    """
    fallback = "Feature summary not available."
    try:
        if not crate.features:
            return "No features documented for this crate."

        # One bullet per feature; capped at 8 to keep the prompt small.
        bullet_lines = []
        for feature in crate.features[:8]:
            name = feature.get("name", "")
            required = feature.get("dependencies", [])
            listed = ", ".join(required) if required else "none"
            bullet_lines.append(f"- {name} (dependencies: {listed})\n")
        feature_text = "".join(bullet_lines)

        prompt = (
            "<|system|>You are a Rust programming expert analyzing crate features.\n"
            f"<|user|>For the Rust crate `{crate.name}`, explain these features and what functionality they provide:\n\n"
            f"{feature_text}\n\n"
            "Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
            "<|end|>"
        )

        explanation = self.run_llama(prompt, temp=0.2, max_tokens=350)
        if explanation:
            return explanation
        return fallback
    except Exception as e:
        logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
        return fallback
|
294
|
+
|
295
|
+
def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
    """Classify the use case of a crate with rich context.

    Builds a few-shot prompt from the crate's description, keywords, README
    summary and up to five "normal" dependencies, then asks the model for a
    single category. The answer is normalised with the classification mode
    of ``clean_output``, so replies such as ``"Category: Networking"`` are
    accepted instead of being retried until they fail.

    Args:
        crate: Metadata of the crate to classify.
        readme_summary: Summary of the README (may be empty).

    Returns:
        One of the known category names; ``"Unknown"`` when the model gives
        no usable answer or any step raises (the error is logged).
    """
    try:
        # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
        available_prompt_tokens = self.config.model_token_limit - 200  # Reserve for response

        joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
        # Skip entries without a crate id: a None here would break the join.
        key_deps = [
            dep.get("crate_id")
            for dep in crate.dependencies[:5]
            if dep.get("kind") == "normal" and dep.get("crate_id")
        ]
        key_deps_str = ", ".join(key_deps) if key_deps else "None"

        # Adaptively truncate different sections based on importance
        token_budget = available_prompt_tokens - 400  # Reserve tokens for prompt template

        # Allocate different percentages to each section
        desc_tokens = int(token_budget * 0.2)
        readme_tokens = int(token_budget * 0.6)

        desc = self.truncate_content(crate.description or "", desc_tokens)
        readme_summary = self.smart_truncate(readme_summary, readme_tokens)

        # Few-shot prompting with examples
        prompt = (
            f"<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
            f"<|user|>\n"
            f"# Example 1\n"
            f"Crate: `tokio`\n"
            f"Description: An asynchronous runtime for the Rust programming language\n"
            f"Keywords: async, runtime, futures\n"
            f"Key Dependencies: mio, bytes, parking_lot\n"
            f"Category: Networking\n\n"

            f"# Example 2\n"
            f"Crate: `serde`\n"
            f"Description: A generic serialization/deserialization framework\n"
            f"Keywords: serde, serialization\n"
            f"Key Dependencies: serde_derive\n"
            f"Category: Serialization\n\n"

            f"# Crate to Classify\n"
            f"Crate: `{crate.name}`\n"
            f"Description: {desc}\n"
            f"Keywords: {joined}\n"
            f"README Summary: {readme_summary}\n"
            f"Key Dependencies: {key_deps_str}\n\n"
            f"Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
            f"<|end|>"
        )

        def _names_a_category(raw: str) -> bool:
            # clean_output falls back to "Unknown" when nothing matches, so
            # "Unknown" only counts when the model actually wrote it.
            category = self.clean_output(raw, "classification")
            return category != "Unknown" or "unknown" in raw.lower()

        # Validate classification with retry
        result = self.validate_and_retry(
            prompt,
            validation_func=_names_a_category,
            temp=0.1,
            max_tokens=20
        )

        if not result:
            return "Unknown"
        return self.clean_output(result, "classification")
    except Exception as e:
        logging.error(f"Classification failed for {crate.name}: {str(e)}")
        return "Unknown"
|
355
|
+
|
356
|
+
def generate_factual_pairs(self, crate: CrateMetadata) -> str:
    """Ask the model for factual/counterfactual statement pairs about a crate.

    The prompt includes the crate's name, truncated description, repository,
    README summary (when the object passed in has one) and the names of up
    to five features. The output is validated with
    ``validate_factual_pairs`` and retried on failure.

    Returns:
        The model's pairs, or a fixed failure message when validation never
        passes or any step raises (the error is logged).
    """
    failure_message = "Factual pairs generation failed."
    try:
        desc = self.truncate_content(crate.description, 300)
        summary_source = getattr(crate, 'readme_summary', '') or ''
        readme_summary = self.truncate_content(summary_source, 300)

        prompt = (
            "<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
            "Factual statements must be true. Counterfactuals should be plausible but incorrect - "
            "make them subtle and convincing rather than simple negations.\n"
            "<|user|>\n"
            f"Crate: {crate.name}\n"
            f"Description: {desc}\n"
            f"Repo: {crate.repository}\n"
            f"README Summary: {readme_summary}\n"
            f"Key Features: {', '.join(f.get('name', '') for f in crate.features[:5])}\n\n"
            "Format each pair as:\n"
            "✅ Factual: [true statement about the crate]\n"
            "❌ Counterfactual: [plausible but false statement]\n\n"
            "Create exactly 5 pairs.\n"
            "<|end|>"
        )

        # A higher temperature than the other tasks is used for this one.
        pairs = self.validate_and_retry(
            prompt,
            validation_func=self.validate_factual_pairs,
            temp=0.6,
            max_tokens=500
        )
        if pairs:
            return pairs
        return failure_message
    except Exception as e:
        logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
        return failure_message
|
391
|
+
|
392
|
+
def score_crate(self, crate: CrateMetadata) -> float:
    """Compute a numeric score for the crate.

    The score adds one point per 1000 downloads, ten points per GitHub
    star, and one point per 500 characters of the README after it has been
    truncated to 1000 tokens. The total is rounded to two decimals.
    """
    popularity = (crate.downloads / 1000) + (crate.github_stars * 10)
    readme_excerpt = self.truncate_content(crate.readme, 1000)
    readme_bonus = len(readme_excerpt) / 500
    return round(popularity + readme_bonus, 2)
|