rust-crate-pipeline 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
# rust_crate_pipeline/__init__.py
"""
Rust Crate Data Processing Pipeline

A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates.
Includes AI-powered enrichment using local LLMs and dependency analysis.

Example usage:
    from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
    from rust_crate_pipeline.main import main

    # Run the main pipeline
    main()

    # Or use the pipeline class directly
    config = PipelineConfig()
    pipeline = CrateDataPipeline(config)
    pipeline.run()

Components:
    - CrateDataPipeline: Main orchestration class
    - PipelineConfig: Configuration management
    - Various analyzers for AI, security, and dependency analysis
"""

from .version import __version__

__author__ = "SuperUser666-Sigil"
__email__ = "miragemodularframework@gmail.com"
__license__ = "MIT"

# Package metadata is always exported, whether or not the heavy imports work.
__all__ = [
    "__version__",
    "__author__",
    "__email__",
    "__license__",
]

# Import main components for easy access (only if dependencies are available).
try:
    from .pipeline import CrateDataPipeline
    from .config import PipelineConfig
except ImportError:
    # Dependencies aren't installed yet: keep the package importable and
    # export the metadata names only.
    pass
else:
    # Both components imported successfully, so advertise them first.
    __all__ = ["CrateDataPipeline", "PipelineConfig", *__all__]
@@ -0,0 +1,6 @@
1
# __main__.py
"""Module entry point used when the package is executed with ``python -m``."""

if __name__ == "__main__":
    # Imported here, inside the guard, so that merely importing this module
    # does not pull in the pipeline's main module.
    from .main import main as run_pipeline

    run_pipeline()
@@ -0,0 +1,396 @@
1
+ # ai_processing.py
2
+ import re
3
+ import time
4
+ import logging
5
+ import tiktoken
6
+ from typing import Callable, Optional
7
+ from llama_cpp import Llama
8
+ from .config import PipelineConfig, CrateMetadata, EnrichedCrate
9
+
10
class LLMEnricher:
    """Produce LLM-based enrichments for crate metadata.

    The class wraps a llama.cpp model and offers helpers to truncate text to
    a token budget, clean and validate model output, and build the prompts
    for README summaries, feature summaries, use-case classification and
    factual/counterfactual pairs.

    Token counts are measured with the tiktoken ``cl100k_base`` encoding.
    NOTE(review): that is presumably not the loaded model's own tokenizer,
    so every token budget below should be read as an estimate.
    """

    # Labels accepted by the classifier; shared by output cleaning and validation.
    CATEGORIES = (
        "AI", "Database", "Web Framework", "Networking", "Serialization",
        "Utilities", "DevTools", "ML", "Cryptography", "Unknown",
    )

    # Patterns are compiled once here instead of on every line / every call.
    _HEADER_RE = re.compile(r'^#+\s+')
    _FENCE_RE = re.compile(r'^\s*(```|~~~)')
    _HEADING_PRIORITIES = (
        (re.compile(r'\b(usage|example|getting started)\b', re.I), 10),
        (re.compile(r'\b(feature|overview|about)\b', re.I), 9),
        (re.compile(r'\b(install|setup|config)\b', re.I), 8),
        (re.compile(r'\b(api|interface)\b', re.I), 7),
    )
    _FACT_RE = re.compile(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', re.DOTALL)
    _COUNTERFACT_RE = re.compile(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', re.DOTALL)

    def __init__(self, config: "PipelineConfig"):
        """Store the config, load the tiktoken encoding and load the model."""
        self.config = config
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.model = self._load_model()

    def _load_model(self):
        """Instantiate the llama.cpp model found at ``config.model_path``."""
        # NOTE(review): context size, batch size and GPU layer count are
        # hard-coded rather than read from the config - confirm n_ctx=1024 is
        # large enough for the prompts built by the methods below.
        return Llama(
            model_path=self.config.model_path,
            n_ctx=1024,
            n_batch=512,
            n_gpu_layers=32
        )

    def estimate_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to with the tokenizer."""
        return len(self.tokenizer.encode(text))

    def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
        """Truncate content to fit within token limit.

        Whole paragraphs (separated by a blank line) are kept, in order,
        until the next one would exceed ``max_tokens``. The separators
        themselves are not counted against the budget.

        If nothing could be kept because the first non-empty paragraph is
        already too long, that paragraph is cut at the token level instead of
        returning an empty string. ``None``, empty content, or a non-positive
        budget yield ``""``.
        """
        if not content or max_tokens <= 0:
            return ""

        kept = []
        used = 0
        overflow = None  # token ids of the first paragraph that did not fit
        for para in content.split("\n\n"):
            token_ids = self.tokenizer.encode(para)
            if used + len(token_ids) > max_tokens:
                overflow = token_ids
                break
            kept.append(para)
            used += len(token_ids)

        text = "\n\n".join(kept).strip()
        if not text and overflow is not None:
            # Returning "" here would silently discard the whole input (and
            # could produce an empty prompt), so keep a hard-cut prefix.
            text = self.tokenizer.decode(overflow[:max_tokens - used]).strip()
        return text

    def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
        """Intelligently truncate content to preserve the most important parts.

        Content that already fits is returned unchanged. Otherwise it is
        split into markdown sections, the sections are ordered by priority
        (highest first, original order among equals) and appended while they
        fit. When a section does not fit but more than 100 tokens of budget
        remain, a token-level prefix of it is appended and processing stops.
        """
        if not content:
            return ""

        # If content is short enough, return it all
        if len(self.tokenizer.encode(content)) <= max_tokens:
            return content

        sections = self._split_markdown_sections(content)
        # list.sort is stable, so equally ranked sections keep document order.
        sections.sort(key=lambda section: section["priority"], reverse=True)

        parts = []
        tokens_used = 0
        for section in sections:
            if section["has_header"]:
                # The content already starts with the original header line;
                # adding a synthetic one would duplicate the heading.
                section_text = f"{section['content']}\n"
            else:
                section_text = f"## {section['heading']}\n{section['content']}\n"
            section_ids = self.tokenizer.encode(section_text)

            if tokens_used + len(section_ids) <= max_tokens:
                parts.append(section_text)
                tokens_used += len(section_ids)
            elif tokens_used < max_tokens - 100:  # If we can fit a truncated version
                remaining_tokens = max_tokens - tokens_used
                parts.append(self.tokenizer.decode(section_ids[:remaining_tokens]))
                break

        return "".join(parts)

    def _split_markdown_sections(self, content: str) -> list:
        """Split markdown into section dicts (heading, content, priority, has_header)."""
        sections = []
        # Text before the first header is collected under a synthetic heading.
        current = {"heading": "Introduction", "content": "", "priority": 10, "has_header": False}
        in_fence = False

        for line in content.splitlines():
            is_fence = bool(self._FENCE_RE.match(line))

            # A "# ..." line inside a fenced code block is a code comment,
            # not a markdown header, so headers are only detected outside fences.
            if not in_fence and self._HEADER_RE.match(line):
                if current["content"].strip():
                    sections.append(current)
                heading = self._HEADER_RE.sub('', line)
                current = {
                    "heading": heading,
                    "content": line + "\n",
                    "priority": self._heading_priority(heading),
                    "has_header": True,
                }
            else:
                current["content"] += line + "\n"
                # Boost priority if code block is found
                if "```rust" in line or "```no_run" in line:
                    current["priority"] = max(current["priority"], 8)

            if is_fence:
                in_fence = not in_fence

        # Add the last section
        if current["content"].strip():
            sections.append(current)
        return sections

    def _heading_priority(self, heading: str) -> int:
        """Rank a heading from 10 (usage/examples) down to the default of 5."""
        for pattern, priority in self._HEADING_PRIORITIES:
            if pattern.search(heading):
                return priority
        return 5

    def clean_output(self, output: str, task: str = "general") -> str:
        """Task-specific output cleaning.

        Everything after the first ``<|end|>`` marker is dropped, then:

        * ``"classification"`` - return the known category mentioned first in
          the output, or ``"Unknown"`` when none is mentioned;
        * ``"factual_pairs"`` - return the factual/counterfactual pairs found,
          re-formatted and separated by blank lines (``""`` when none);
        * anything else - strip each line and drop empty lines.
        """
        if not output:
            return ""

        # Remove any remaining prompt artifacts
        output = output.split("<|end|>")[0].strip()

        if task == "classification":
            return self._extract_category(output)

        if task == "factual_pairs":
            facts = self._FACT_RE.findall(output)
            counterfacts = self._COUNTERFACT_RE.findall(output)
            # zip stops at the shorter list, so unpaired statements are dropped.
            return "\n\n".join(
                f"✅ Factual: {fact.strip()}\n❌ Counterfactual: {counterfact.strip()}"
                for fact, counterfact in zip(facts, counterfacts)
            )

        # General cleaning - more permissive than before
        return "\n".join(line.strip() for line in output.splitlines() if line.strip())

    def _extract_category(self, output: str) -> str:
        """Return the category that appears earliest in ``output``, else "Unknown"."""
        best = None  # (position, category) of the earliest whole-word match
        for category in self.CATEGORIES:
            match = re.search(r'\b' + re.escape(category) + r'\b', output, re.IGNORECASE)
            if match and (best is None or match.start() < best[0]):
                best = (match.start(), category)
        return best[1] if best else "Unknown"

    def run_llama(
        self,
        prompt: str,
        temp: float = 0.2,
        max_tokens: int = 256,
        task: str = "general"
    ) -> Optional[str]:
        """Run the LLM with customizable parameters per task.

        A prompt longer than ``config.prompt_token_margin`` tokens is
        truncated to 100 tokens below that margin before generation. The raw
        completion is passed through ``clean_output`` for ``task``.

        Raises:
            Exception: any error raised during inference is logged and re-raised.
        """
        try:
            token_count = self.estimate_tokens(prompt)
            if token_count > self.config.prompt_token_margin:
                logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
                prompt = self.truncate_content(prompt, self.config.prompt_token_margin - 100)

            output = self.model(
                prompt,
                max_tokens=max_tokens,
                temperature=temp,
                stop=["<|end|>", "<|user|>", "<|system|>"]  # Stop at these tokens
            )

            raw_text = output["choices"][0]["text"]
            # Forward the task so the task-specific cleaning is actually applied.
            return self.clean_output(raw_text, task)
        except Exception as e:
            logging.error(f"Model inference failed: {str(e)}")
            raise

    def validate_and_retry(
        self,
        prompt: str,
        validation_func: Callable[[str], bool],
        temp: float = 0.2,
        max_tokens: int = 256,
        retries: int = 3,
        task: str = "general"
    ) -> Optional[str]:
        """Run LLM with validation and automatic retry on failure.

        Each attempt raises the temperature by 10% of ``temp``. After the
        second-to-last attempt fails validation the prompt is simplified.
        Attempts are separated by an exponential backoff (1.5s, 3s, ...).

        Returns:
            The first cleaned result accepted by ``validation_func``, or
            ``None`` once all ``retries`` attempts have failed.
        """
        for attempt in range(retries):
            try:
                # Adjust temperature slightly upward on retries to get different results
                adjusted_temp = temp * (1 + (attempt * 0.1))
                result = self.run_llama(prompt, temp=adjusted_temp, max_tokens=max_tokens, task=task)

                # Validate the result
                if result and validation_func(result):
                    return result

                # If we get here, validation failed
                logging.warning(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with modified parameters.")

                # For the last attempt, simplify the prompt
                if attempt == retries - 2:
                    prompt = self.simplify_prompt(prompt)

            except Exception as e:
                logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")

            # Backoff before retry; pointless after the final attempt.
            if attempt < retries - 1:
                time.sleep(1.5 * (2 ** attempt))

        # If we exhaust all retries, return None
        return None

    def simplify_prompt(self, prompt: str) -> str:
        """Simplify a prompt by removing examples and reducing context."""
        # Remove few-shot examples (everything from the first "# Example N"
        # up to the "# Crate to Classify" marker or the end of the prompt).
        prompt = re.sub(r'# Example [0-9].*?(?=# Crate to Classify|\Z)', '', prompt, flags=re.DOTALL)

        # Make instructions more direct
        prompt = re.sub(r'<\|system\|>.*?<\|user\|>', '<|system|>Be concise.\n<|user|>', prompt, flags=re.DOTALL)

        return prompt

    def validate_classification(self, result: str) -> bool:
        """Return True when ``result`` is exactly one known category (case-insensitive)."""
        if not result:
            return False
        candidate = result.strip().lower()
        return any(category.lower() == candidate for category in self.CATEGORIES)

    def validate_factual_pairs(self, result: str) -> bool:
        """Return True when at least 3 factual and 3 counterfactual statements exist."""
        if not result:
            return False

        facts = self._FACT_RE.findall(result)
        counterfacts = self._COUNTERFACT_RE.findall(result)

        return len(facts) >= 3 and len(counterfacts) >= 3  # At least 3 pairs

    def enrich_crate(self, crate: "CrateMetadata") -> "EnrichedCrate":
        """Apply all AI enrichments to a crate.

        Enrichment is best-effort: if any step raises, the error is logged
        and the partially enriched object is returned.
        """
        # Convert CrateMetadata to EnrichedCrate
        enriched = EnrichedCrate(**crate.__dict__.copy())

        try:
            # Generate README summary first
            if crate.readme:
                readme_content = self.smart_truncate(crate.readme, 2000)
                prompt = (
                    f"<|system|>Extract key features from README.\n"
                    f"<|user|>Summarize key aspects of this Rust crate from its README:\n{readme_content}\n"
                    f"<|end|>"
                )
                enriched.readme_summary = self.validate_and_retry(
                    prompt,
                    lambda x: len(x) > 50,
                    temp=0.3,
                    max_tokens=300
                )

            # Generate other enrichments
            enriched.feature_summary = self.summarize_features(crate)
            enriched.use_case = self.classify_use_case(
                crate,
                enriched.readme_summary or ""
            )
            enriched.score = self.score_crate(crate)
            # Pass the enriched object: it carries the README summary that the
            # factual-pairs prompt looks up, which the raw crate object was
            # being asked for via getattr and did not supply.
            enriched.factual_counterfactual = self.generate_factual_pairs(enriched)
        except Exception as e:
            logging.error(f"Failed to enrich {crate.name}: {str(e)}")

        return enriched

    def summarize_features(self, crate: "CrateMetadata") -> str:
        """Generate summaries for crate features with better prompting.

        Returns a fixed message when the crate has no features, and a fixed
        fallback string when generation fails or yields nothing.
        """
        try:
            if not crate.features:
                return "No features documented for this crate."

            # Format features with their dependencies
            feature_lines = []
            for f in crate.features[:8]:  # Limit to 8 features for context size
                feature_name = f.get("name", "")
                deps = f.get("dependencies", [])
                deps_str = ", ".join(deps) if deps else "none"
                feature_lines.append(f"- {feature_name} (dependencies: {deps_str})\n")
            feature_text = "".join(feature_lines)

            prompt = (
                f"<|system|>You are a Rust programming expert analyzing crate features.\n"
                f"<|user|>For the Rust crate `{crate.name}`, explain these features and what functionality they provide:\n\n"
                f"{feature_text}\n\n"
                f"Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
                f"<|end|>"
            )

            # Use moderate temperature for informative but natural explanation
            result = self.run_llama(prompt, temp=0.2, max_tokens=350)
            return result or "Feature summary not available."
        except Exception as e:
            logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
            return "Feature summary not available."

    def classify_use_case(self, crate: "CrateMetadata", readme_summary: str) -> str:
        """Classify the use case of a crate with rich context.

        Returns one of ``CATEGORIES``; ``"Unknown"`` is returned when the
        model names no known category, when validation never succeeds, or
        when an error occurs.
        """
        try:
            # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
            available_prompt_tokens = self.config.model_token_limit - 200  # Reserve for response

            joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
            # Skip entries without a crate id so the join below cannot fail.
            key_deps = [
                dep.get("crate_id")
                for dep in (crate.dependencies or [])[:5]
                if dep.get("kind") == "normal" and dep.get("crate_id")
            ]
            key_deps_str = ", ".join(key_deps) if key_deps else "None"

            # Adaptively truncate different sections based on importance
            token_budget = available_prompt_tokens - 400  # Reserve tokens for prompt template

            # Allocate different percentages to each section
            desc_tokens = int(token_budget * 0.2)
            readme_tokens = int(token_budget * 0.6)

            desc = self.truncate_content(crate.description, desc_tokens)
            readme_summary = self.smart_truncate(readme_summary, readme_tokens)

            # Few-shot prompting with examples
            prompt = (
                f"<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
                f"<|user|>\n"
                f"# Example 1\n"
                f"Crate: `tokio`\n"
                f"Description: An asynchronous runtime for the Rust programming language\n"
                f"Keywords: async, runtime, futures\n"
                f"Key Dependencies: mio, bytes, parking_lot\n"
                f"Category: Networking\n\n"

                f"# Example 2\n"
                f"Crate: `serde`\n"
                f"Description: A generic serialization/deserialization framework\n"
                f"Keywords: serde, serialization\n"
                f"Key Dependencies: serde_derive\n"
                f"Category: Serialization\n\n"

                f"# Crate to Classify\n"
                f"Crate: `{crate.name}`\n"
                f"Description: {desc}\n"
                f"Keywords: {joined}\n"
                f"README Summary: {readme_summary}\n"
                f"Key Dependencies: {key_deps_str}\n\n"
                f"Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
                f"<|end|>"
            )

            # Validate classification with retry. The classification task
            # makes the cleaning step reduce e.g. "Category: Networking" to
            # the bare label that validate_classification expects.
            result = self.validate_and_retry(
                prompt,
                validation_func=self.validate_classification,
                temp=0.1,
                max_tokens=20,
                task="classification"
            )

            return result or "Unknown"
        except Exception as e:
            logging.error(f"Classification failed for {crate.name}: {str(e)}")
            return "Unknown"

    def generate_factual_pairs(self, crate: "CrateMetadata") -> str:
        """Generate factual/counterfactual pairs with retry and validation.

        ``crate`` may carry a ``readme_summary`` attribute; when present it is
        included in the prompt. Returns the cleaned pairs, or a fixed failure
        message when generation or validation fails.
        """
        try:
            desc = self.truncate_content(crate.description, 300)
            readme_summary = self.truncate_content(getattr(crate, 'readme_summary', '') or '', 300)
            feature_names = ', '.join([f.get('name', '') for f in (crate.features or [])[:5]])

            prompt = (
                f"<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
                f"Factual statements must be true. Counterfactuals should be plausible but incorrect - "
                f"make them subtle and convincing rather than simple negations.\n"
                f"<|user|>\n"
                f"Crate: {crate.name}\n"
                f"Description: {desc}\n"
                f"Repo: {crate.repository}\n"
                f"README Summary: {readme_summary}\n"
                f"Key Features: {feature_names}\n\n"
                f"Format each pair as:\n"
                f"✅ Factual: [true statement about the crate]\n"
                f"❌ Counterfactual: [plausible but false statement]\n\n"
                f"Create exactly 5 pairs.\n"
                f"<|end|>"
            )

            # Use validation for retry
            result = self.validate_and_retry(
                prompt,
                validation_func=self.validate_factual_pairs,
                temp=0.6,
                max_tokens=500,
                task="factual_pairs"
            )

            return result or "Factual pairs generation failed."
        except Exception as e:
            logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
            return "Factual pairs generation failed."

    def score_crate(self, crate: "CrateMetadata") -> float:
        """Calculate a score for the crate based on various metrics.

        The score is downloads / 1000, plus 10 per GitHub star, plus the
        character length of the README (truncated to 1000 tokens) / 500,
        rounded to two decimals. Missing (``None``) metrics count as zero.
        """
        score = ((crate.downloads or 0) / 1000) + ((crate.github_stars or 0) * 10)
        score += len(self.truncate_content(crate.readme, 1000)) / 500
        return round(score, 2)