rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/ai_processing.py
@@ -2,24 +2,41 @@
 import re
 import time
 import logging
-import tiktoken
-from typing import Callable, Optional
-from llama_cpp import Llama
+from typing import Callable, Optional, Any, Dict, List
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate

+# Optional imports with fallbacks
+_ai_dependencies_available = True
+try:
+    import tiktoken
+    from llama_cpp import Llama
+except ImportError as e:
+    logging.warning(f"AI dependencies not available: {e}")
+    tiktoken = None
+    Llama = None
+    _ai_dependencies_available = False
+
+
 class LLMEnricher:
     def __init__(self, config: PipelineConfig):
+        if not _ai_dependencies_available:
+            raise ImportError("AI dependencies (tiktoken, llama_cpp) are not available. Please install them to use LLMEnricher.")
+
         self.config = config
-        self.tokenizer = tiktoken.get_encoding("cl100k_base")
+        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # type: ignore
         self.model = self._load_model()
-
+
     def _load_model(self):
         """Optimized for GCP g2-standard-4 with L4 GPU (24GB VRAM)"""
-        return Llama(
+        if not _ai_dependencies_available:
+            raise ImportError("Cannot load model: AI dependencies not available")
+
+        return Llama(  # type: ignore
            model_path=self.config.model_path,
            n_ctx=4096,  # Larger context for L4's 24GB VRAM
            n_batch=1024,  # Larger batch size for better throughput
-           n_gpu_layers=-1,  # Load ALL layers on GPU (L4 has plenty VRAM)
+           # Load ALL layers on GPU (L4 has plenty VRAM)
+           n_gpu_layers=-1,
            n_threads=4,  # Match the 4 vCPUs
            n_threads_batch=4,  # Parallel batch processing
            use_mmap=True,  # Memory-mapped files for efficiency
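Note: the hunk above makes tiktoken and llama_cpp optional so the module can still be imported when the AI extras are missing, and LLMEnricher fails fast with a clear error instead of a later NameError. A minimal standalone sketch of the same guard pattern; the require_ai_dependencies helper is illustrative and not part of the package:

import logging

_ai_dependencies_available = True
try:
    import tiktoken                    # tokenizer used for prompt budgeting
    from llama_cpp import Llama        # local LLM backend
except ImportError as exc:
    logging.warning("AI dependencies not available: %s", exc)
    tiktoken = None
    Llama = None
    _ai_dependencies_available = False


def require_ai_dependencies() -> None:
    # Call this at the top of any entry point that needs the LLM.
    if not _ai_dependencies_available:
        raise ImportError(
            "tiktoken and llama-cpp-python are required for AI enrichment")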
@@ -37,7 +54,7 @@ class LLMEnricher:
         """Truncate content to fit within token limit"""
         paragraphs = content.split("\n\n")
         result, current_tokens = "", 0
-
+
         for para in paragraphs:
             tokens = len(self.tokenizer.encode(para))
             if current_tokens + tokens <= max_tokens:
@@ -51,27 +68,33 @@ class LLMEnricher:
         """Intelligently truncate content to preserve the most important parts"""
         if not content:
             return ""
-
+
         # If content is short enough, return it all
         if len(self.tokenizer.encode(content)) <= max_tokens:
             return content
-
+
         # Split into sections based on markdown headers
         sections = []
-        current_section = {
-            "heading": "Introduction", "content": "", "priority": 10}
+        current_section = {
+            "heading": "Introduction",
+            "content": "",
+            "priority": 10}
+
         for line in content.splitlines():
             if re.match(r'^#+\s+', line):  # It's a header
                 # Save previous section if not empty
                 if current_section["content"].strip():
                     sections.append(current_section)
-
+
                 # Create new section with appropriate priority
                 heading = re.sub(r'^#+\s+', '', line)
                 priority = 5  # Default priority
-
+
                 # Assign priority based on content type
-                if re.search(r'\b(usage|example|getting started)\b', heading, re.I):
+                if re.search(
+                        r'\b(usage|example|getting started)\b',
+                        heading,
+                        re.I):
                     priority = 10
                 elif re.search(r'\b(feature|overview|about)\b', heading, re.I):
                     priority = 9
@@ -79,91 +102,122 @@ class LLMEnricher:
                     priority = 8
                 elif re.search(r'\b(api|interface)\b', heading, re.I):
                     priority = 7
-
-                current_section = {"heading": heading, "content": line + "\n", "priority": priority}
+
+                current_section = {
+                    "heading": heading,
+                    "content": line + "\n",
+                    "priority": priority}
             else:
                 current_section["content"] += line + "\n"
-
+
             # Boost priority if code block is found
             if "```rust" in line or "```no_run" in line:
-                current_section["priority"] = max(
-                    current_section["priority"], 8)
+                current_section["priority"] = max(
+                    current_section["priority"], 8)
+
         # Add the last section
         if current_section["content"].strip():
             sections.append(current_section)
-
+
         # Sort sections by priority (highest first)
         sections.sort(key=lambda x: x["priority"], reverse=True)
-
+
         # Build the result, respecting token limits
         result = ""
         tokens_used = 0
-
+
         for section in sections:
             section_text = f"## {section['heading']}\n{section['content']}\n"
             section_tokens = len(self.tokenizer.encode(section_text))
-
+
             if tokens_used + section_tokens <= max_tokens:
                 result += section_text
                 tokens_used += section_tokens
             elif tokens_used < max_tokens - 100:  # If we can fit a truncated version
                 # Take what we can
                 remaining_tokens = max_tokens - tokens_used
-                truncated_text = self.tokenizer.decode(self.tokenizer.encode(section_text)[:remaining_tokens])
+                truncated_text = self.tokenizer.decode(
+                    self.tokenizer.encode(section_text)[:remaining_tokens])
                 result += truncated_text
                 break
-
+
         return result

     def clean_output(self, output: str, task: str = "general") -> str:
         """Task-specific output cleaning"""
         if not output:
             return ""
-
+
         # Remove any remaining prompt artifacts
         output = output.split("<|end|>")[0].strip()
-
+
         if task == "classification":
             # For classification tasks, extract just the category
-            categories = [
-                "AI", "Database", "Web Framework", "Networking", "Serialization", "Utilities", "DevTools", "ML", "Cryptography", "Unknown"]
+            categories = [
+                "AI",
+                "Database",
+                "Web Framework",
+                "Networking",
+                "Serialization",
+                "Utilities",
+                "DevTools",
+                "ML",
+                "Cryptography",
+                "Unknown"]
             for category in categories:
-                if re.search(r'\b' + re.escape(category) + r'\b', output, re.IGNORECASE):
+                if re.search(
+                        r'\b' +
+                        re.escape(category) +
+                        r'\b',
+                        output,
+                        re.IGNORECASE):
                     return category
             return "Unknown"
-
+
         elif task == "factual_pairs":
             # For factual pairs, ensure proper formatting
             pairs = []
-            facts = re.findall(
-                r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', output, re.DOTALL)
-            counterfacts = re.findall(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', output, re.DOTALL)
+            facts = re.findall(
+                r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)',
+                output,
+                re.DOTALL)
+            counterfacts = re.findall(
+                r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', output, re.DOTALL)
+
             # Pair them up
             for i in range(min(len(facts), len(counterfacts))):
-                pairs.append(
-                    f"✅ Factual: {facts[i].strip()}\n❌ Counterfactual: {counterfacts[i].strip()}")
+                pairs.append(
+                    f"✅ Factual: {
+                        facts[i].strip()}\n❌ Counterfactual: {
+                        counterfacts[i].strip()}")
+
             return "\n\n".join(pairs)
-
+
         else:
             # General cleaning - more permissive than before
-            lines = [line.strip() for line in output.splitlines() if line.strip()]
+            lines = [line.strip()
+                     for line in output.splitlines() if line.strip()]
             return "\n".join(lines)

-    def run_llama(self, prompt: str, temp: float = 0.2, max_tokens: int = 256) -> Optional[str]:
+    def run_llama(self, prompt: str, temp: float = 0.2,
+                  max_tokens: int = 256) -> Optional[str]:
         """Run the LLM with customizable parameters per task"""
         try:
             token_count = self.estimate_tokens(prompt)
             if token_count > self.config.prompt_token_margin:
-                logging.warning(
-                    f"Prompt too long ({token_count} tokens). Truncating.")
-                prompt = self.truncate_content(prompt, self.config.prompt_token_margin - 100)
+                logging.warning(
+                    f"Prompt too long ({token_count} tokens). Truncating.")
+                prompt = self.truncate_content(
+                    prompt, self.config.prompt_token_margin - 100)
+
             output = self.model(
                 prompt,
                 max_tokens=max_tokens,
                 temperature=temp,
-                stop=["<|end|>", "<|user|>", "<|system|>"]  # Stop at these tokens
+                # Stop at these tokens
+                stop=["<|end|>", "<|user|>", "<|system|>"]
             )
-
+
             raw_text = output["choices"][0]["text"]
             return self.clean_output(raw_text)
         except Exception as e:
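Note: the smart_truncate changes above split a README on markdown headers, score each section, and pack the highest-priority sections into a token budget. A condensed sketch of that idea, using word counts as a stand-in for the tiktoken-based counts, so this is an approximation rather than the pipeline's implementation:

import re

def smart_truncate(content: str, max_tokens: int) -> str:
    sections = []
    current = {"heading": "Introduction", "content": "", "priority": 10}
    for line in content.splitlines():
        if re.match(r"^#+\s+", line):  # markdown header starts a new section
            if current["content"].strip():
                sections.append(current)
            heading = re.sub(r"^#+\s+", "", line)
            priority = 10 if re.search(r"\b(usage|example)\b", heading, re.I) else 5
            current = {"heading": heading, "content": line + "\n", "priority": priority}
        else:
            current["content"] += line + "\n"
    if current["content"].strip():
        sections.append(current)

    # Highest-priority sections are packed first until the budget runs out.
    sections.sort(key=lambda s: s["priority"], reverse=True)
    result, used = "", 0
    for s in sections:
        text = f"## {s['heading']}\n{s['content']}\n"
        tokens = len(text.split())  # word count stands in for a real tokenizer
        if used + tokens <= max_tokens:
            result += text
            used += tokens
    return result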
@@ -174,7 +228,7 @@ class LLMEnricher:
         self,
         prompt: str,
         validation_func: Callable[[str], bool],
-        temp: float = 0.2,
+        temp: float = 0.2, max_tokens: int = 256,
         retries: int = 4  # Increased from 2 to 4 for better success rates
     ) -> Optional[str]:
         """Run LLM with validation and automatic retry on failure"""
@@ -182,58 +236,89 @@ class LLMEnricher:
         for attempt in range(retries):
             try:
                 # More generous temperature adjustment for better variety
-                adjusted_temp = temp * (1 + (attempt * 0.2))  # 20% increases instead of 10%
-                result = self.run_llama(
-                    prompt, temp=adjusted_temp, max_tokens=max_tokens)
+                # 20% increases instead of 10%
+                adjusted_temp = temp * (1 + (attempt * 0.2))
+                result = self.run_llama(
+                    prompt, temp=adjusted_temp, max_tokens=max_tokens)
+
                 # Validate the result
                 if result and validation_func(result):
                     return result
-
-                # If we get here, validation failed - use debug level for early attempts
+
+                # If we get here, validation failed - use debug level for early
+                # attempts
                 if attempt == retries - 1:
-                    logging.debug(f"All {retries} validation attempts failed, using last available result.")
+                    logging.debug(
+                        f"All {retries} validation attempts failed, using last available result.")
                 else:
-                    logging.debug(
-                        f"Validation failed on attempt {attempt + 1}/{retries}. Retrying with adjusted temp={adjusted_temp:.2f}")
+                    logging.debug(
+                        f"Validation failed on attempt {
+                            attempt + 1}/{retries}. Retrying with adjusted temp={
+                            adjusted_temp:.2f}")
+
                 # Only simplify prompt on later attempts (attempt 2+)
                 if attempt >= 2:
                     prompt = self.simplify_prompt(prompt)
-
+
             except Exception as e:
-                logging.error(
-                    f"Generation error on attempt {attempt + 1}: {str(e)}")
+                logging.error(
+                    f"Generation error on attempt {
+                        attempt +
+                        1}: {
+                        str(e)}")
+
                 # More generous backoff - give the model more time
                 time.sleep(2.0 + (attempt * 1.0))  # 2s, 3s, 4s, 5s delays
-
-        # If we exhausted all retries, return the last result even if not perfect
+
+        # If we exhausted all retries, return the last result even if not
+        # perfect
         return result if 'result' in locals() else None

     def simplify_prompt(self, prompt: str) -> str:
         """Simplify a prompt by removing examples and reducing context"""
         # Remove few-shot examples
-        prompt = re.sub(
-            r'# Example [0-9].*?(?=# Crate to Classify|\Z)', '', prompt, flags=re.DOTALL)
+        prompt = re.sub(
+            r'# Example [0-9].*?(?=# Crate to Classify|\Z)',
+            '',
+            prompt,
+            flags=re.DOTALL)
+
         # Make instructions more direct
-        prompt = re.sub(
-            r'<\|system\|>.*?<\|user\|>', '<|system|>Be concise.\n<|user|>', prompt, flags=re.DOTALL)
+        prompt = re.sub(
+            r'<\|system\|>.*?<\|user\|>',
+            '<|system|>Be concise.\n<|user|>',
+            prompt,
+            flags=re.DOTALL)
+
         return prompt

     def validate_classification(self, result: str) -> bool:
         """Ensure a valid category was returned"""
         if not result:
             return False
-        valid_categories = [
-            "AI", "Database", "Web Framework", "Networking", "Serialization", "Utilities", "DevTools", "ML", "Cryptography", "Unknown"]
-        return any(category.lower() == result.strip().lower() for category in valid_categories)
+        valid_categories = [
+            "AI",
+            "Database",
+            "Web Framework",
+            "Networking",
+            "Serialization",
+            "Utilities",
+            "DevTools",
+            "ML",
+            "Cryptography",
+            "Unknown"]
+        return any(category.lower() == result.strip().lower()
+                   for category in valid_categories)

     def validate_factual_pairs(self, result: str) -> bool:
         """Ensure exactly 5 factual/counterfactual pairs exist"""
         if not result:
             return False
-
+
         facts = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', result, re.DOTALL)
-        counterfacts = re.findall(
-            r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', result, re.DOTALL)
+        counterfacts = re.findall(
+            r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', result, re.DOTALL)
+
         return len(facts) >= 3 and len(counterfacts) >= 3  # At least 3 pairs

     def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
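Note: validate_and_retry (above) wraps generation in a validation loop, raising the sampling temperature by 20% per attempt and backing off after errors. A stripped-down sketch of that control flow; generate and validate are injected callables here, which is an assumption made for illustration rather than the class's actual API:

import time
from typing import Callable, Optional

def validate_and_retry(
    generate: Callable[[str, float], Optional[str]],
    validate: Callable[[str], bool],
    prompt: str,
    temp: float = 0.2,
    retries: int = 4,
) -> Optional[str]:
    result = None
    for attempt in range(retries):
        adjusted_temp = temp * (1 + attempt * 0.2)  # 20% hotter on each retry
        try:
            result = generate(prompt, adjusted_temp)
            if result and validate(result):
                return result
        except Exception:
            time.sleep(2.0 + attempt)  # back off before the next attempt
    return result  # last attempt's output, even if it never validated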
@@ -241,31 +326,32 @@ class LLMEnricher:
         # Convert CrateMetadata to EnrichedCrate
         enriched_dict = crate.__dict__.copy()
         enriched = EnrichedCrate(**enriched_dict)
-
+
         try:
             # Generate README summary first
             if crate.readme:
                 readme_content = self.smart_truncate(crate.readme, 2000)
                 prompt = (
-                    "<|system|>Extract key features from README.\n"
+                    "<|system|>Extract key features from README.\n"
                     f"<|user|>Summarize key aspects of this Rust crate from its README:\n{readme_content}\n"
-                    "<|end|>"
+                    "<|end|>"
                 )
                 enriched.readme_summary = self.validate_and_retry(
-                    prompt,
-                    lambda x: len(x) > 50,
-                    temp=0.3,
-                    max_tokens=300
-                )
+                    prompt,
+                    lambda x: len(x) > 50,
+                    temp=0.3,
+                    max_tokens=300)
+
             # Generate other enrichments
             enriched.feature_summary = self.summarize_features(crate)
             enriched.use_case = self.classify_use_case(
-                crate,
+                crate,
                 enriched.readme_summary or ""
             )
             enriched.score = self.score_crate(crate)
-            enriched.factual_counterfactual = self.generate_factual_pairs(
-                crate)
+            enriched.factual_counterfactual = self.generate_factual_pairs(
+                crate)
+
             return enriched
         except Exception as e:
             logging.error(f"Failed to enrich {crate.name}: {str(e)}")
@@ -276,7 +362,7 @@ class LLMEnricher:
         try:
             if not crate.features:
                 return "No features documented for this crate."
-
+
             # Format features with their dependencies
             feature_text = ""
             for f in crate.features[:8]:  # Limit to 8 features for context size
@@ -284,77 +370,86 @@ class LLMEnricher:
                 deps = f.get("dependencies", [])
                 deps_str = ", ".join(deps) if deps else "none"
                 feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
-
+
             prompt = (
-                "<|system|>You are a Rust programming expert analyzing crate features.\n"
+                "<|system|>You are a Rust programming expert analyzing crate features.\n"
                 f"<|user|>For the Rust crate `{crate.name}`, explain these features and what functionality they provide:\n\n"
                 f"{feature_text}\n\n"
-                "Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
-                "<|end|>"
+                "Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
+                "<|end|>"
             )
-
+
             # Use moderate temperature for informative but natural explanation
             result = self.run_llama(prompt, temp=0.2, max_tokens=350)
             return result or "Feature summary not available."
         except Exception as e:
-            logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
+            logging.warning(
+                f"Feature summarization failed for {
+                    crate.name}: {
+                    str(e)}")
             return "Feature summary not available."

-    def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
+    def classify_use_case(
+            self,
+            crate: CrateMetadata,
+            readme_summary: str) -> str:
         """Classify the use case of a crate with rich context"""
         try:  # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
             available_prompt_tokens = self.config.model_token_limit - 200  # Reserve for response
-
-            joined = ", ".join(
-                crate.keywords[:10]) if crate.keywords else "None"
-            key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
-                        if dep.get("kind") == "normal" and dep.get("crate_id")]
-            key_deps_str = ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
+
+            joined = ", ".join(
+                crate.keywords[:10]) if crate.keywords else "None"
+            key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
+                        if dep.get("kind") == "normal" and dep.get("crate_id")]
+            key_deps_str = ", ".join(str(dep)
+                                     for dep in key_deps) if key_deps else "None"
+
             # Adaptively truncate different sections based on importance
-            token_budget = available_prompt_tokens - \
-                400  # Reserve tokens for prompt template
+            token_budget = available_prompt_tokens - \
+                400  # Reserve tokens for prompt template
+
             # Allocate different percentages to each section
             desc_tokens = int(token_budget * 0.2)
             readme_tokens = int(token_budget * 0.6)
-
+
             desc = self.truncate_content(crate.description, desc_tokens)
             readme_summary = self.smart_truncate(readme_summary, readme_tokens)
-
+
             # Few-shot prompting with examples
             prompt = (
-                "<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
-                "<|user|>\n"
-                "# Example 1\n"
-                "Crate: `tokio`\n"
-                "Description: An asynchronous runtime for the Rust programming language\n"
-                "Keywords: async, runtime, futures\n"
-                "Key Dependencies: mio, bytes, parking_lot\n"
-                "Category: Networking\n\n"
-
-                "# Example 2\n"
-                "Crate: `serde`\n"
-                "Description: A generic serialization/deserialization framework\n"
-                "Keywords: serde, serialization\n"
-                "Key Dependencies: serde_derive\n"
-                "Category: Serialization\n\n"
-
-                "# Crate to Classify\n"
+                "<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
+                "<|user|>\n"
+                "# Example 1\n"
+                "Crate: `tokio`\n"
+                "Description: An asynchronous runtime for the Rust programming language\n"
+                "Keywords: async, runtime, futures\n"
+                "Key Dependencies: mio, bytes, parking_lot\n"
+                "Category: Networking\n\n"
+
+                "# Example 2\n"
+                "Crate: `serde`\n"
+                "Description: A generic serialization/deserialization framework\n"
+                "Keywords: serde, serialization\n"
+                "Key Dependencies: serde_derive\n"
+                "Category: Serialization\n\n"
+
+                "# Crate to Classify\n"
                 f"Crate: `{crate.name}`\n"
                 f"Description: {desc}\n"
                 f"Keywords: {joined}\n"
                 f"README Summary: {readme_summary}\n"
                 f"Key Dependencies: {key_deps_str}\n\n"
-                "Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
-                "<|end|>"
+                "Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
+                "<|end|>"
             )
-
+            # Validate classification with retry - more generous parameters
             result = self.validate_and_retry(
-                prompt,
+                prompt,
                 validation_func=self.validate_classification,
                 temp=0.2,  # Increased from 0.1 for more variety
                 max_tokens=50  # Increased from 20 to allow more complete responses
             )
-
+
             return result or "Unknown"
         except Exception as e:
             logging.error(f"Classification failed for {crate.name}: {str(e)}")
@@ -364,35 +459,39 @@ class LLMEnricher:
         """Generate factual/counterfactual pairs with retry and validation"""
         try:
             desc = self.truncate_content(crate.description, 300)
-            readme_summary = self.truncate_content(
-                getattr(crate, 'readme_summary', '') or '', 300)
+            readme_summary = self.truncate_content(
+                getattr(crate, 'readme_summary', '') or '', 300)
+
             prompt = (
-                "<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
-                "Factual statements must be true. Counterfactuals should be plausible but incorrect - "
-                "make them subtle and convincing rather than simple negations.\n"
-                "<|user|>\n"
+                "<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
+                "Factual statements must be true. Counterfactuals should be plausible but incorrect - "
+                "make them subtle and convincing rather than simple negations.\n"
+                "<|user|>\n"
                 f"Crate: {crate.name}\n"
                 f"Description: {desc}\n"
                 f"Repo: {crate.repository}\n"
                 f"README Summary: {readme_summary}\n"
                 f"Key Features: {', '.join([f.get('name', '') for f in crate.features[:5]])}\n\n"
-                "Format each pair as:\n"
-                "✅ Factual: [true statement about the crate]\n"
-                "❌ Counterfactual: [plausible but false statement]\n\n"
-                "Create exactly 5 pairs.\n"
-                "<|end|>"
+                "Format each pair as:\n"
+                "✅ Factual: [true statement about the crate]\n"
+                "❌ Counterfactual: [plausible but false statement]\n\n"
+                "Create exactly 5 pairs.\n"
+                "<|end|>"
             )
-
+            # Use validation for retry - more generous parameters
             result = self.validate_and_retry(
-                prompt,
-                validation_func=self.validate_factual_pairs,
+                prompt,
+                validation_func=self.validate_factual_pairs,
                 temp=0.7,  # Increased from 0.6 for more creativity
                 max_tokens=800  # Increased from 500 for more complete responses
             )
-
+
             return result or "Factual pairs generation failed."
         except Exception as e:
-            logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
+            logging.error(
+                f"Exception in factual_pairs for {
+                    crate.name}: {
+                    str(e)}")
             return "Factual pairs generation failed."

     def score_crate(self, crate: CrateMetadata) -> float:
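Note: generate_factual_pairs and validate_factual_pairs (above) rely on the ✅/❌ markers to pull pairs back out of free-form model output. A small sketch of that extraction using the same regular expressions that appear in the diff; the helper names are illustrative:

import re

def extract_pairs(output: str) -> list[tuple[str, str]]:
    facts = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', output, re.DOTALL)
    counterfacts = re.findall(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', output, re.DOTALL)
    # zip stops at the shorter list, mirroring the min(len, len) loop in clean_output.
    return [(f.strip(), c.strip()) for f, c in zip(facts, counterfacts)]

def looks_complete(output: str) -> bool:
    # The pipeline accepts a response once at least 3 pairs are present.
    return len(extract_pairs(output)) >= 3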
@@ -401,28 +500,33 @@ class LLMEnricher:
             score += len(self.truncate_content(crate.readme, 1000)) / 500
         return round(score, 2)

-    def batch_process_prompts(self, prompts: list[tuple[str, float, int]], batch_size: int = 4) -> list[Optional[str]]:
+    def batch_process_prompts(self,
+                              prompts: list[tuple[str,
+                                                  float,
+                                                  int]],
+                              batch_size: int = 4) -> list[Optional[str]]:
         """
         L4 GPU-optimized batch processing for multiple prompts
         Processes prompts in batches to maximize GPU utilization
-
+
         Args:
             prompts: List of (prompt, temperature, max_tokens) tuples
             batch_size: Number of prompts to process simultaneously (tuned for L4)
         """
         results = []
-
+
         # Process in batches optimized for L4's capabilities
         for i in range(0, len(prompts), batch_size):
             batch = prompts[i:i + batch_size]
             batch_results = []
-
+
             for prompt, temp, max_tokens in batch:
                 try:
                     # Prepare prompt with context preservation
-                    if self.estimate_tokens(prompt) > 3500:  # Leave room for response
+                    if self.estimate_tokens(
+                            prompt) > 3500:  # Leave room for response
                         prompt = self.smart_truncate(prompt, 3500)
-
+
                     # Use optimized parameters for L4
                     output = self.model(
                         prompt,
@@ -434,23 +538,26 @@ class LLMEnricher:
                         echo=False,  # Don't echo input
                         stream=False  # Batch mode, no streaming
                     )
-
+
                     result = self.clean_output(output["choices"][0]["text"])
                     batch_results.append(result)
-
+
                 except Exception as e:
                     logging.warning(f"Batch processing error: {e}")
                     batch_results.append(None)
-
+
             results.extend(batch_results)
-
+
             # Small delay between batches to prevent thermal throttling
             if i + batch_size < len(prompts):
                 time.sleep(0.1)
-
+
         return results

-    def smart_context_management(self, context_history: list[str], new_prompt: str) -> str:
+    def smart_context_management(
+            self,
+            context_history: list[str],
+            new_prompt: str) -> str:
         """
         Intelligent context management for prefix cache optimization
         Maximizes cache hits by preserving common context patterns
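Note: batch_process_prompts (above) walks the prompt list in fixed-size batches, truncates oversized prompts, and pauses briefly between batches. A minimal sketch with the model call abstracted behind a run_one callable, which is an assumption made for brevity; the real method calls the llama_cpp model directly:

import time
from typing import Callable, Optional

def batch_process(
    run_one: Callable[[str, float, int], Optional[str]],
    prompts: list[tuple[str, float, int]],
    batch_size: int = 4,
) -> list[Optional[str]]:
    results: list[Optional[str]] = []
    for i in range(0, len(prompts), batch_size):
        for prompt, temp, max_tokens in prompts[i:i + batch_size]:
            try:
                results.append(run_one(prompt, temp, max_tokens))
            except Exception:
                results.append(None)  # keep positions aligned with the inputs
        if i + batch_size < len(prompts):
            time.sleep(0.1)  # brief pause between batches
    return results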
@@ -458,14 +565,14 @@ class LLMEnricher:
         # Calculate available tokens for context
         base_tokens = self.estimate_tokens(new_prompt)
         available_context = 4000 - base_tokens  # Leave buffer for response
-
+
         if available_context <= 0:
             return new_prompt
-
+
         # Build context from most recent and most relevant history
         context_parts = []
         tokens_used = 0
-
+
         # Prioritize recent context (better cache hits)
         for context in reversed(context_history[-5:]):  # Last 5 contexts
             context_tokens = self.estimate_tokens(context)
@@ -480,10 +587,10 @@ class LLMEnricher:
                 if truncated:
                     context_parts.insert(0, truncated)
                     break
-
+
         # Combine context with new prompt
         if context_parts:
             full_context = "\n\n---\n\n".join(context_parts)
             return f"{full_context}\n\n---\n\n{new_prompt}"
-
+
         return new_prompt
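Note: smart_context_management (above) packs as much recent history as fits into the remaining context budget, newest first, then prepends it to the new prompt with "---" separators. A compact sketch using a word-count estimator in place of the tokenizer-based estimate:

def build_context(history: list[str], new_prompt: str, window: int = 4000) -> str:
    def estimate(text: str) -> int:
        return len(text.split())  # stand-in for the tiktoken-based estimate

    budget = window - estimate(new_prompt)
    if budget <= 0:
        return new_prompt

    parts: list[str] = []
    used = 0
    for context in reversed(history[-5:]):  # most recent entries first
        tokens = estimate(context)
        if used + tokens <= budget:
            parts.insert(0, context)  # restore chronological order
            used += tokens
    if parts:
        return "\n\n---\n\n".join(parts) + "\n\n---\n\n" + new_prompt
    return new_prompt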