rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +47 -2
- rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
- rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
@@ -2,32 +2,61 @@
 import re
 import time
 import logging
-import
-
-from
+from typing import TypedDict, Union
+
+from collections.abc import Callable
+
 from .config import PipelineConfig, CrateMetadata, EnrichedCrate
 
+# Optional imports with fallbacks
+_ai_dependencies_available = True
+try:
+    import tiktoken
+    from llama_cpp import Llama
+except ImportError as e:
+    logging.warning(f"AI dependencies not available: {e}")
+    tiktoken = None  # type: ignore[assignment]
+    Llama = None  # type: ignore[assignment,misc]
+    _ai_dependencies_available = False
+
+
+class Section(TypedDict):
+    heading: str
+    content: str
+    priority: int
+
+
 class LLMEnricher:
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig) -> None:
+        if not _ai_dependencies_available:
+            raise ImportError(
+                "AI dependencies (tiktoken, llama_cpp) are not available. "
+                "Please install them to use LLMEnricher."
+            )
+
         self.config = config
-        self.tokenizer = tiktoken.get_encoding("cl100k_base")
+        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # type: ignore
         self.model = self._load_model()
-
-    def _load_model(self):
+
+    def _load_model(self) -> None:
         """Optimized for GCP g2-standard-4 with L4 GPU (24GB VRAM)"""
-
+        if not _ai_dependencies_available:
+            raise ImportError("Cannot load model: AI dependencies not available")
+
+        return Llama(  # type: ignore
             model_path=self.config.model_path,
-            n_ctx=4096,
-            n_batch=1024,
-
-
-
-
-
-
+            n_ctx=4096,  # Larger context for L4's 24GB VRAM
+            n_batch=1024,  # Larger batch size for better throughput
+            # Load ALL layers on GPU (L4 has plenty VRAM)
+            n_gpu_layers=-1,
+            n_threads=4,  # Match the 4 vCPUs
+            n_threads_batch=4,  # Parallel batch processing
+            use_mmap=True,  # Memory-mapped files for efficiency
+            use_mlock=True,  # Lock model in memory
+            rope_scaling_type=1,  # RoPE scaling for longer contexts
             rope_freq_base=10000.0,  # Base frequency for RoPE
-            flash_attn=True,
-            verbose=False
+            flash_attn=True,  # Enable flash attention if available
+            verbose=False,  # Reduce logging overhead
         )
 
     def estimate_tokens(self, text: str) -> int:
@@ -37,7 +66,7 @@ class LLMEnricher:
         """Truncate content to fit within token limit"""
         paragraphs = content.split("\n\n")
         result, current_tokens = "", 0
-
+
         for para in paragraphs:
             tokens = len(self.tokenizer.encode(para))
             if current_tokens + tokens <= max_tokens:
@@ -51,120 +80,152 @@ class LLMEnricher:
         """Intelligently truncate content to preserve the most important parts"""
         if not content:
             return ""
-
+
         # If content is short enough, return it all
         if len(self.tokenizer.encode(content)) <= max_tokens:
             return content
-
+
         # Split into sections based on markdown headers
-        sections = []
-        current_section = {
-
+        sections: list[Section] = []
+        current_section: Section = {
+            "heading": "Introduction",
+            "content": "",
+            "priority": 10,
+        }
+
         for line in content.splitlines():
-            if re.match(r
+            if re.match(r"^#+\s+", line):  # It's a header
                 # Save previous section if not empty
                 if current_section["content"].strip():
                     sections.append(current_section)
-
+
                 # Create new section with appropriate priority
-                heading = re.sub(r
+                heading = re.sub(r"^#+\s+", "", line)
                 priority = 5  # Default priority
-
+
                 # Assign priority based on content type
-                if re.search(r
+                if re.search(r"\b(Union[usage, example]|getting started)\b", heading, re.I):
                     priority = 10
-                elif re.search(r
+                elif re.search(r"\b(Union[feature, overview]|about)\b", heading, re.I):
                     priority = 9
-                elif re.search(r
+                elif re.search(r"\b(Union[install, setup]|config)\b", heading, re.I):
                     priority = 8
-                elif re.search(r
+                elif re.search(r"\b(Union[api, interface])\b", heading, re.I):
                     priority = 7
-
-                current_section = {
+
+                current_section = {
+                    "heading": heading,
+                    "content": line + "\n",
+                    "priority": priority,
+                }
             else:
                 current_section["content"] += line + "\n"
-
+
             # Boost priority if code block is found
             if "```rust" in line or "```no_run" in line:
                 current_section["priority"] = max(current_section["priority"], 8)
-
+
         # Add the last section
         if current_section["content"].strip():
             sections.append(current_section)
-
+
         # Sort sections by priority (highest first)
         sections.sort(key=lambda x: x["priority"], reverse=True)
-
+
         # Build the result, respecting token limits
         result = ""
         tokens_used = 0
-
+
         for section in sections:
-            section_text = f
+            section_text = f'## {section["heading"]}\n{section["content"]}\n'
             section_tokens = len(self.tokenizer.encode(section_text))
-
+
             if tokens_used + section_tokens <= max_tokens:
                 result += section_text
                 tokens_used += section_tokens
             elif tokens_used < max_tokens - 100:  # If we can fit a truncated version
                 # Take what we can
                 remaining_tokens = max_tokens - tokens_used
-                truncated_text = self.tokenizer.decode(
+                truncated_text = self.tokenizer.decode(
+                    self.tokenizer.encode(section_text)[:remaining_tokens]
+                )
                 result += truncated_text
                 break
-
+
         return result
 
     def clean_output(self, output: str, task: str = "general") -> str:
         """Task-specific output cleaning"""
         if not output:
             return ""
-
+
         # Remove any remaining prompt artifacts
         output = output.split("<|end|>")[0].strip()
-
+
         if task == "classification":
             # For classification tasks, extract just the category
-            categories = [
-
+            categories = [
+                "AI",
+                "Database",
+                "Web Framework",
+                "Networking",
+                "Serialization",
+                "Utilities",
+                "DevTools",
+                "ML",
+                "Cryptography",
+                "Unknown",
+            ]
             for category in categories:
-                if re.search(
+                if re.search(
+                    r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
+                ):
                     return category
             return "Unknown"
-
+
         elif task == "factual_pairs":
             # For factual pairs, ensure proper formatting
-            pairs = []
-            facts = re.findall(r
-            counterfacts = re.findall(
-
+            pairs: list[str] = []
+            facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
+            counterfacts = re.findall(
+                r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
+            )
+
             # Pair them up
             for i in range(min(len(facts), len(counterfacts))):
-                pairs.append(
-
+                pairs.append(
+                    f"✅ Factual: {facts[i].strip()}\n"
+                    f"❌ Counterfactual: {counterfacts[i].strip()}"
+                )
+
             return "\n\n".join(pairs)
-
+
         else:
             # General cleaning - more permissive than before
             lines = [line.strip() for line in output.splitlines() if line.strip()]
             return "\n".join(lines)
 
-    def run_llama(
+    def run_llama(
+        self, prompt: str, temp: float = 0.2, max_tokens: int = 256
+    ) -> Union[str, None]:
         """Run the LLM with customizable parameters per task"""
         try:
             token_count = self.estimate_tokens(prompt)
             if token_count > self.config.prompt_token_margin:
                 logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
-                prompt = self.truncate_content(
-
+                prompt = self.truncate_content(
+                    prompt, self.config.prompt_token_margin - 100
+                )
+
             output = self.model(
                 prompt,
                 max_tokens=max_tokens,
                 temperature=temp,
-
+                # Stop at these tokens
+                stop=["<|end|>", "<|user|>", "<|system|>"],
             )
-
-            raw_text = output["choices"][0]["text"]
+
+            raw_text: str = output["choices"][0]["text"]  # type: ignore
             return self.clean_output(raw_text)
         except Exception as e:
             logging.error(f"Model inference failed: {str(e)}")
@@ -174,66 +235,102 @@ class LLMEnricher:
         self,
         prompt: str,
         validation_func: Callable[[str], bool],
-        temp: float = 0.2,
-
-
+        temp: float = 0.2,
+        max_tokens: int = 256,
+        retries: int = 4,  # Increased from 2 to 4 for better success rates
+    ) -> Union[str, None]:
         """Run LLM with validation and automatic retry on failure"""
         result = None
         for attempt in range(retries):
             try:
                 # More generous temperature adjustment for better variety
-
-
-
+                # 20% increases instead of 10%
+                adjusted_temp = temp * (1 + (attempt * 0.2))
+                result = self.run_llama(
+                    prompt, temp=adjusted_temp, max_tokens=max_tokens
+                )
+
                 # Validate the result
                 if result and validation_func(result):
                     return result
-
-                # If we get here, validation failed - use debug level for early
+
+                # If we get here, validation failed - use debug level for early
+                # attempts
                 if attempt == retries - 1:
-                    logging.debug(
+                    logging.debug(
+                        f"All {retries} validation attempts failed, "
+                        "using last available result."
+                    )
                 else:
-                    logging.debug(
-
+                    logging.debug(
+                        f"Validation failed on attempt {attempt + 1}/{retries}. "
+                        f"Retrying with adjusted temp={adjusted_temp:.2f}"
+                    )
+
                 # Only simplify prompt on later attempts (attempt 2+)
                 if attempt >= 2:
                     prompt = self.simplify_prompt(prompt)
-
+
             except Exception as e:
-                logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")
-
+                logging.error(f"Generation error on attempt {attempt + 1}: {str(e)}")
+
             # More generous backoff - give the model more time
             time.sleep(2.0 + (attempt * 1.0))  # 2s, 3s, 4s, 5s delays
-
-        # If we exhausted all retries, return the last result even if not
-
+
+        # If we exhausted all retries, return the last result even if not
+        # perfect
+        return result if "result" in locals() else None
 
     def simplify_prompt(self, prompt: str) -> str:
         """Simplify a prompt by removing examples and reducing context"""
         # Remove few-shot examples
-        prompt = re.sub(
-
+        prompt = re.sub(
+            r"# Example [0-9].*?(?=# Crate to Classify|\Z)",
+            "",
+            prompt,
+            flags=re.DOTALL,
+        )
+
         # Make instructions more direct
-        prompt = re.sub(
-
+        prompt = re.sub(
+            r"<\|system\|>.*?<\|user\|>",
+            "<|system|>Be concise.\n<|user|>",
+            prompt,
+            flags=re.DOTALL,
+        )
+
         return prompt
 
     def validate_classification(self, result: str) -> bool:
         """Ensure a valid category was returned"""
         if not result:
             return False
-        valid_categories = [
-
-
+        valid_categories = [
+            "AI",
+            "Database",
+            "Web Framework",
+            "Networking",
+            "Serialization",
+            "Utilities",
+            "DevTools",
+            "ML",
+            "Cryptography",
+            "Unknown",
+        ]
+        return any(
+            category.lower() == result.strip().lower() for category in valid_categories
+        )
 
     def validate_factual_pairs(self, result: str) -> bool:
         """Ensure exactly 5 factual/counterfactual pairs exist"""
         if not result:
             return False
-
-        facts = re.findall(r
-        counterfacts = re.findall(
-
+
+        facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", result, re.DOTALL)
+        counterfacts = re.findall(
+            r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", result, re.DOTALL
+        )
+
         return len(facts) >= 3 and len(counterfacts) >= 3  # At least 3 pairs
 
     def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
@@ -241,31 +338,29 @@ class LLMEnricher:
         # Convert CrateMetadata to EnrichedCrate
         enriched_dict = crate.__dict__.copy()
         enriched = EnrichedCrate(**enriched_dict)
-
+
         try:
             # Generate README summary first
             if crate.readme:
                 readme_content = self.smart_truncate(crate.readme, 2000)
                 prompt = (
-
-
-                    f"
+                    "<|system|>Extract key features from README.\n"
+                    "<|user|>Summarize key aspects of this Rust crate from its "
+                    f"README:\n{readme_content}\n"
+                    "<|end|>"
                 )
                 enriched.readme_summary = self.validate_and_retry(
-                    prompt,
-
-
-                    max_tokens=300 )
-
+                    prompt, lambda x: len(x) > 50, temp=0.3, max_tokens=300
+                )
+
             # Generate other enrichments
             enriched.feature_summary = self.summarize_features(crate)
             enriched.use_case = self.classify_use_case(
-                crate,
-                enriched.readme_summary or ""
+                crate, enriched.readme_summary or ""
             )
             enriched.score = self.score_crate(crate)
             enriched.factual_counterfactual = self.generate_factual_pairs(crate)
-
+
             return enriched
         except Exception as e:
             logging.error(f"Failed to enrich {crate.name}: {str(e)}")
@@ -276,23 +371,24 @@ class LLMEnricher:
         try:
             if not crate.features:
                 return "No features documented for this crate."
-
+
             # Format features with their dependencies
             feature_text = ""
-            for
-                feature_name = f.get("name", "")
-                deps = f.get("dependencies", [])
+            for feature_name, deps in list(crate.features.items())[:8]:
                 deps_str = ", ".join(deps) if deps else "none"
                 feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
-
+
             prompt = (
-
-
+                "<|system|>You are a Rust programming expert analyzing crate "
+                "features.\n"
+                f"<|user|>For the Rust crate `{crate.name}`, explain these "
+                "features and what functionality they provide:\n\n"
                 f"{feature_text}\n\n"
-
-
+                "Provide a concise explanation of each feature's purpose and "
+                "when a developer would enable it.\n"
+                "<|end|>"
             )
-
+
             # Use moderate temperature for informative but natural explanation
             result = self.run_llama(prompt, temp=0.2, max_tokens=350)
             return result or "Feature summary not available."
@@ -302,59 +398,67 @@ class LLMEnricher:
 
     def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
         """Classify the use case of a crate with rich context"""
-        try:
-
-
+        try:
+            # Calculate available tokens for prompt
+            available_prompt_tokens = self.config.model_token_limit - 200
+
             joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
-            key_deps = [
-
-
-
+            key_deps = [
+                dep.get("crate_id")
+                for dep in crate.dependencies[:5]
+                if dep.get("kind") == "normal" and dep.get("crate_id")
+            ]
+            key_deps_str = (
+                ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
+            )
+
             # Adaptively truncate different sections based on importance
-            token_budget = available_prompt_tokens - 400
-
+            token_budget = available_prompt_tokens - 400
+
             # Allocate different percentages to each section
             desc_tokens = int(token_budget * 0.2)
             readme_tokens = int(token_budget * 0.6)
-
+
             desc = self.truncate_content(crate.description, desc_tokens)
             readme_summary = self.smart_truncate(readme_summary, readme_tokens)
-
+
             # Few-shot prompting with examples
             prompt = (
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                "<|system|>You are a Rust expert classifying crates into the "
+                "most appropriate category.\n"
+                "<|user|>\n"
+                "# Example 1\n"
+                "Crate: `tokio`\n"
+                "Description: An asynchronous runtime for the Rust programming "
+                "language\n"
+                "Keywords: async, runtime, futures\n"
+                "Key Dependencies: mio, bytes, parking_lot\n"
+                "Category: Networking\n\n"
+                "# Example 2\n"
+                "Crate: `serde`\n"
+                "Description: A generic serialization/deserialization framework\n"
+                "Keywords: serde, serialization\n"
+                "Key Dependencies: serde_derive\n"
+                "Category: Serialization\n\n"
+                "# Crate to Classify\n"
                 f"Crate: `{crate.name}`\n"
                 f"Description: {desc}\n"
                 f"Keywords: {joined}\n"
                 f"README Summary: {readme_summary}\n"
                 f"Key Dependencies: {key_deps_str}\n\n"
-
-
+                "Category (pick only one): [AI, Database, Web Framework, "
+                "Networking, Serialization, Utilities, DevTools, ML, "
+                "Cryptography, Unknown]\n"
+                "<|end|>"
             )
-
+            # Validate classification with retry - more generous parameters
             result = self.validate_and_retry(
-                prompt,
+                prompt,
                 validation_func=self.validate_classification,
-                temp=0.2,
-                max_tokens=50
+                temp=0.2,
+                max_tokens=50,
             )
-
+
             return result or "Unknown"
         except Exception as e:
             logging.error(f"Classification failed for {crate.name}: {str(e)}")
@@ -364,32 +468,36 @@ class LLMEnricher:
         """Generate factual/counterfactual pairs with retry and validation"""
         try:
             desc = self.truncate_content(crate.description, 300)
-            readme_summary = self.truncate_content(
-
+            readme_summary = self.truncate_content(
+                getattr(crate, "readme_summary", "") or "", 300
+            )
+            features = ", ".join(list(crate.features.keys())[:5])
+
             prompt = (
-
-
-
-
+                "<|system|>Create exactly 5 factual/counterfactual pairs for "
+                "the Rust crate. Factual statements must be true. "
+                "Counterfactuals should be plausible but incorrect - make them "
+                "subtle and convincing rather than simple negations.\n"
+                "<|user|>\n"
                 f"Crate: {crate.name}\n"
                 f"Description: {desc}\n"
                 f"Repo: {crate.repository}\n"
                 f"README Summary: {readme_summary}\n"
-                f"Key Features: {
-
-
-
-
-
+                f"Key Features: {features}\n\n"
+                "Format each pair as:\n"
+                "✅ Factual: [true statement about the crate]\n"
+                "❌ Counterfactual: [plausible but false statement]\n\n"
+                "Create exactly 5 pairs.\n"
+                "<|end|>"
             )
-
+            # Use validation for retry - more generous parameters
             result = self.validate_and_retry(
-                prompt,
-                validation_func=self.validate_factual_pairs,
-                temp=0.7,
-                max_tokens=800
+                prompt,
+                validation_func=self.validate_factual_pairs,
+                temp=0.7,
+                max_tokens=800,
             )
-
+
             return result or "Factual pairs generation failed."
         except Exception as e:
             logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
@@ -401,71 +509,72 @@ class LLMEnricher:
             score += len(self.truncate_content(crate.readme, 1000)) / 500
         return round(score, 2)
 
-    def batch_process_prompts(
+    def batch_process_prompts(
+        self, prompts: list[tuple[str, float, int]], batch_size: int = 4
+    ) -> list[Union[str, None]]:
         """
-        L4 GPU-optimized batch processing for multiple prompts
-        Processes prompts in batches to maximize GPU utilization
-
+        L4 GPU-optimized batch processing for multiple prompts.
+        Processes prompts in batches to maximize GPU utilization.
+
         Args:
             prompts: List of (prompt, temperature, max_tokens) tuples
-            batch_size: Number of prompts to process simultaneously
+            batch_size: Number of prompts to process simultaneously
         """
-        results = []
-
+        results: list[Union[str, None]] = []
+
         # Process in batches optimized for L4's capabilities
         for i in range(0, len(prompts), batch_size):
-            batch = prompts[i:i + batch_size]
-            batch_results = []
-
+            batch = prompts[i : i + batch_size]
+            batch_results: list[Union[str, None]] = []
+
             for prompt, temp, max_tokens in batch:
                 try:
                     # Prepare prompt with context preservation
-                    if self.estimate_tokens(prompt) > 3500:
+                    if self.estimate_tokens(prompt) > 3500:
                         prompt = self.smart_truncate(prompt, 3500)
-
+
                     # Use optimized parameters for L4
                     output = self.model(
                         prompt,
                         max_tokens=max_tokens,
                         temperature=temp,
-                        top_p=0.95,
-                        repeat_penalty=1.1,
+                        top_p=0.95,
+                        repeat_penalty=1.1,
                         stop=["<|end|>", "<|user|>", "<|system|>"],
-                        echo=False,
-                        stream=False
+                        echo=False,
+                        stream=False,
                     )
-
-
+
+                    # The type checker incorrectly infers a stream response
+                    choice_text: str = output["choices"][0]["text"]  # type: ignore
+                    result = self.clean_output(choice_text)
                     batch_results.append(result)
-
                 except Exception as e:
-                    logging.
+                    logging.error(f"LLM batch processing error: {e}", exc_info=True)
                     batch_results.append(None)
-
+
             results.extend(batch_results)
-
-            # Small delay between batches to prevent thermal throttling
-            if i + batch_size < len(prompts):
-                time.sleep(0.1)
-
+
         return results
 
-    def smart_context_management(
+    def smart_context_management(
+        self, context_history: list[str], new_prompt: str
+    ) -> str:
         """
-        Intelligent context management for prefix cache optimization
-        Maximizes cache hits by preserving common context patterns
+        Intelligent context management for prefix cache optimization.
+        Maximizes cache hits by preserving common context patterns.
         """
         # Calculate available tokens for context
         base_tokens = self.estimate_tokens(new_prompt)
         available_context = 4000 - base_tokens  # Leave buffer for response
-
+
         if available_context <= 0:
             return new_prompt
-
+
         # Build context from most recent and most relevant history
-        context_parts = []
+        context_parts: list[str] = []
         tokens_used = 0
-
+
         # Prioritize recent context (better cache hits)
         for context in reversed(context_history[-5:]):  # Last 5 contexts
             context_tokens = self.estimate_tokens(context)
@@ -480,10 +589,10 @@ class LLMEnricher:
             if truncated:
                 context_parts.insert(0, truncated)
                 break
-
+
         # Combine context with new prompt
         if context_parts:
             full_context = "\n\n---\n\n".join(context_parts)
             return f"{full_context}\n\n---\n\n{new_prompt}"
-
+
         return new_prompt