rust-crate-pipeline 1.2.5-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (32)
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +47 -2
  25. rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
  26. rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.5.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
@@ -2,32 +2,61 @@
  import re
  import time
  import logging
- import tiktoken
- from typing import Callable, Optional
- from llama_cpp import Llama
+ from typing import TypedDict, Union
+
+ from collections.abc import Callable
+
  from .config import PipelineConfig, CrateMetadata, EnrichedCrate

+ # Optional imports with fallbacks
+ _ai_dependencies_available = True
+ try:
+     import tiktoken
+     from llama_cpp import Llama
+ except ImportError as e:
+     logging.warning(f"AI dependencies not available: {e}")
+     tiktoken = None # type: ignore[assignment]
+     Llama = None # type: ignore[assignment,misc]
+     _ai_dependencies_available = False
+
+
+ class Section(TypedDict):
+     heading: str
+     content: str
+     priority: int
+
+
  class LLMEnricher:
-     def __init__(self, config: PipelineConfig):
+     def __init__(self, config: PipelineConfig) -> None:
+         if not _ai_dependencies_available:
+             raise ImportError(
+                 "AI dependencies (tiktoken, llama_cpp) are not available. "
+                 "Please install them to use LLMEnricher."
+             )
+
          self.config = config
-         self.tokenizer = tiktoken.get_encoding("cl100k_base")
+         self.tokenizer = tiktoken.get_encoding("cl100k_base") # type: ignore
          self.model = self._load_model()
-
-     def _load_model(self):
+
+     def _load_model(self) -> None:
          """Optimized for GCP g2-standard-4 with L4 GPU (24GB VRAM)"""
-         return Llama(
+         if not _ai_dependencies_available:
+             raise ImportError("Cannot load model: AI dependencies not available")
+
+         return Llama( # type: ignore
              model_path=self.config.model_path,
-             n_ctx=4096, # Larger context for L4's 24GB VRAM
-             n_batch=1024, # Larger batch size for better throughput
-             n_gpu_layers=-1, # Load ALL layers on GPU (L4 has plenty VRAM)
-             n_threads=4, # Match the 4 vCPUs
-             n_threads_batch=4, # Parallel batch processing
-             use_mmap=True, # Memory-mapped files for efficiency
-             use_mlock=True, # Lock model in memory
-             rope_scaling_type=1, # RoPE scaling for longer contexts
+             n_ctx=4096, # Larger context for L4's 24GB VRAM
+             n_batch=1024, # Larger batch size for better throughput
+             # Load ALL layers on GPU (L4 has plenty VRAM)
+             n_gpu_layers=-1,
+             n_threads=4, # Match the 4 vCPUs
+             n_threads_batch=4, # Parallel batch processing
+             use_mmap=True, # Memory-mapped files for efficiency
+             use_mlock=True, # Lock model in memory
+             rope_scaling_type=1, # RoPE scaling for longer contexts
              rope_freq_base=10000.0, # Base frequency for RoPE
-             flash_attn=True, # Enable flash attention if available
-             verbose=False # Reduce logging overhead
+             flash_attn=True, # Enable flash attention if available
+             verbose=False, # Reduce logging overhead
          )

      def estimate_tokens(self, text: str) -> int:
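With this change, rust_crate_pipeline.ai_processing imports cleanly even when the optional tiktoken/llama_cpp dependencies are missing, and LLMEnricher.__init__ raises ImportError instead of failing at import time. A minimal sketch of how calling code might guard against that, assuming a default-constructible PipelineConfig (illustrative, not the package's documented usage):

    # Hedged sketch: construct LLMEnricher only if the local-LLM extras are installed.
    from rust_crate_pipeline.config import PipelineConfig
    from rust_crate_pipeline.ai_processing import LLMEnricher

    config = PipelineConfig()  # assumption: defaults are acceptable; set model_path for a real run
    try:
        enricher = LLMEnricher(config)
    except ImportError:
        enricher = None  # fall back to non-LLM processing elsewhere in the pipeline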
@@ -37,7 +66,7 @@ class LLMEnricher:
          """Truncate content to fit within token limit"""
          paragraphs = content.split("\n\n")
          result, current_tokens = "", 0
-
+
          for para in paragraphs:
              tokens = len(self.tokenizer.encode(para))
              if current_tokens + tokens <= max_tokens:
@@ -51,120 +80,152 @@ class LLMEnricher:
          """Intelligently truncate content to preserve the most important parts"""
          if not content:
              return ""
-
+
          # If content is short enough, return it all
          if len(self.tokenizer.encode(content)) <= max_tokens:
              return content
-
+
          # Split into sections based on markdown headers
-         sections = []
-         current_section = {"heading": "Introduction", "content": "", "priority": 10}
-
+         sections: list[Section] = []
+         current_section: Section = {
+             "heading": "Introduction",
+             "content": "",
+             "priority": 10,
+         }
+
          for line in content.splitlines():
-             if re.match(r'^#+\s+', line): # It's a header
+             if re.match(r"^#+\s+", line): # It's a header
                  # Save previous section if not empty
                  if current_section["content"].strip():
                      sections.append(current_section)
-
+
                  # Create new section with appropriate priority
-                 heading = re.sub(r'^#+\s+', '', line)
+                 heading = re.sub(r"^#+\s+", "", line)
                  priority = 5 # Default priority
-
+
                  # Assign priority based on content type
-                 if re.search(r'\b(usage|example|getting started)\b', heading, re.I):
+                 if re.search(r"\b(Union[usage, example]|getting started)\b", heading, re.I):
                      priority = 10
-                 elif re.search(r'\b(feature|overview|about)\b', heading, re.I):
+                 elif re.search(r"\b(Union[feature, overview]|about)\b", heading, re.I):
                      priority = 9
-                 elif re.search(r'\b(install|setup|config)\b', heading, re.I):
+                 elif re.search(r"\b(Union[install, setup]|config)\b", heading, re.I):
                      priority = 8
-                 elif re.search(r'\b(api|interface)\b', heading, re.I):
+                 elif re.search(r"\b(Union[api, interface])\b", heading, re.I):
                      priority = 7
-
-                 current_section = {"heading": heading, "content": line + "\n", "priority": priority}
+
+                 current_section = {
+                     "heading": heading,
+                     "content": line + "\n",
+                     "priority": priority,
+                 }
              else:
                  current_section["content"] += line + "\n"
-
+
              # Boost priority if code block is found
              if "```rust" in line or "```no_run" in line:
                  current_section["priority"] = max(current_section["priority"], 8)
-
+
          # Add the last section
          if current_section["content"].strip():
              sections.append(current_section)
-
+
          # Sort sections by priority (highest first)
          sections.sort(key=lambda x: x["priority"], reverse=True)
-
+
          # Build the result, respecting token limits
          result = ""
          tokens_used = 0
-
+
          for section in sections:
-             section_text = f"## {section['heading']}\n{section['content']}\n"
+             section_text = f'## {section["heading"]}\n{section["content"]}\n'
              section_tokens = len(self.tokenizer.encode(section_text))
-
+
              if tokens_used + section_tokens <= max_tokens:
                  result += section_text
                  tokens_used += section_tokens
              elif tokens_used < max_tokens - 100: # If we can fit a truncated version
                  # Take what we can
                  remaining_tokens = max_tokens - tokens_used
-                 truncated_text = self.tokenizer.decode(self.tokenizer.encode(section_text)[:remaining_tokens])
+                 truncated_text = self.tokenizer.decode(
+                     self.tokenizer.encode(section_text)[:remaining_tokens]
+                 )
                  result += truncated_text
                  break
-
+
          return result

      def clean_output(self, output: str, task: str = "general") -> str:
          """Task-specific output cleaning"""
          if not output:
              return ""
-
+
          # Remove any remaining prompt artifacts
          output = output.split("<|end|>")[0].strip()
-
+
          if task == "classification":
              # For classification tasks, extract just the category
-             categories = ["AI", "Database", "Web Framework", "Networking", "Serialization",
-                           "Utilities", "DevTools", "ML", "Cryptography", "Unknown"]
+             categories = [
+                 "AI",
+                 "Database",
+                 "Web Framework",
+                 "Networking",
+                 "Serialization",
+                 "Utilities",
+                 "DevTools",
+                 "ML",
+                 "Cryptography",
+                 "Unknown",
+             ]
              for category in categories:
-                 if re.search(r'\b' + re.escape(category) + r'\b', output, re.IGNORECASE):
+                 if re.search(
+                     r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
+                 ):
                      return category
              return "Unknown"
-
+
          elif task == "factual_pairs":
              # For factual pairs, ensure proper formatting
-             pairs = []
-             facts = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', output, re.DOTALL)
-             counterfacts = re.findall(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', output, re.DOTALL)
-
+             pairs: list[str] = []
+             facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
+             counterfacts = re.findall(
+                 r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
+             )
+
              # Pair them up
              for i in range(min(len(facts), len(counterfacts))):
-                 pairs.append(f"✅ Factual: {facts[i].strip()}\n❌ Counterfactual: {counterfacts[i].strip()}")
-
+                 pairs.append(
+                     f"✅ Factual: {facts[i].strip()}\n"
+                     f"❌ Counterfactual: {counterfacts[i].strip()}"
+                 )
+
              return "\n\n".join(pairs)
-
+
          else:
              # General cleaning - more permissive than before
              lines = [line.strip() for line in output.splitlines() if line.strip()]
              return "\n".join(lines)

-     def run_llama(self, prompt: str, temp: float = 0.2, max_tokens: int = 256) -> Optional[str]:
+     def run_llama(
+         self, prompt: str, temp: float = 0.2, max_tokens: int = 256
+     ) -> Union[str, None]:
          """Run the LLM with customizable parameters per task"""
          try:
              token_count = self.estimate_tokens(prompt)
              if token_count > self.config.prompt_token_margin:
                  logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
-                 prompt = self.truncate_content(prompt, self.config.prompt_token_margin - 100)
-
+                 prompt = self.truncate_content(
+                     prompt, self.config.prompt_token_margin - 100
+                 )
+
              output = self.model(
                  prompt,
                  max_tokens=max_tokens,
                  temperature=temp,
-                 stop=["<|end|>", "<|user|>", "<|system|>"] # Stop at these tokens
+                 # Stop at these tokens
+                 stop=["<|end|>", "<|user|>", "<|system|>"],
              )
-
-             raw_text = output["choices"][0]["text"]
+
+             raw_text: str = output["choices"][0]["text"] # type: ignore
              return self.clean_output(raw_text)
          except Exception as e:
              logging.error(f"Model inference failed: {str(e)}")
@@ -174,66 +235,102 @@ class LLMEnricher:
          self,
          prompt: str,
          validation_func: Callable[[str], bool],
-         temp: float = 0.2, max_tokens: int = 256,
-         retries: int = 4 # Increased from 2 to 4 for better success rates
-     ) -> Optional[str]:
+         temp: float = 0.2,
+         max_tokens: int = 256,
+         retries: int = 4, # Increased from 2 to 4 for better success rates
+     ) -> Union[str, None]:
          """Run LLM with validation and automatic retry on failure"""
          result = None
          for attempt in range(retries):
              try:
                  # More generous temperature adjustment for better variety
-                 adjusted_temp = temp * (1 + (attempt * 0.2)) # 20% increases instead of 10%
-                 result = self.run_llama(prompt, temp=adjusted_temp, max_tokens=max_tokens)
-
+                 # 20% increases instead of 10%
+                 adjusted_temp = temp * (1 + (attempt * 0.2))
+                 result = self.run_llama(
+                     prompt, temp=adjusted_temp, max_tokens=max_tokens
+                 )
+
                  # Validate the result
                  if result and validation_func(result):
                      return result
-
-                 # If we get here, validation failed - use debug level for early attempts
+
+                 # If we get here, validation failed - use debug level for early
+                 # attempts
                  if attempt == retries - 1:
-                     logging.debug(f"All {retries} validation attempts failed, using last available result.")
+                     logging.debug(
+                         f"All {retries} validation attempts failed, "
+                         "using last available result."
+                     )
                  else:
-                     logging.debug(f"Validation failed on attempt {attempt+1}/{retries}. Retrying with adjusted temp={adjusted_temp:.2f}")
-
+                     logging.debug(
+                         f"Validation failed on attempt {attempt + 1}/{retries}. "
+                         f"Retrying with adjusted temp={adjusted_temp:.2f}"
+                     )
+
                  # Only simplify prompt on later attempts (attempt 2+)
                  if attempt >= 2:
                      prompt = self.simplify_prompt(prompt)
-
+
              except Exception as e:
-                 logging.error(f"Generation error on attempt {attempt+1}: {str(e)}")
-
+                 logging.error(f"Generation error on attempt {attempt + 1}: {str(e)}")
+
                  # More generous backoff - give the model more time
                  time.sleep(2.0 + (attempt * 1.0)) # 2s, 3s, 4s, 5s delays
-
-         # If we exhausted all retries, return the last result even if not perfect
-         return result if 'result' in locals() else None
+
+         # If we exhausted all retries, return the last result even if not
+         # perfect
+         return result if "result" in locals() else None

      def simplify_prompt(self, prompt: str) -> str:
          """Simplify a prompt by removing examples and reducing context"""
          # Remove few-shot examples
-         prompt = re.sub(r'# Example [0-9].*?(?=# Crate to Classify|\Z)', '', prompt, flags=re.DOTALL)
-
+         prompt = re.sub(
+             r"# Example [0-9].*?(?=# Crate to Classify|\Z)",
+             "",
+             prompt,
+             flags=re.DOTALL,
+         )
+
          # Make instructions more direct
-         prompt = re.sub(r'<\|system\|>.*?<\|user\|>', '<|system|>Be concise.\n<|user|>', prompt, flags=re.DOTALL)
-
+         prompt = re.sub(
+             r"<\|system\|>.*?<\|user\|>",
+             "<|system|>Be concise.\n<|user|>",
+             prompt,
+             flags=re.DOTALL,
+         )
+
          return prompt

      def validate_classification(self, result: str) -> bool:
          """Ensure a valid category was returned"""
          if not result:
              return False
-         valid_categories = ["AI", "Database", "Web Framework", "Networking", "Serialization",
-                             "Utilities", "DevTools", "ML", "Cryptography", "Unknown"]
-         return any(category.lower() == result.strip().lower() for category in valid_categories)
+         valid_categories = [
+             "AI",
+             "Database",
+             "Web Framework",
+             "Networking",
+             "Serialization",
+             "Utilities",
+             "DevTools",
+             "ML",
+             "Cryptography",
+             "Unknown",
+         ]
+         return any(
+             category.lower() == result.strip().lower() for category in valid_categories
+         )

      def validate_factual_pairs(self, result: str) -> bool:
          """Ensure exactly 5 factual/counterfactual pairs exist"""
          if not result:
              return False
-
-         facts = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', result, re.DOTALL)
-         counterfacts = re.findall(r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', result, re.DOTALL)
-
+
+         facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", result, re.DOTALL)
+         counterfacts = re.findall(
+             r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", result, re.DOTALL
+         )
+
          return len(facts) >= 3 and len(counterfacts) >= 3 # At least 3 pairs

      def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
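validate_and_retry wraps run_llama with a caller-supplied predicate, raising the temperature about 20% per attempt and simplifying the prompt from the third attempt on. A hedged sketch of the call pattern (the prompt and length check are illustrative):

    # Hedged sketch: retry until the output passes a simple validation function.
    def looks_complete(text: str) -> bool:
        return len(text.strip()) > 50  # illustrative threshold

    result = enricher.validate_and_retry(
        "<|system|>Be concise.\n<|user|>Summarize the crate `serde`.\n<|end|>",
        validation_func=looks_complete,
        temp=0.3,
        max_tokens=200,
    )
    summary = result or "Summary not available."  # None means all retries failed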
@@ -241,31 +338,29 @@ class LLMEnricher:
          # Convert CrateMetadata to EnrichedCrate
          enriched_dict = crate.__dict__.copy()
          enriched = EnrichedCrate(**enriched_dict)
-
+
          try:
              # Generate README summary first
              if crate.readme:
                  readme_content = self.smart_truncate(crate.readme, 2000)
                  prompt = (
-                     f"<|system|>Extract key features from README.\n"
-                     f"<|user|>Summarize key aspects of this Rust crate from its README:\n{readme_content}\n"
-                     f"<|end|>"
+                     "<|system|>Extract key features from README.\n"
+                     "<|user|>Summarize key aspects of this Rust crate from its "
+                     f"README:\n{readme_content}\n"
+                     "<|end|>"
                  )
                  enriched.readme_summary = self.validate_and_retry(
-                     prompt,
-                     lambda x: len(x) > 50,
-                     temp=0.3,
-                     max_tokens=300 )
-
+                     prompt, lambda x: len(x) > 50, temp=0.3, max_tokens=300
+                 )
+
              # Generate other enrichments
              enriched.feature_summary = self.summarize_features(crate)
              enriched.use_case = self.classify_use_case(
-                 crate,
-                 enriched.readme_summary or ""
+                 crate, enriched.readme_summary or ""
              )
              enriched.score = self.score_crate(crate)
              enriched.factual_counterfactual = self.generate_factual_pairs(crate)
-
+
              return enriched
          except Exception as e:
              logging.error(f"Failed to enrich {crate.name}: {str(e)}")
@@ -276,23 +371,24 @@ class LLMEnricher:
          try:
              if not crate.features:
                  return "No features documented for this crate."
-
+
              # Format features with their dependencies
              feature_text = ""
-             for f in crate.features[:8]: # Limit to 8 features for context size
-                 feature_name = f.get("name", "")
-                 deps = f.get("dependencies", [])
+             for feature_name, deps in list(crate.features.items())[:8]:
                  deps_str = ", ".join(deps) if deps else "none"
                  feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
-
+
              prompt = (
-                 f"<|system|>You are a Rust programming expert analyzing crate features.\n"
-                 f"<|user|>For the Rust crate `{crate.name}`, explain these features and what functionality they provide:\n\n"
+                 "<|system|>You are a Rust programming expert analyzing crate "
+                 "features.\n"
+                 f"<|user|>For the Rust crate `{crate.name}`, explain these "
+                 "features and what functionality they provide:\n\n"
                  f"{feature_text}\n\n"
-                 f"Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
-                 f"<|end|>"
+                 "Provide a concise explanation of each feature's purpose and "
+                 "when a developer would enable it.\n"
+                 "<|end|>"
              )
-
+
              # Use moderate temperature for informative but natural explanation
              result = self.run_llama(prompt, temp=0.2, max_tokens=350)
              return result or "Feature summary not available."
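Note the behavioural change above: 1.3.0 iterates crate.features.items(), so features are now expected as a mapping of feature name to dependency list rather than the 1.2.5 list-of-dicts shape. A small sketch of the new shape with illustrative values:

    # Hedged sketch: features as a mapping, matching how the 1.3.0 loop consumes it.
    features = {"derive": ["serde_derive"], "std": []}
    for feature_name, deps in list(features.items())[:8]:
        deps_str = ", ".join(deps) if deps else "none"
        print(f"- {feature_name} (dependencies: {deps_str})")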
@@ -302,59 +398,67 @@ class LLMEnricher:

      def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
          """Classify the use case of a crate with rich context"""
-         try: # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
-             available_prompt_tokens = self.config.model_token_limit - 200 # Reserve for response
-
+         try:
+             # Calculate available tokens for prompt
+             available_prompt_tokens = self.config.model_token_limit - 200
+
              joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
-             key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
-                         if dep.get("kind") == "normal" and dep.get("crate_id")]
-             key_deps_str = ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
-
+             key_deps = [
+                 dep.get("crate_id")
+                 for dep in crate.dependencies[:5]
+                 if dep.get("kind") == "normal" and dep.get("crate_id")
+             ]
+             key_deps_str = (
+                 ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
+             )
+
              # Adaptively truncate different sections based on importance
-             token_budget = available_prompt_tokens - 400 # Reserve tokens for prompt template
-
+             token_budget = available_prompt_tokens - 400
+
              # Allocate different percentages to each section
              desc_tokens = int(token_budget * 0.2)
              readme_tokens = int(token_budget * 0.6)
-
+
              desc = self.truncate_content(crate.description, desc_tokens)
              readme_summary = self.smart_truncate(readme_summary, readme_tokens)
-
+
              # Few-shot prompting with examples
              prompt = (
-                 f"<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
-                 f"<|user|>\n"
-                 f"# Example 1\n"
-                 f"Crate: `tokio`\n"
-                 f"Description: An asynchronous runtime for the Rust programming language\n"
-                 f"Keywords: async, runtime, futures\n"
-                 f"Key Dependencies: mio, bytes, parking_lot\n"
-                 f"Category: Networking\n\n"
-
-                 f"# Example 2\n"
-                 f"Crate: `serde`\n"
-                 f"Description: A generic serialization/deserialization framework\n"
-                 f"Keywords: serde, serialization\n"
-                 f"Key Dependencies: serde_derive\n"
-                 f"Category: Serialization\n\n"
-
-                 f"# Crate to Classify\n"
+                 "<|system|>You are a Rust expert classifying crates into the "
+                 "most appropriate category.\n"
+                 "<|user|>\n"
+                 "# Example 1\n"
+                 "Crate: `tokio`\n"
+                 "Description: An asynchronous runtime for the Rust programming "
+                 "language\n"
+                 "Keywords: async, runtime, futures\n"
+                 "Key Dependencies: mio, bytes, parking_lot\n"
+                 "Category: Networking\n\n"
+                 "# Example 2\n"
+                 "Crate: `serde`\n"
+                 "Description: A generic serialization/deserialization framework\n"
+                 "Keywords: serde, serialization\n"
+                 "Key Dependencies: serde_derive\n"
+                 "Category: Serialization\n\n"
+                 "# Crate to Classify\n"
                  f"Crate: `{crate.name}`\n"
                  f"Description: {desc}\n"
                  f"Keywords: {joined}\n"
                  f"README Summary: {readme_summary}\n"
                  f"Key Dependencies: {key_deps_str}\n\n"
-                 f"Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
-                 f"<|end|>"
+                 "Category (pick only one): [AI, Database, Web Framework, "
+                 "Networking, Serialization, Utilities, DevTools, ML, "
+                 "Cryptography, Unknown]\n"
+                 "<|end|>"
              )
-             # Validate classification with retry - more generous parameters
+             # Validate classification with retry - more generous parameters
              result = self.validate_and_retry(
-                 prompt,
+                 prompt,
                  validation_func=self.validate_classification,
-                 temp=0.2, # Increased from 0.1 for more variety
-                 max_tokens=50 # Increased from 20 to allow more complete responses
+                 temp=0.2,
+                 max_tokens=50,
              )
-
+
              return result or "Unknown"
          except Exception as e:
              logging.error(f"Classification failed for {crate.name}: {str(e)}")
@@ -364,32 +468,36 @@ class LLMEnricher:
          """Generate factual/counterfactual pairs with retry and validation"""
          try:
              desc = self.truncate_content(crate.description, 300)
-             readme_summary = self.truncate_content(getattr(crate, 'readme_summary', '') or '', 300)
-
+             readme_summary = self.truncate_content(
+                 getattr(crate, "readme_summary", "") or "", 300
+             )
+             features = ", ".join(list(crate.features.keys())[:5])
+
              prompt = (
-                 f"<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
-                 f"Factual statements must be true. Counterfactuals should be plausible but incorrect - "
-                 f"make them subtle and convincing rather than simple negations.\n"
-                 f"<|user|>\n"
+                 "<|system|>Create exactly 5 factual/counterfactual pairs for "
+                 "the Rust crate. Factual statements must be true. "
+                 "Counterfactuals should be plausible but incorrect - make them "
+                 "subtle and convincing rather than simple negations.\n"
+                 "<|user|>\n"
                  f"Crate: {crate.name}\n"
                  f"Description: {desc}\n"
                  f"Repo: {crate.repository}\n"
                  f"README Summary: {readme_summary}\n"
-                 f"Key Features: {', '.join([f.get('name', '') for f in crate.features[:5]])}\n\n"
-                 f"Format each pair as:\n"
-                 f"✅ Factual: [true statement about the crate]\n"
-                 f"❌ Counterfactual: [plausible but false statement]\n\n"
-                 f"Create exactly 5 pairs.\n"
-                 f"<|end|>"
+                 f"Key Features: {features}\n\n"
+                 "Format each pair as:\n"
+                 "✅ Factual: [true statement about the crate]\n"
+                 "❌ Counterfactual: [plausible but false statement]\n\n"
+                 "Create exactly 5 pairs.\n"
+                 "<|end|>"
              )
-             # Use validation for retry - more generous parameters
+             # Use validation for retry - more generous parameters
              result = self.validate_and_retry(
-                 prompt,
-                 validation_func=self.validate_factual_pairs,
-                 temp=0.7, # Increased from 0.6 for more creativity
-                 max_tokens=800 # Increased from 500 for more complete responses
+                 prompt,
+                 validation_func=self.validate_factual_pairs,
+                 temp=0.7,
+                 max_tokens=800,
              )
-
+
              return result or "Factual pairs generation failed."
          except Exception as e:
              logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
@@ -401,71 +509,72 @@ class LLMEnricher:
              score += len(self.truncate_content(crate.readme, 1000)) / 500
          return round(score, 2)

-     def batch_process_prompts(self, prompts: list[tuple[str, float, int]], batch_size: int = 4) -> list[Optional[str]]:
+     def batch_process_prompts(
+         self, prompts: list[tuple[str, float, int]], batch_size: int = 4
+     ) -> list[Union[str, None]]:
          """
-         L4 GPU-optimized batch processing for multiple prompts
-         Processes prompts in batches to maximize GPU utilization
-
+         L4 GPU-optimized batch processing for multiple prompts.
+         Processes prompts in batches to maximize GPU utilization.
+
          Args:
              prompts: List of (prompt, temperature, max_tokens) tuples
-             batch_size: Number of prompts to process simultaneously (tuned for L4)
+             batch_size: Number of prompts to process simultaneously
          """
-         results = []
-
+         results: list[Union[str, None]] = []
+
          # Process in batches optimized for L4's capabilities
         for i in range(0, len(prompts), batch_size):
-             batch = prompts[i:i + batch_size]
-             batch_results = []
-
+             batch = prompts[i : i + batch_size]
+             batch_results: list[Union[str, None]] = []
+
              for prompt, temp, max_tokens in batch:
                  try:
                      # Prepare prompt with context preservation
-                     if self.estimate_tokens(prompt) > 3500: # Leave room for response
+                     if self.estimate_tokens(prompt) > 3500:
                          prompt = self.smart_truncate(prompt, 3500)
-
+
                      # Use optimized parameters for L4
                      output = self.model(
                          prompt,
                          max_tokens=max_tokens,
                          temperature=temp,
-                         top_p=0.95, # Nucleus sampling for better quality
-                         repeat_penalty=1.1, # Reduce repetition
+                         top_p=0.95,
+                         repeat_penalty=1.1,
                          stop=["<|end|>", "<|user|>", "<|system|>"],
-                         echo=False, # Don't echo input
-                         stream=False # Batch mode, no streaming
+                         echo=False,
+                         stream=False,
                      )
-
-                     result = self.clean_output(output["choices"][0]["text"])
+
+                     # The type checker incorrectly infers a stream response
+                     choice_text: str = output["choices"][0]["text"] # type: ignore
+                     result = self.clean_output(choice_text)
                      batch_results.append(result)
-
                  except Exception as e:
-                     logging.warning(f"Batch processing error: {e}")
+                     logging.error(f"LLM batch processing error: {e}", exc_info=True)
                      batch_results.append(None)
-
+
              results.extend(batch_results)
-
-             # Small delay between batches to prevent thermal throttling
-             if i + batch_size < len(prompts):
-                 time.sleep(0.1)
-
+
          return results

-     def smart_context_management(self, context_history: list[str], new_prompt: str) -> str:
+     def smart_context_management(
+         self, context_history: list[str], new_prompt: str
+     ) -> str:
          """
-         Intelligent context management for prefix cache optimization
-         Maximizes cache hits by preserving common context patterns
+         Intelligent context management for prefix cache optimization.
+         Maximizes cache hits by preserving common context patterns.
          """
          # Calculate available tokens for context
          base_tokens = self.estimate_tokens(new_prompt)
          available_context = 4000 - base_tokens # Leave buffer for response
-
+
          if available_context <= 0:
              return new_prompt
-
+
          # Build context from most recent and most relevant history
-         context_parts = []
+         context_parts: list[str] = []
          tokens_used = 0
-
+
          # Prioritize recent context (better cache hits)
          for context in reversed(context_history[-5:]): # Last 5 contexts
              context_tokens = self.estimate_tokens(context)
@@ -480,10 +589,10 @@ class LLMEnricher:
                  if truncated:
                      context_parts.insert(0, truncated)
                      break
-
+
          # Combine context with new prompt
          if context_parts:
              full_context = "\n\n---\n\n".join(context_parts)
              return f"{full_context}\n\n---\n\n{new_prompt}"
-
+
          return new_prompt
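batch_process_prompts takes (prompt, temperature, max_tokens) tuples and returns one result (or None) per prompt, while smart_context_management prepends recent history to a new prompt to favour prefix-cache hits. A hedged end-to-end sketch, again assuming an `enricher` instance and illustrative prompts:

    # Hedged sketch: batch two classification prompts, then reuse the outputs as context.
    prompts = [
        ("<|user|>Classify the crate `tokio`.<|end|>", 0.2, 50),
        ("<|user|>Classify the crate `serde`.<|end|>", 0.2, 50),
    ]
    results = enricher.batch_process_prompts(prompts, batch_size=2)
    history = [r for r in results if r]  # drop failed generations
    next_prompt = enricher.smart_context_management(
        history, "<|user|>Classify the crate `rand`.<|end|>"
    )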