rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -1,596 +1,718 @@
1
- # ai_processing.py
2
- import re
3
- import time
4
- import logging
5
- from typing import Callable, Optional, Any, Dict, List
6
- from .config import PipelineConfig, CrateMetadata, EnrichedCrate
7
-
8
- # Optional imports with fallbacks
9
- _ai_dependencies_available = True
10
- try:
11
- import tiktoken
12
- from llama_cpp import Llama
13
- except ImportError as e:
14
- logging.warning(f"AI dependencies not available: {e}")
15
- tiktoken = None
16
- Llama = None
17
- _ai_dependencies_available = False
18
-
19
-
20
- class LLMEnricher:
21
- def __init__(self, config: PipelineConfig):
22
- if not _ai_dependencies_available:
23
- raise ImportError("AI dependencies (tiktoken, llama_cpp) are not available. Please install them to use LLMEnricher.")
24
-
25
- self.config = config
26
- self.tokenizer = tiktoken.get_encoding("cl100k_base") # type: ignore
27
- self.model = self._load_model()
28
-
29
- def _load_model(self):
30
- """Optimized for GCP g2-standard-4 with L4 GPU (24GB VRAM)"""
31
- if not _ai_dependencies_available:
32
- raise ImportError("Cannot load model: AI dependencies not available")
33
-
34
- return Llama( # type: ignore
35
- model_path=self.config.model_path,
36
- n_ctx=4096, # Larger context for L4's 24GB VRAM
37
- n_batch=1024, # Larger batch size for better throughput
38
- # Load ALL layers on GPU (L4 has plenty VRAM)
39
- n_gpu_layers=-1,
40
- n_threads=4, # Match the 4 vCPUs
41
- n_threads_batch=4, # Parallel batch processing
42
- use_mmap=True, # Memory-mapped files for efficiency
43
- use_mlock=True, # Lock model in memory
44
- rope_scaling_type=1, # RoPE scaling for longer contexts
45
- rope_freq_base=10000.0, # Base frequency for RoPE
46
- flash_attn=True, # Enable flash attention if available
47
- verbose=False # Reduce logging overhead
48
- )
49
-
50
- def estimate_tokens(self, text: str) -> int:
51
- return len(self.tokenizer.encode(text))
52
-
53
- def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
54
- """Truncate content to fit within token limit"""
55
- paragraphs = content.split("\n\n")
56
- result, current_tokens = "", 0
57
-
58
- for para in paragraphs:
59
- tokens = len(self.tokenizer.encode(para))
60
- if current_tokens + tokens <= max_tokens:
61
- result += para + "\n\n"
62
- current_tokens += tokens
63
- else:
64
- break
65
- return result.strip()
66
-
67
- def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
68
- """Intelligently truncate content to preserve the most important parts"""
69
- if not content:
70
- return ""
71
-
72
- # If content is short enough, return it all
73
- if len(self.tokenizer.encode(content)) <= max_tokens:
74
- return content
75
-
76
- # Split into sections based on markdown headers
77
- sections = []
78
- current_section = {
79
- "heading": "Introduction",
80
- "content": "",
81
- "priority": 10}
82
-
83
- for line in content.splitlines():
84
- if re.match(r'^#+\s+', line): # It's a header
85
- # Save previous section if not empty
86
- if current_section["content"].strip():
87
- sections.append(current_section)
88
-
89
- # Create new section with appropriate priority
90
- heading = re.sub(r'^#+\s+', '', line)
91
- priority = 5 # Default priority
92
-
93
- # Assign priority based on content type
94
- if re.search(
95
- r'\b(usage|example|getting started)\b',
96
- heading,
97
- re.I):
98
- priority = 10
99
- elif re.search(r'\b(feature|overview|about)\b', heading, re.I):
100
- priority = 9
101
- elif re.search(r'\b(install|setup|config)\b', heading, re.I):
102
- priority = 8
103
- elif re.search(r'\b(api|interface)\b', heading, re.I):
104
- priority = 7
105
-
106
- current_section = {
107
- "heading": heading,
108
- "content": line + "\n",
109
- "priority": priority}
110
- else:
111
- current_section["content"] += line + "\n"
112
-
113
- # Boost priority if code block is found
114
- if "```rust" in line or "```no_run" in line:
115
- current_section["priority"] = max(
116
- current_section["priority"], 8)
117
-
118
- # Add the last section
119
- if current_section["content"].strip():
120
- sections.append(current_section)
121
-
122
- # Sort sections by priority (highest first)
123
- sections.sort(key=lambda x: x["priority"], reverse=True)
124
-
125
- # Build the result, respecting token limits
126
- result = ""
127
- tokens_used = 0
128
-
129
- for section in sections:
130
- section_text = f"## {section['heading']}\n{section['content']}\n"
131
- section_tokens = len(self.tokenizer.encode(section_text))
132
-
133
- if tokens_used + section_tokens <= max_tokens:
134
- result += section_text
135
- tokens_used += section_tokens
136
- elif tokens_used < max_tokens - 100: # If we can fit a truncated version
137
- # Take what we can
138
- remaining_tokens = max_tokens - tokens_used
139
- truncated_text = self.tokenizer.decode(
140
- self.tokenizer.encode(section_text)[:remaining_tokens])
141
- result += truncated_text
142
- break
143
-
144
- return result
145
-
146
- def clean_output(self, output: str, task: str = "general") -> str:
147
- """Task-specific output cleaning"""
148
- if not output:
149
- return ""
150
-
151
- # Remove any remaining prompt artifacts
152
- output = output.split("<|end|>")[0].strip()
153
-
154
- if task == "classification":
155
- # For classification tasks, extract just the category
156
- categories = [
157
- "AI",
158
- "Database",
159
- "Web Framework",
160
- "Networking",
161
- "Serialization",
162
- "Utilities",
163
- "DevTools",
164
- "ML",
165
- "Cryptography",
166
- "Unknown"]
167
- for category in categories:
168
- if re.search(
169
- r'\b' +
170
- re.escape(category) +
171
- r'\b',
172
- output,
173
- re.IGNORECASE):
174
- return category
175
- return "Unknown"
176
-
177
- elif task == "factual_pairs":
178
- # For factual pairs, ensure proper formatting
179
- pairs = []
180
- facts = re.findall(
181
- r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)',
182
- output,
183
- re.DOTALL)
184
- counterfacts = re.findall(
185
- r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', output, re.DOTALL)
186
-
187
- # Pair them up
188
- for i in range(min(len(facts), len(counterfacts))):
189
- pairs.append(
190
- f"✅ Factual: {
191
- facts[i].strip()}\n❌ Counterfactual: {
192
- counterfacts[i].strip()}")
193
-
194
- return "\n\n".join(pairs)
195
-
196
- else:
197
- # General cleaning - more permissive than before
198
- lines = [line.strip()
199
- for line in output.splitlines() if line.strip()]
200
- return "\n".join(lines)
201
-
202
- def run_llama(self, prompt: str, temp: float = 0.2,
203
- max_tokens: int = 256) -> Optional[str]:
204
- """Run the LLM with customizable parameters per task"""
205
- try:
206
- token_count = self.estimate_tokens(prompt)
207
- if token_count > self.config.prompt_token_margin:
208
- logging.warning(
209
- f"Prompt too long ({token_count} tokens). Truncating.")
210
- prompt = self.truncate_content(
211
- prompt, self.config.prompt_token_margin - 100)
212
-
213
- output = self.model(
214
- prompt,
215
- max_tokens=max_tokens,
216
- temperature=temp,
217
- # Stop at these tokens
218
- stop=["<|end|>", "<|user|>", "<|system|>"]
219
- )
220
-
221
- raw_text = output["choices"][0]["text"]
222
- return self.clean_output(raw_text)
223
- except Exception as e:
224
- logging.error(f"Model inference failed: {str(e)}")
225
- raise
226
-
227
- def validate_and_retry(
228
- self,
229
- prompt: str,
230
- validation_func: Callable[[str], bool],
231
- temp: float = 0.2, max_tokens: int = 256,
232
- retries: int = 4 # Increased from 2 to 4 for better success rates
233
- ) -> Optional[str]:
234
- """Run LLM with validation and automatic retry on failure"""
235
- result = None
236
- for attempt in range(retries):
237
- try:
238
- # More generous temperature adjustment for better variety
239
- # 20% increases instead of 10%
240
- adjusted_temp = temp * (1 + (attempt * 0.2))
241
- result = self.run_llama(
242
- prompt, temp=adjusted_temp, max_tokens=max_tokens)
243
-
244
- # Validate the result
245
- if result and validation_func(result):
246
- return result
247
-
248
- # If we get here, validation failed - use debug level for early
249
- # attempts
250
- if attempt == retries - 1:
251
- logging.debug(
252
- f"All {retries} validation attempts failed, using last available result.")
253
- else:
254
- logging.debug(
255
- f"Validation failed on attempt {
256
- attempt + 1}/{retries}. Retrying with adjusted temp={
257
- adjusted_temp:.2f}")
258
-
259
- # Only simplify prompt on later attempts (attempt 2+)
260
- if attempt >= 2:
261
- prompt = self.simplify_prompt(prompt)
262
-
263
- except Exception as e:
264
- logging.error(
265
- f"Generation error on attempt {
266
- attempt +
267
- 1}: {
268
- str(e)}")
269
-
270
- # More generous backoff - give the model more time
271
- time.sleep(2.0 + (attempt * 1.0)) # 2s, 3s, 4s, 5s delays
272
-
273
- # If we exhausted all retries, return the last result even if not
274
- # perfect
275
- return result if 'result' in locals() else None
276
-
277
- def simplify_prompt(self, prompt: str) -> str:
278
- """Simplify a prompt by removing examples and reducing context"""
279
- # Remove few-shot examples
280
- prompt = re.sub(
281
- r'# Example [0-9].*?(?=# Crate to Classify|\Z)',
282
- '',
283
- prompt,
284
- flags=re.DOTALL)
285
-
286
- # Make instructions more direct
287
- prompt = re.sub(
288
- r'<\|system\|>.*?<\|user\|>',
289
- '<|system|>Be concise.\n<|user|>',
290
- prompt,
291
- flags=re.DOTALL)
292
-
293
- return prompt
294
-
295
- def validate_classification(self, result: str) -> bool:
296
- """Ensure a valid category was returned"""
297
- if not result:
298
- return False
299
- valid_categories = [
300
- "AI",
301
- "Database",
302
- "Web Framework",
303
- "Networking",
304
- "Serialization",
305
- "Utilities",
306
- "DevTools",
307
- "ML",
308
- "Cryptography",
309
- "Unknown"]
310
- return any(category.lower() == result.strip().lower()
311
- for category in valid_categories)
312
-
313
- def validate_factual_pairs(self, result: str) -> bool:
314
- """Ensure exactly 5 factual/counterfactual pairs exist"""
315
- if not result:
316
- return False
317
-
318
- facts = re.findall(r'✅\s*Factual:?\s*(.*?)(?=❌|\Z)', result, re.DOTALL)
319
- counterfacts = re.findall(
320
- r'❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)', result, re.DOTALL)
321
-
322
- return len(facts) >= 3 and len(counterfacts) >= 3 # At least 3 pairs
323
-
324
- def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
325
- """Apply all AI enrichments to a crate"""
326
- # Convert CrateMetadata to EnrichedCrate
327
- enriched_dict = crate.__dict__.copy()
328
- enriched = EnrichedCrate(**enriched_dict)
329
-
330
- try:
331
- # Generate README summary first
332
- if crate.readme:
333
- readme_content = self.smart_truncate(crate.readme, 2000)
334
- prompt = (
335
- "<|system|>Extract key features from README.\n"
336
- f"<|user|>Summarize key aspects of this Rust crate from its README:\n{readme_content}\n"
337
- "<|end|>"
338
- )
339
- enriched.readme_summary = self.validate_and_retry(
340
- prompt,
341
- lambda x: len(x) > 50,
342
- temp=0.3,
343
- max_tokens=300)
344
-
345
- # Generate other enrichments
346
- enriched.feature_summary = self.summarize_features(crate)
347
- enriched.use_case = self.classify_use_case(
348
- crate,
349
- enriched.readme_summary or ""
350
- )
351
- enriched.score = self.score_crate(crate)
352
- enriched.factual_counterfactual = self.generate_factual_pairs(
353
- crate)
354
-
355
- return enriched
356
- except Exception as e:
357
- logging.error(f"Failed to enrich {crate.name}: {str(e)}")
358
- return enriched
359
-
360
- def summarize_features(self, crate: CrateMetadata) -> str:
361
- """Generate summaries for crate features with better prompting"""
362
- try:
363
- if not crate.features:
364
- return "No features documented for this crate."
365
-
366
- # Format features with their dependencies
367
- feature_text = ""
368
- for f in crate.features[:8]: # Limit to 8 features for context size
369
- feature_name = f.get("name", "")
370
- deps = f.get("dependencies", [])
371
- deps_str = ", ".join(deps) if deps else "none"
372
- feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
373
-
374
- prompt = (
375
- "<|system|>You are a Rust programming expert analyzing crate features.\n"
376
- f"<|user|>For the Rust crate `{crate.name}`, explain these features and what functionality they provide:\n\n"
377
- f"{feature_text}\n\n"
378
- "Provide a concise explanation of each feature's purpose and when a developer would enable it.\n"
379
- "<|end|>"
380
- )
381
-
382
- # Use moderate temperature for informative but natural explanation
383
- result = self.run_llama(prompt, temp=0.2, max_tokens=350)
384
- return result or "Feature summary not available."
385
- except Exception as e:
386
- logging.warning(
387
- f"Feature summarization failed for {
388
- crate.name}: {
389
- str(e)}")
390
- return "Feature summary not available."
391
-
392
- def classify_use_case(
393
- self,
394
- crate: CrateMetadata,
395
- readme_summary: str) -> str:
396
- """Classify the use case of a crate with rich context"""
397
- try: # Calculate available tokens for prompt (classification usually needs ~20 response tokens)
398
- available_prompt_tokens = self.config.model_token_limit - 200 # Reserve for response
399
-
400
- joined = ", ".join(
401
- crate.keywords[:10]) if crate.keywords else "None"
402
- key_deps = [dep.get("crate_id") for dep in crate.dependencies[:5]
403
- if dep.get("kind") == "normal" and dep.get("crate_id")]
404
- key_deps_str = ", ".join(str(dep)
405
- for dep in key_deps) if key_deps else "None"
406
-
407
- # Adaptively truncate different sections based on importance
408
- token_budget = available_prompt_tokens - \
409
- 400 # Reserve tokens for prompt template
410
-
411
- # Allocate different percentages to each section
412
- desc_tokens = int(token_budget * 0.2)
413
- readme_tokens = int(token_budget * 0.6)
414
-
415
- desc = self.truncate_content(crate.description, desc_tokens)
416
- readme_summary = self.smart_truncate(readme_summary, readme_tokens)
417
-
418
- # Few-shot prompting with examples
419
- prompt = (
420
- "<|system|>You are a Rust expert classifying crates into the most appropriate category.\n"
421
- "<|user|>\n"
422
- "# Example 1\n"
423
- "Crate: `tokio`\n"
424
- "Description: An asynchronous runtime for the Rust programming language\n"
425
- "Keywords: async, runtime, futures\n"
426
- "Key Dependencies: mio, bytes, parking_lot\n"
427
- "Category: Networking\n\n"
428
-
429
- "# Example 2\n"
430
- "Crate: `serde`\n"
431
- "Description: A generic serialization/deserialization framework\n"
432
- "Keywords: serde, serialization\n"
433
- "Key Dependencies: serde_derive\n"
434
- "Category: Serialization\n\n"
435
-
436
- "# Crate to Classify\n"
437
- f"Crate: `{crate.name}`\n"
438
- f"Description: {desc}\n"
439
- f"Keywords: {joined}\n"
440
- f"README Summary: {readme_summary}\n"
441
- f"Key Dependencies: {key_deps_str}\n\n"
442
- "Category (pick only one): [AI, Database, Web Framework, Networking, Serialization, Utilities, DevTools, ML, Cryptography, Unknown]\n"
443
- "<|end|>"
444
- )
445
- # Validate classification with retry - more generous parameters
446
- result = self.validate_and_retry(
447
- prompt,
448
- validation_func=self.validate_classification,
449
- temp=0.2, # Increased from 0.1 for more variety
450
- max_tokens=50 # Increased from 20 to allow more complete responses
451
- )
452
-
453
- return result or "Unknown"
454
- except Exception as e:
455
- logging.error(f"Classification failed for {crate.name}: {str(e)}")
456
- return "Unknown"
457
-
458
- def generate_factual_pairs(self, crate: CrateMetadata) -> str:
459
- """Generate factual/counterfactual pairs with retry and validation"""
460
- try:
461
- desc = self.truncate_content(crate.description, 300)
462
- readme_summary = self.truncate_content(
463
- getattr(crate, 'readme_summary', '') or '', 300)
464
-
465
- prompt = (
466
- "<|system|>Create exactly 5 factual/counterfactual pairs for the Rust crate. "
467
- "Factual statements must be true. Counterfactuals should be plausible but incorrect - "
468
- "make them subtle and convincing rather than simple negations.\n"
469
- "<|user|>\n"
470
- f"Crate: {crate.name}\n"
471
- f"Description: {desc}\n"
472
- f"Repo: {crate.repository}\n"
473
- f"README Summary: {readme_summary}\n"
474
- f"Key Features: {', '.join([f.get('name', '') for f in crate.features[:5]])}\n\n"
475
- "Format each pair as:\n"
476
- "✅ Factual: [true statement about the crate]\n"
477
- "❌ Counterfactual: [plausible but false statement]\n\n"
478
- "Create exactly 5 pairs.\n"
479
- "<|end|>"
480
- )
481
- # Use validation for retry - more generous parameters
482
- result = self.validate_and_retry(
483
- prompt,
484
- validation_func=self.validate_factual_pairs,
485
- temp=0.7, # Increased from 0.6 for more creativity
486
- max_tokens=800 # Increased from 500 for more complete responses
487
- )
488
-
489
- return result or "Factual pairs generation failed."
490
- except Exception as e:
491
- logging.error(
492
- f"Exception in factual_pairs for {
493
- crate.name}: {
494
- str(e)}")
495
- return "Factual pairs generation failed."
496
-
497
- def score_crate(self, crate: CrateMetadata) -> float:
498
- """Calculate a score for the crate based on various metrics"""
499
- score = (crate.downloads / 1000) + (crate.github_stars * 10)
500
- score += len(self.truncate_content(crate.readme, 1000)) / 500
501
- return round(score, 2)
502
-
503
- def batch_process_prompts(self,
504
- prompts: list[tuple[str,
505
- float,
506
- int]],
507
- batch_size: int = 4) -> list[Optional[str]]:
508
- """
509
- L4 GPU-optimized batch processing for multiple prompts
510
- Processes prompts in batches to maximize GPU utilization
511
-
512
- Args:
513
- prompts: List of (prompt, temperature, max_tokens) tuples
514
- batch_size: Number of prompts to process simultaneously (tuned for L4)
515
- """
516
- results = []
517
-
518
- # Process in batches optimized for L4's capabilities
519
- for i in range(0, len(prompts), batch_size):
520
- batch = prompts[i:i + batch_size]
521
- batch_results = []
522
-
523
- for prompt, temp, max_tokens in batch:
524
- try:
525
- # Prepare prompt with context preservation
526
- if self.estimate_tokens(
527
- prompt) > 3500: # Leave room for response
528
- prompt = self.smart_truncate(prompt, 3500)
529
-
530
- # Use optimized parameters for L4
531
- output = self.model(
532
- prompt,
533
- max_tokens=max_tokens,
534
- temperature=temp,
535
- top_p=0.95, # Nucleus sampling for better quality
536
- repeat_penalty=1.1, # Reduce repetition
537
- stop=["<|end|>", "<|user|>", "<|system|>"],
538
- echo=False, # Don't echo input
539
- stream=False # Batch mode, no streaming
540
- )
541
-
542
- result = self.clean_output(output["choices"][0]["text"])
543
- batch_results.append(result)
544
-
545
- except Exception as e:
546
- logging.warning(f"Batch processing error: {e}")
547
- batch_results.append(None)
548
-
549
- results.extend(batch_results)
550
-
551
- # Small delay between batches to prevent thermal throttling
552
- if i + batch_size < len(prompts):
553
- time.sleep(0.1)
554
-
555
- return results
556
-
557
- def smart_context_management(
558
- self,
559
- context_history: list[str],
560
- new_prompt: str) -> str:
561
- """
562
- Intelligent context management for prefix cache optimization
563
- Maximizes cache hits by preserving common context patterns
564
- """
565
- # Calculate available tokens for context
566
- base_tokens = self.estimate_tokens(new_prompt)
567
- available_context = 4000 - base_tokens # Leave buffer for response
568
-
569
- if available_context <= 0:
570
- return new_prompt
571
-
572
- # Build context from most recent and most relevant history
573
- context_parts = []
574
- tokens_used = 0
575
-
576
- # Prioritize recent context (better cache hits)
577
- for context in reversed(context_history[-5:]): # Last 5 contexts
578
- context_tokens = self.estimate_tokens(context)
579
- if tokens_used + context_tokens <= available_context:
580
- context_parts.insert(0, context)
581
- tokens_used += context_tokens
582
- else:
583
- # Try to fit truncated version
584
- remaining_tokens = available_context - tokens_used
585
- if remaining_tokens > 100: # Only if meaningful space left
586
- truncated = self.smart_truncate(context, remaining_tokens)
587
- if truncated:
588
- context_parts.insert(0, truncated)
589
- break
590
-
591
- # Combine context with new prompt
592
- if context_parts:
593
- full_context = "\n\n---\n\n".join(context_parts)
594
- return f"{full_context}\n\n---\n\n{new_prompt}"
595
-
596
- return new_prompt
1
+ # ai_processing.py
2
+ import re
3
+ import time
4
+ import logging
5
+ import os
6
+ from typing import TypedDict, Union
7
+
8
+ from collections.abc import Callable
9
+
10
+ from .config import PipelineConfig, CrateMetadata, EnrichedCrate
11
+
12
+ # Optional imports with fallbacks
13
+ _ai_dependencies_available = True
14
+ try:
15
+ import tiktoken
16
+ from llama_cpp import Llama
17
+ except ImportError as e:
18
+ logging.warning(f"AI dependencies not available: {e}")
19
+ tiktoken = None # type: ignore[assignment]
20
+ Llama = None # type: ignore[assignment,misc]
21
+ _ai_dependencies_available = False
22
+
23
+
24
+ class Section(TypedDict):
25
+ heading: str
26
+ content: str
27
+ priority: int
28
+
29
+
30
+ class LLMEnricher:
31
+ def __init__(self, config: PipelineConfig) -> None:
32
+ """Initialize LLMEnricher with automatic provider detection"""
33
+ if not _ai_dependencies_available:
34
+ raise ImportError("Cannot load model: AI dependencies not available")
35
+
36
+ self.config = config
37
+ self.tokenizer = tiktoken.get_encoding("cl100k_base") # type: ignore
38
+
39
+ # Auto-detect and configure the appropriate LLM provider
40
+ self.model = self._auto_detect_and_load_model()
41
+
42
+ def _auto_detect_and_load_model(self):
43
+ """Automatically detect and load the appropriate LLM provider"""
44
+
45
+ # Priority 1: Check if Azure OpenAI is configured and available
46
+ if (self.config.use_azure_openai and
47
+ self.config.azure_openai_endpoint and
48
+ self.config.azure_openai_api_key and
49
+ self.config.azure_openai_deployment_name):
50
+
51
+ try:
52
+ # Use the UnifiedLLMProcessor for Azure
53
+ from .unified_llm_processor import create_llm_processor_from_config
54
+ return create_llm_processor_from_config(self.config)
55
+ except Exception as e:
56
+ logging.warning(f"Azure OpenAI setup failed, falling back to local: {e}")
57
+
58
+ # Priority 2: Check if local model file exists
59
+ if os.path.exists(self.config.model_path):
60
+ try:
61
+ return self._load_local_model()
62
+ except Exception as e:
63
+ logging.warning(f"Local model loading failed: {e}")
64
+
65
+ # Priority 3: Check for other local providers (Ollama, LM Studio)
66
+ if self._check_ollama_available():
67
+ try:
68
+ from .unified_llm_processor import LLMConfig, UnifiedLLMProcessor
69
+ llm_config = LLMConfig(
70
+ provider="ollama",
71
+ model="llama2", # Default model
72
+ temperature=0.2,
73
+ max_tokens=self.config.max_tokens,
74
+ timeout=30,
75
+ max_retries=self.config.max_retries
76
+ )
77
+ return UnifiedLLMProcessor(llm_config)
78
+ except Exception as e:
79
+ logging.warning(f"Ollama setup failed: {e}")
80
+
81
+ # Priority 4: Check for LM Studio
82
+ if self._check_lmstudio_available():
83
+ try:
84
+ from .unified_llm_processor import LLMConfig, UnifiedLLMProcessor
85
+ llm_config = LLMConfig(
86
+ provider="lmstudio",
87
+ model="local-model", # Default model
88
+ temperature=0.2,
89
+ max_tokens=self.config.max_tokens,
90
+ timeout=30,
91
+ max_retries=self.config.max_retries
92
+ )
93
+ return UnifiedLLMProcessor(llm_config)
94
+ except Exception as e:
95
+ logging.warning(f"LM Studio setup failed: {e}")
96
+
97
+ # If all else fails, raise a clear error
98
+ raise RuntimeError(
99
+ "No LLM provider available. Please configure one of:\n"
100
+ "1. Azure OpenAI (set use_azure_openai=True and credentials)\n"
101
+ "2. Local model file (set model_path to existing .gguf file)\n"
102
+ "3. Ollama (install and run ollama serve)\n"
103
+ "4. LM Studio (install and run LM Studio server)"
104
+ )
105
+
106
+ def _load_local_model(self):
107
+ """Load local llama.cpp model"""
108
+ return Llama( # type: ignore
109
+ model_path=self.config.model_path,
110
+ n_ctx=4096, # Larger context for L4's 24GB VRAM
111
+ n_batch=1024, # Larger batch size for better throughput
112
+ # Load ALL layers on GPU (L4 has plenty VRAM)
113
+ n_gpu_layers=-1,
114
+ n_threads=4, # Match the 4 vCPUs
115
+ n_threads_batch=4, # Parallel batch processing
116
+ use_mmap=True, # Memory-mapped files for efficiency
117
+ use_mlock=True, # Lock model in memory
118
+ rope_scaling_type=1, # RoPE scaling for longer contexts
119
+ rope_freq_base=10000.0, # Base frequency for RoPE
120
+ flash_attn=True, # Enable flash attention if available
121
+ verbose=False, # Reduce logging overhead
122
+ )
123
+
124
+ def _check_ollama_available(self):
125
+ """Check if Ollama is available"""
126
+ try:
127
+ import requests
128
+ response = requests.get("http://localhost:11434/api/tags", timeout=5)
129
+ return response.status_code == 200
130
+ except:
131
+ return False
132
+
133
+ def _check_lmstudio_available(self):
134
+ """Check if LM Studio is available"""
135
+ try:
136
+ import requests
137
+ response = requests.get("http://localhost:1234/v1/models", timeout=5)
138
+ return response.status_code == 200
139
+ except:
140
+ return False
141
+
142
+ def estimate_tokens(self, text: str) -> int:
143
+ return len(self.tokenizer.encode(text))
144
+
145
+ def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
146
+ """Truncate content to fit within token limit"""
147
+ paragraphs = content.split("\n\n")
148
+ result, current_tokens = "", 0
149
+
150
+ for para in paragraphs:
151
+ tokens = len(self.tokenizer.encode(para))
152
+ if current_tokens + tokens <= max_tokens:
153
+ result += para + "\n\n"
154
+ current_tokens += tokens
155
+ else:
156
+ break
157
+ return result.strip()
158
+
159
+ def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
160
+ """Intelligently truncate content to preserve the most important parts"""
161
+ if not content:
162
+ return ""
163
+
164
+ # If content is short enough, return it all
165
+ if len(self.tokenizer.encode(content)) <= max_tokens:
166
+ return content
167
+
168
+ # Split into sections based on markdown headers
169
+ sections: list[Section] = []
170
+ current_section: Section = {
171
+ "heading": "Introduction",
172
+ "content": "",
173
+ "priority": 10,
174
+ }
175
+
176
+ for line in content.splitlines():
177
+ if re.match(r"^#+\s+", line): # It's a header
178
+ # Save previous section if not empty
179
+ if current_section["content"].strip():
180
+ sections.append(current_section)
181
+
182
+ # Create new section with appropriate priority
183
+ heading = re.sub(r"^#+\s+", "", line)
184
+ priority = 5 # Default priority
185
+
186
+ # Assign priority based on content type
187
+ if re.search(r"\b(Union[usage, example]|getting started)\b", heading, re.I):
188
+ priority = 10
189
+ elif re.search(r"\b(Union[feature, overview]|about)\b", heading, re.I):
190
+ priority = 9
191
+ elif re.search(r"\b(Union[install, setup]|config)\b", heading, re.I):
192
+ priority = 8
193
+ elif re.search(r"\b(Union[api, interface])\b", heading, re.I):
194
+ priority = 7
195
+
196
+ current_section = {
197
+ "heading": heading,
198
+ "content": line + "\n",
199
+ "priority": priority,
200
+ }
201
+ else:
202
+ current_section["content"] += line + "\n"
203
+
204
+ # Boost priority if code block is found
205
+ if "```rust" in line or "```no_run" in line:
206
+ current_section["priority"] = max(current_section["priority"], 8)
207
+
208
+ # Add the last section
209
+ if current_section["content"].strip():
210
+ sections.append(current_section)
211
+
212
+ # Sort sections by priority (highest first)
213
+ sections.sort(key=lambda x: x["priority"], reverse=True)
214
+
215
+ # Build the result, respecting token limits
216
+ result = ""
217
+ tokens_used = 0
218
+
219
+ for section in sections:
220
+ section_text = f'## {section["heading"]}\n{section["content"]}\n'
221
+ section_tokens = len(self.tokenizer.encode(section_text))
222
+
223
+ if tokens_used + section_tokens <= max_tokens:
224
+ result += section_text
225
+ tokens_used += section_tokens
226
+ elif tokens_used < max_tokens - 100: # If we can fit a truncated version
227
+ # Take what we can
228
+ remaining_tokens = max_tokens - tokens_used
229
+ truncated_text = self.tokenizer.decode(
230
+ self.tokenizer.encode(section_text)[:remaining_tokens]
231
+ )
232
+ result += truncated_text
233
+ break
234
+
235
+ return result
236
+
237
+ def clean_output(self, output: str, task: str = "general") -> str:
238
+ """Task-specific output cleaning"""
239
+ if not output:
240
+ return ""
241
+
242
+ # Remove any remaining prompt artifacts
243
+ output = output.split("<|end|>")[0].strip()
244
+
245
+ if task == "classification":
246
+ # For classification tasks, extract just the category
247
+ categories = [
248
+ "AI",
249
+ "Database",
250
+ "Web Framework",
251
+ "Networking",
252
+ "Serialization",
253
+ "Utilities",
254
+ "DevTools",
255
+ "ML",
256
+ "Cryptography",
257
+ "Unknown",
258
+ ]
259
+ for category in categories:
260
+ if re.search(
261
+ r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
262
+ ):
263
+ return category
264
+ return "Unknown"
265
+
266
+ elif task == "factual_pairs":
267
+ # For factual pairs, ensure proper formatting
268
+ pairs: list[str] = []
269
+ facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
270
+ counterfacts = re.findall(
271
+ r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
272
+ )
273
+
274
+ # Pair them up
275
+ for i in range(min(len(facts), len(counterfacts))):
276
+ pairs.append(
277
+ f"✅ Factual: {facts[i].strip()}\n"
278
+ f"❌ Counterfactual: {counterfacts[i].strip()}"
279
+ )
280
+
281
+ return "\n\n".join(pairs)
282
+
283
+ else:
284
+ # General cleaning - more permissive than before
285
+ lines = [line.strip() for line in output.splitlines() if line.strip()]
286
+ return "\n".join(lines)
287
+
288
+ def run_llama(
289
+ self, prompt: str, temp: float = 0.2, max_tokens: int = 256
290
+ ) -> Union[str, None]:
291
+ """Run the LLM with customizable parameters per task"""
292
+ try:
293
+ token_count = self.estimate_tokens(prompt)
294
+ if token_count > self.config.prompt_token_margin:
295
+ logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
296
+ prompt = self.truncate_content(
297
+ prompt, self.config.prompt_token_margin - 100
298
+ )
299
+
300
+ # Handle different model types
301
+ from .unified_llm_processor import UnifiedLLMProcessor
302
+ if isinstance(self.model, UnifiedLLMProcessor):
303
+ # UnifiedLLMProcessor
304
+ return self.model.call_llm(prompt, temp, max_tokens)
305
+ else:
306
+ # Local Llama model
307
+ output = self.model(
308
+ prompt,
309
+ max_tokens=max_tokens,
310
+ temperature=temp,
311
+ # Stop at these tokens
312
+ stop=["<|end|>", "<|user|>", "<|system|>"],
313
+ )
314
+
315
+ raw_text: str = output["choices"][0]["text"] # type: ignore
316
+ return self.clean_output(raw_text)
317
+ except Exception as e:
318
+ logging.error(f"Model inference failed: {str(e)}")
319
+ raise
320
+
321
+ def validate_and_retry(
322
+ self,
323
+ prompt: str,
324
+ validation_func: Callable[[str], bool],
325
+ temp: float = 0.2,
326
+ max_tokens: int = 256,
327
+ retries: int = 4, # Increased from 2 to 4 for better success rates
328
+ ) -> Union[str, None]:
329
+ """Run LLM with validation and automatic retry on failure"""
330
+ result = None
331
+ for attempt in range(retries):
332
+ try:
333
+ # More generous temperature adjustment for better variety
334
+ # 20% increases instead of 10%
335
+ adjusted_temp = temp * (1 + (attempt * 0.2))
336
+ result = self.run_llama(
337
+ prompt, temp=adjusted_temp, max_tokens=max_tokens
338
+ )
339
+
340
+ # Validate the result
341
+ if result and validation_func(result):
342
+ return result
343
+
344
+ # If we get here, validation failed - use debug level for early
345
+ # attempts
346
+ if attempt == retries - 1:
347
+ logging.debug(
348
+ f"All {retries} validation attempts failed, "
349
+ "using last available result."
350
+ )
351
+ else:
352
+ logging.debug(
353
+ f"Validation failed on attempt {attempt + 1}/{retries}. "
354
+ f"Retrying with adjusted temp={adjusted_temp:.2f}"
355
+ )
356
+
357
+ # Only simplify prompt on later attempts (attempt 2+)
358
+ if attempt >= 2:
359
+ prompt = self.simplify_prompt(prompt)
360
+
361
+ except Exception as e:
362
+ logging.error(f"Generation error on attempt {attempt + 1}: {str(e)}")
363
+
364
+ # More generous backoff - give the model more time
365
+ time.sleep(2.0 + (attempt * 1.0)) # 2s, 3s, 4s, 5s delays
366
+
367
+ # If we exhausted all retries, return the last result even if not
368
+ # perfect
369
+ return result if "result" in locals() else None
370
+
371
+ def simplify_prompt(self, prompt: str) -> str:
372
+ """Simplify a prompt by removing examples and reducing context"""
373
+ # Remove few-shot examples
374
+ prompt = re.sub(
375
+ r"# Example [0-9].*?(?=# Crate to Classify|\Z)",
376
+ "",
377
+ prompt,
378
+ flags=re.DOTALL,
379
+ )
380
+
381
+ # Make instructions more direct
382
+ prompt = re.sub(
383
+ r"<\|system\|>.*?<\|user\|>",
384
+ "<|system|>Be concise.\n<|user|>",
385
+ prompt,
386
+ flags=re.DOTALL,
387
+ )
388
+
389
+ return prompt
390
+
391
+ def validate_classification(self, result: str) -> bool:
392
+ """Ensure a valid category was returned"""
393
+ if not result:
394
+ return False
395
+ valid_categories = [
396
+ "AI",
397
+ "Database",
398
+ "Web Framework",
399
+ "Networking",
400
+ "Serialization",
401
+ "Utilities",
402
+ "DevTools",
403
+ "ML",
404
+ "Cryptography",
405
+ "Unknown",
406
+ ]
407
+ return any(
408
+ category.lower() == result.strip().lower() for category in valid_categories
409
+ )
410
+
411
+ def validate_factual_pairs(self, result: str) -> bool:
412
+ """Ensure exactly 5 factual/counterfactual pairs exist"""
413
+ if not result:
414
+ return False
415
+
416
+ facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", result, re.DOTALL)
417
+ counterfacts = re.findall(
418
+ r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", result, re.DOTALL
419
+ )
420
+
421
+ return len(facts) >= 3 and len(counterfacts) >= 3 # At least 3 pairs
422
+
423
+ def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
424
+ """Apply all AI enrichments to a crate"""
425
+ # Convert CrateMetadata to EnrichedCrate
426
+ enriched_dict = crate.__dict__.copy()
427
+ enriched = EnrichedCrate(**enriched_dict)
428
+
429
+ try:
430
+ # Generate README summary first
431
+ if crate.readme:
432
+ readme_content = self.smart_truncate(crate.readme, 2000)
433
+ prompt = (
434
+ "<|system|>Extract key features from README.\n"
435
+ "<|user|>Summarize key aspects of this Rust crate from its "
436
+ f"README:\n{readme_content}\n"
437
+ "<|end|>"
438
+ )
439
+ enriched.readme_summary = self.validate_and_retry(
440
+ prompt, lambda x: len(x) > 50, temp=0.3, max_tokens=300
441
+ )
442
+
443
+ # Generate other enrichments
444
+ enriched.feature_summary = self.summarize_features(crate)
445
+ enriched.use_case = self.classify_use_case(
446
+ crate, enriched.readme_summary or ""
447
+ )
448
+ enriched.score = self.score_crate(crate)
449
+ enriched.factual_counterfactual = self.generate_factual_pairs(crate)
450
+
451
+ return enriched
452
+ except Exception as e:
453
+ logging.error(f"Failed to enrich {crate.name}: {str(e)}")
454
+ return enriched
455
+
456
+ def summarize_features(self, crate: CrateMetadata) -> str:
457
+ """Generate summaries for crate features with better prompting"""
458
+ try:
459
+ if not crate.features:
460
+ return "No features documented for this crate."
461
+
462
+ # Handle both dict and list feature formats
463
+ feature_text = ""
464
+ if isinstance(crate.features, dict):
465
+ # Format features with their dependencies
466
+ for feature_name, deps in list(crate.features.items())[:8]:
467
+ deps_str = ", ".join(deps) if deps else "none"
468
+ feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
469
+ elif isinstance(crate.features, list):
470
+ # Handle list format - assume each item is a feature name
471
+ for feature in crate.features[:8]:
472
+ if isinstance(feature, str):
473
+ feature_text += f"- {feature} (dependencies: none)\n"
474
+ elif isinstance(feature, dict):
475
+ # If feature is a dict, try to extract name and deps
476
+ feature_name = feature.get('name', str(feature))
477
+ deps = feature.get('dependencies', [])
478
+ deps_str = ", ".join(deps) if deps else "none"
479
+ feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
480
+ else:
481
+ feature_text += f"- {str(feature)} (dependencies: none)\n"
482
+ else:
483
+ return "Features format not recognized."
484
+
485
+ prompt = (
486
+ "<|system|>You are a Rust programming expert analyzing crate "
487
+ "features.\n"
488
+ f"<|user|>For the Rust crate `{crate.name}`, explain these "
489
+ "features and what functionality they provide:\n\n"
490
+ f"{feature_text}\n\n"
491
+ "Provide a concise explanation of each feature's purpose and "
492
+ "when a developer would enable it.\n"
493
+ "<|end|>"
494
+ )
495
+
496
+ # Use moderate temperature for informative but natural explanation
497
+ result = self.run_llama(prompt, temp=0.2, max_tokens=350)
498
+ return result or "Feature summary not available."
499
+ except Exception as e:
500
+ logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
501
+ return "Feature summary not available."
502
+
503
+ def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
504
+ """Classify the use case of a crate with rich context"""
505
+ try:
506
+ # Calculate available tokens for prompt
507
+ available_prompt_tokens = self.config.model_token_limit - 200
508
+
509
+ joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
510
+ key_deps = [
511
+ dep.get("crate_id")
512
+ for dep in crate.dependencies[:5]
513
+ if dep.get("kind") == "normal" and dep.get("crate_id")
514
+ ]
515
+ key_deps_str = (
516
+ ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
517
+ )
518
+
519
+ # Adaptively truncate different sections based on importance
520
+ token_budget = available_prompt_tokens - 400
521
+
522
+ # Allocate different percentages to each section
523
+ desc_tokens = int(token_budget * 0.2)
524
+ readme_tokens = int(token_budget * 0.6)
525
+
526
+ desc = self.truncate_content(crate.description, desc_tokens)
527
+ readme_summary = self.smart_truncate(readme_summary, readme_tokens)
528
+
529
+ # Few-shot prompting with examples
530
+ prompt = (
531
+ "<|system|>You are a Rust expert classifying crates into the "
532
+ "most appropriate category.\n"
533
+ "<|user|>\n"
534
+ "# Example 1\n"
535
+ "Crate: `tokio`\n"
536
+ "Description: An asynchronous runtime for the Rust programming "
537
+ "language\n"
538
+ "Keywords: async, runtime, futures\n"
539
+ "Key Dependencies: mio, bytes, parking_lot\n"
540
+ "Category: Networking\n\n"
541
+ "# Example 2\n"
542
+ "Crate: `serde`\n"
543
+ "Description: A generic serialization/deserialization framework\n"
544
+ "Keywords: serde, serialization\n"
545
+ "Key Dependencies: serde_derive\n"
546
+ "Category: Serialization\n\n"
547
+ "# Crate to Classify\n"
548
+ f"Crate: `{crate.name}`\n"
549
+ f"Description: {desc}\n"
550
+ f"Keywords: {joined}\n"
551
+ f"README Summary: {readme_summary}\n"
552
+ f"Key Dependencies: {key_deps_str}\n\n"
553
+ "Category (pick only one): [AI, Database, Web Framework, "
554
+ "Networking, Serialization, Utilities, DevTools, ML, "
555
+ "Cryptography, Unknown]\n"
556
+ "<|end|>"
557
+ )
558
+ # Validate classification with retry - more generous parameters
559
+ result = self.validate_and_retry(
560
+ prompt,
561
+ validation_func=self.validate_classification,
562
+ temp=0.2,
563
+ max_tokens=50,
564
+ )
565
+
566
+ return result or "Unknown"
567
+ except Exception as e:
568
+ logging.error(f"Classification failed for {crate.name}: {str(e)}")
569
+ return "Unknown"
570
+
571
+ def generate_factual_pairs(self, crate: CrateMetadata) -> str:
572
+ """Generate factual/counterfactual pairs with retry and validation"""
573
+ try:
574
+ desc = self.truncate_content(crate.description, 300)
575
+ readme_summary = self.truncate_content(
576
+ getattr(crate, "readme_summary", "") or "", 300
577
+ )
578
+
579
+ # Handle both dict and list feature formats
580
+ if isinstance(crate.features, dict):
581
+ features = ", ".join(list(crate.features.keys())[:5])
582
+ elif isinstance(crate.features, list):
583
+ feature_names = []
584
+ for feature in crate.features[:5]:
585
+ if isinstance(feature, str):
586
+ feature_names.append(feature)
587
+ elif isinstance(feature, dict):
588
+ feature_name = feature.get('name', str(feature))
589
+ feature_names.append(feature_name)
590
+ else:
591
+ feature_names.append(str(feature))
592
+ features = ", ".join(feature_names)
593
+ else:
594
+ features = ""
595
+
596
+ prompt = (
597
+ "<|system|>Create exactly 5 factual/counterfactual pairs for "
598
+ "the Rust crate. Factual statements must be true. "
599
+ "Counterfactuals should be plausible but incorrect - make them "
600
+ "subtle and convincing rather than simple negations.\n"
601
+ "<|user|>\n"
602
+ f"Crate: {crate.name}\n"
603
+ f"Description: {desc}\n"
604
+ f"Repo: {crate.repository}\n"
605
+ f"README Summary: {readme_summary}\n"
606
+ f"Key Features: {features}\n\n"
607
+ "Format each pair as:\n"
608
+ "✅ Factual: [true statement about the crate]\n"
609
+ "❌ Counterfactual: [plausible but false statement]\n\n"
610
+ "Create exactly 5 pairs.\n"
611
+ "<|end|>"
612
+ )
613
+ # Use validation for retry - more generous parameters
614
+ result = self.validate_and_retry(
615
+ prompt,
616
+ validation_func=self.validate_factual_pairs,
617
+ temp=0.7,
618
+ max_tokens=800,
619
+ )
620
+
621
+ return result or "Factual pairs generation failed."
622
+ except Exception as e:
623
+ logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
624
+ return "Factual pairs generation failed."
625
+
626
+ def score_crate(self, crate: CrateMetadata) -> float:
627
+ """Calculate a score for the crate based on various metrics"""
628
+ score = (crate.downloads / 1000) + (crate.github_stars * 10)
629
+ score += len(self.truncate_content(crate.readme, 1000)) / 500
630
+ return round(score, 2)
631
+
632
+ def batch_process_prompts(
633
+ self, prompts: list[tuple[str, float, int]], batch_size: int = 4
634
+ ) -> list[Union[str, None]]:
635
+ """
636
+ L4 GPU-optimized batch processing for multiple prompts.
637
+ Processes prompts in batches to maximize GPU utilization.
638
+
639
+ Args:
640
+ prompts: List of (prompt, temperature, max_tokens) tuples
641
+ batch_size: Number of prompts to process simultaneously
642
+ """
643
+ results: list[Union[str, None]] = []
644
+
645
+ # Process in batches optimized for L4's capabilities
646
+ for i in range(0, len(prompts), batch_size):
647
+ batch = prompts[i : i + batch_size]
648
+ batch_results: list[Union[str, None]] = []
649
+
650
+ for prompt, temp, max_tokens in batch:
651
+ try:
652
+ # Prepare prompt with context preservation
653
+ if self.estimate_tokens(prompt) > 3500:
654
+ prompt = self.smart_truncate(prompt, 3500)
655
+
656
+ # Use optimized parameters for L4
657
+ output = self.model(
658
+ prompt,
659
+ max_tokens=max_tokens,
660
+ temperature=temp,
661
+ top_p=0.95,
662
+ repeat_penalty=1.1,
663
+ stop=["<|end|>", "<|user|>", "<|system|>"],
664
+ echo=False,
665
+ stream=False,
666
+ )
667
+
668
+ # The type checker incorrectly infers a stream response
669
+ choice_text: str = output["choices"][0]["text"] # type: ignore
670
+ result = self.clean_output(choice_text)
671
+ batch_results.append(result)
672
+ except Exception as e:
673
+ logging.error(f"LLM batch processing error: {e}", exc_info=True)
674
+ batch_results.append(None)
675
+
676
+ results.extend(batch_results)
677
+
678
+ return results
679
+
680
+ def smart_context_management(
681
+ self, context_history: list[str], new_prompt: str
682
+ ) -> str:
683
+ """
684
+ Intelligent context management for prefix cache optimization.
685
+ Maximizes cache hits by preserving common context patterns.
686
+ """
687
+ # Calculate available tokens for context
688
+ base_tokens = self.estimate_tokens(new_prompt)
689
+ available_context = 4000 - base_tokens # Leave buffer for response
690
+
691
+ if available_context <= 0:
692
+ return new_prompt
693
+
694
+ # Build context from most recent and most relevant history
695
+ context_parts: list[str] = []
696
+ tokens_used = 0
697
+
698
+ # Prioritize recent context (better cache hits)
699
+ for context in reversed(context_history[-5:]): # Last 5 contexts
700
+ context_tokens = self.estimate_tokens(context)
701
+ if tokens_used + context_tokens <= available_context:
702
+ context_parts.insert(0, context)
703
+ tokens_used += context_tokens
704
+ else:
705
+ # Try to fit truncated version
706
+ remaining_tokens = available_context - tokens_used
707
+ if remaining_tokens > 100: # Only if meaningful space left
708
+ truncated = self.smart_truncate(context, remaining_tokens)
709
+ if truncated:
710
+ context_parts.insert(0, truncated)
711
+ break
712
+
713
+ # Combine context with new prompt
714
+ if context_parts:
715
+ full_context = "\n\n---\n\n".join(context_parts)
716
+ return f"{full_context}\n\n---\n\n{new_prompt}"
717
+
718
+ return new_prompt
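
The functional core of this change is that `LLMEnricher` no longer loads a local llama.cpp model unconditionally: `_auto_detect_and_load_model` tries Azure OpenAI first, then a local GGUF file at `config.model_path`, then an Ollama server, then LM Studio, and raises `RuntimeError` if none is reachable. Below is a minimal sketch of exercising that fallback chain; it is not taken from the package's documentation, and it assumes `PipelineConfig` is a dataclass whose constructor accepts the field names read in the diff (`use_azure_openai`, `model_path`, ...) and supplies defaults for the rest.

```python
# Sketch only: drives the 1.4.1 provider fallback in LLMEnricher.
# PipelineConfig's constructor signature is an assumption; the field names
# are the ones the new code reads from the config object.
from rust_crate_pipeline.config import PipelineConfig
from rust_crate_pipeline.ai_processing import LLMEnricher

config = PipelineConfig(
    use_azure_openai=False,                        # skip priority 1 (Azure OpenAI)
    model_path="./models/llama-3-8b.Q4_K_M.gguf",  # priority 2: local llama.cpp model
)

# If the .gguf file is absent, the enricher probes Ollama (localhost:11434)
# and then LM Studio (localhost:1234); with no provider available it raises
# RuntimeError, and it still raises ImportError when tiktoken/llama_cpp are
# not installed.
enricher = LLMEnricher(config)
print(enricher.run_llama("Summarize the serde crate in one sentence.",
                         temp=0.2, max_tokens=64))
```

The same auto-detected model object backs `enrich_crate`, which produces the README summary, feature summary, use-case classification, score, and factual/counterfactual pairs for each crate.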