rust-crate-pipeline 1.3.6-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,598 +1,718 @@
1
- # ai_processing.py
2
- import re
3
- import time
4
- import logging
5
- from typing import TypedDict, Union
6
-
7
- from collections.abc import Callable
8
-
9
- from .config import PipelineConfig, CrateMetadata, EnrichedCrate
10
-
11
- # Optional imports with fallbacks
12
- _ai_dependencies_available = True
13
- try:
14
- import tiktoken
15
- from llama_cpp import Llama
16
- except ImportError as e:
17
- logging.warning(f"AI dependencies not available: {e}")
18
- tiktoken = None # type: ignore[assignment]
19
- Llama = None # type: ignore[assignment,misc]
20
- _ai_dependencies_available = False
21
-
22
-
23
- class Section(TypedDict):
24
- heading: str
25
- content: str
26
- priority: int
27
-
28
-
29
- class LLMEnricher:
30
- def __init__(self, config: PipelineConfig) -> None:
31
- if not _ai_dependencies_available:
32
- raise ImportError(
33
- "AI dependencies (tiktoken, llama_cpp) are not available. "
34
- "Please install them to use LLMEnricher."
35
- )
36
-
37
- self.config = config
38
- self.tokenizer = tiktoken.get_encoding("cl100k_base") # type: ignore
39
- self.model = self._load_model()
40
-
41
- def _load_model(self) -> "Llama":
42
- """Optimized for GCP g2-standard-4 with L4 GPU (24GB VRAM)"""
43
- if not _ai_dependencies_available:
44
- raise ImportError("Cannot load model: AI dependencies not available")
45
-
46
- return Llama( # type: ignore
47
- model_path=self.config.model_path,
48
- n_ctx=4096, # Larger context for L4's 24GB VRAM
49
- n_batch=1024, # Larger batch size for better throughput
50
- # Load ALL layers on GPU (L4 has plenty VRAM)
51
- n_gpu_layers=-1,
52
- n_threads=4, # Match the 4 vCPUs
53
- n_threads_batch=4, # Parallel batch processing
54
- use_mmap=True, # Memory-mapped files for efficiency
55
- use_mlock=True, # Lock model in memory
56
- rope_scaling_type=1, # RoPE scaling for longer contexts
57
- rope_freq_base=10000.0, # Base frequency for RoPE
58
- flash_attn=True, # Enable flash attention if available
59
- verbose=False, # Reduce logging overhead
60
- )
61
-
62
- def estimate_tokens(self, text: str) -> int:
63
- return len(self.tokenizer.encode(text))
64
-
65
- def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
66
- """Truncate content to fit within token limit"""
67
- paragraphs = content.split("\n\n")
68
- result, current_tokens = "", 0
69
-
70
- for para in paragraphs:
71
- tokens = len(self.tokenizer.encode(para))
72
- if current_tokens + tokens <= max_tokens:
73
- result += para + "\n\n"
74
- current_tokens += tokens
75
- else:
76
- break
77
- return result.strip()
78
-
79
- def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
80
- """Intelligently truncate content to preserve the most important parts"""
81
- if not content:
82
- return ""
83
-
84
- # If content is short enough, return it all
85
- if len(self.tokenizer.encode(content)) <= max_tokens:
86
- return content
87
-
88
- # Split into sections based on markdown headers
89
- sections: list[Section] = []
90
- current_section: Section = {
91
- "heading": "Introduction",
92
- "content": "",
93
- "priority": 10,
94
- }
95
-
96
- for line in content.splitlines():
97
- if re.match(r"^#+\s+", line): # It's a header
98
- # Save previous section if not empty
99
- if current_section["content"].strip():
100
- sections.append(current_section)
101
-
102
- # Create new section with appropriate priority
103
- heading = re.sub(r"^#+\s+", "", line)
104
- priority = 5 # Default priority
105
-
106
- # Assign priority based on content type
107
- if re.search(r"\b(usage|example|getting started)\b", heading, re.I):
108
- priority = 10
109
- elif re.search(r"\b(feature|overview|about)\b", heading, re.I):
110
- priority = 9
111
- elif re.search(r"\b(install|setup|config)\b", heading, re.I):
112
- priority = 8
113
- elif re.search(r"\b(api|interface)\b", heading, re.I):
114
- priority = 7
115
-
116
- current_section = {
117
- "heading": heading,
118
- "content": line + "\n",
119
- "priority": priority,
120
- }
121
- else:
122
- current_section["content"] += line + "\n"
123
-
124
- # Boost priority if code block is found
125
- if "```rust" in line or "```no_run" in line:
126
- current_section["priority"] = max(current_section["priority"], 8)
127
-
128
- # Add the last section
129
- if current_section["content"].strip():
130
- sections.append(current_section)
131
-
132
- # Sort sections by priority (highest first)
133
- sections.sort(key=lambda x: x["priority"], reverse=True)
134
-
135
- # Build the result, respecting token limits
136
- result = ""
137
- tokens_used = 0
138
-
139
- for section in sections:
140
- section_text = f'## {section["heading"]}\n{section["content"]}\n'
141
- section_tokens = len(self.tokenizer.encode(section_text))
142
-
143
- if tokens_used + section_tokens <= max_tokens:
144
- result += section_text
145
- tokens_used += section_tokens
146
- elif tokens_used < max_tokens - 100: # If we can fit a truncated version
147
- # Take what we can
148
- remaining_tokens = max_tokens - tokens_used
149
- truncated_text = self.tokenizer.decode(
150
- self.tokenizer.encode(section_text)[:remaining_tokens]
151
- )
152
- result += truncated_text
153
- break
154
-
155
- return result
156
-
157
- def clean_output(self, output: str, task: str = "general") -> str:
158
- """Task-specific output cleaning"""
159
- if not output:
160
- return ""
161
-
162
- # Remove any remaining prompt artifacts
163
- output = output.split("<|end|>")[0].strip()
164
-
165
- if task == "classification":
166
- # For classification tasks, extract just the category
167
- categories = [
168
- "AI",
169
- "Database",
170
- "Web Framework",
171
- "Networking",
172
- "Serialization",
173
- "Utilities",
174
- "DevTools",
175
- "ML",
176
- "Cryptography",
177
- "Unknown",
178
- ]
179
- for category in categories:
180
- if re.search(
181
- r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
182
- ):
183
- return category
184
- return "Unknown"
185
-
186
- elif task == "factual_pairs":
187
- # For factual pairs, ensure proper formatting
188
- pairs: list[str] = []
189
- facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
190
- counterfacts = re.findall(
191
- r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
192
- )
193
-
194
- # Pair them up
195
- for i in range(min(len(facts), len(counterfacts))):
196
- pairs.append(
197
- f"✅ Factual: {facts[i].strip()}\n"
198
- f"❌ Counterfactual: {counterfacts[i].strip()}"
199
- )
200
-
201
- return "\n\n".join(pairs)
202
-
203
- else:
204
- # General cleaning - more permissive than before
205
- lines = [line.strip() for line in output.splitlines() if line.strip()]
206
- return "\n".join(lines)
207
-
208
- def run_llama(
209
- self, prompt: str, temp: float = 0.2, max_tokens: int = 256
210
- ) -> Union[str, None]:
211
- """Run the LLM with customizable parameters per task"""
212
- try:
213
- token_count = self.estimate_tokens(prompt)
214
- if token_count > self.config.prompt_token_margin:
215
- logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
216
- prompt = self.truncate_content(
217
- prompt, self.config.prompt_token_margin - 100
218
- )
219
-
220
- output = self.model(
221
- prompt,
222
- max_tokens=max_tokens,
223
- temperature=temp,
224
- # Stop at these tokens
225
- stop=["<|end|>", "<|user|>", "<|system|>"],
226
- )
227
-
228
- raw_text: str = output["choices"][0]["text"] # type: ignore
229
- return self.clean_output(raw_text)
230
- except Exception as e:
231
- logging.error(f"Model inference failed: {str(e)}")
232
- raise
233
-
234
- def validate_and_retry(
235
- self,
236
- prompt: str,
237
- validation_func: Callable[[str], bool],
238
- temp: float = 0.2,
239
- max_tokens: int = 256,
240
- retries: int = 4, # Increased from 2 to 4 for better success rates
241
- ) -> Union[str, None]:
242
- """Run LLM with validation and automatic retry on failure"""
243
- result = None
244
- for attempt in range(retries):
245
- try:
246
- # More generous temperature adjustment for better variety
247
- # 20% increases instead of 10%
248
- adjusted_temp = temp * (1 + (attempt * 0.2))
249
- result = self.run_llama(
250
- prompt, temp=adjusted_temp, max_tokens=max_tokens
251
- )
252
-
253
- # Validate the result
254
- if result and validation_func(result):
255
- return result
256
-
257
- # If we get here, validation failed - use debug level for early
258
- # attempts
259
- if attempt == retries - 1:
260
- logging.debug(
261
- f"All {retries} validation attempts failed, "
262
- "using last available result."
263
- )
264
- else:
265
- logging.debug(
266
- f"Validation failed on attempt {attempt + 1}/{retries}. "
267
- f"Retrying with adjusted temp={adjusted_temp:.2f}"
268
- )
269
-
270
- # Only simplify prompt on later attempts (attempt 2+)
271
- if attempt >= 2:
272
- prompt = self.simplify_prompt(prompt)
273
-
274
- except Exception as e:
275
- logging.error(f"Generation error on attempt {attempt + 1}: {str(e)}")
276
-
277
- # More generous backoff - give the model more time
278
- time.sleep(2.0 + (attempt * 1.0)) # 2s, 3s, 4s, 5s delays
279
-
280
- # If we exhausted all retries, return the last result even if not
281
- # perfect
282
- return result if "result" in locals() else None
283
-
284
- def simplify_prompt(self, prompt: str) -> str:
285
- """Simplify a prompt by removing examples and reducing context"""
286
- # Remove few-shot examples
287
- prompt = re.sub(
288
- r"# Example [0-9].*?(?=# Crate to Classify|\Z)",
289
- "",
290
- prompt,
291
- flags=re.DOTALL,
292
- )
293
-
294
- # Make instructions more direct
295
- prompt = re.sub(
296
- r"<\|system\|>.*?<\|user\|>",
297
- "<|system|>Be concise.\n<|user|>",
298
- prompt,
299
- flags=re.DOTALL,
300
- )
301
-
302
- return prompt
303
-
304
- def validate_classification(self, result: str) -> bool:
305
- """Ensure a valid category was returned"""
306
- if not result:
307
- return False
308
- valid_categories = [
309
- "AI",
310
- "Database",
311
- "Web Framework",
312
- "Networking",
313
- "Serialization",
314
- "Utilities",
315
- "DevTools",
316
- "ML",
317
- "Cryptography",
318
- "Unknown",
319
- ]
320
- return any(
321
- category.lower() == result.strip().lower() for category in valid_categories
322
- )
323
-
324
- def validate_factual_pairs(self, result: str) -> bool:
325
- """Ensure at least 3 factual/counterfactual pairs exist"""
326
- if not result:
327
- return False
328
-
329
- facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", result, re.DOTALL)
330
- counterfacts = re.findall(
331
- r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", result, re.DOTALL
332
- )
333
-
334
- return len(facts) >= 3 and len(counterfacts) >= 3 # At least 3 pairs
335
-
336
- def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
337
- """Apply all AI enrichments to a crate"""
338
- # Convert CrateMetadata to EnrichedCrate
339
- enriched_dict = crate.__dict__.copy()
340
- enriched = EnrichedCrate(**enriched_dict)
341
-
342
- try:
343
- # Generate README summary first
344
- if crate.readme:
345
- readme_content = self.smart_truncate(crate.readme, 2000)
346
- prompt = (
347
- "<|system|>Extract key features from README.\n"
348
- "<|user|>Summarize key aspects of this Rust crate from its "
349
- f"README:\n{readme_content}\n"
350
- "<|end|>"
351
- )
352
- enriched.readme_summary = self.validate_and_retry(
353
- prompt, lambda x: len(x) > 50, temp=0.3, max_tokens=300
354
- )
355
-
356
- # Generate other enrichments
357
- enriched.feature_summary = self.summarize_features(crate)
358
- enriched.use_case = self.classify_use_case(
359
- crate, enriched.readme_summary or ""
360
- )
361
- enriched.score = self.score_crate(crate)
362
- enriched.factual_counterfactual = self.generate_factual_pairs(crate)
363
-
364
- return enriched
365
- except Exception as e:
366
- logging.error(f"Failed to enrich {crate.name}: {str(e)}")
367
- return enriched
368
-
369
- def summarize_features(self, crate: CrateMetadata) -> str:
370
- """Generate summaries for crate features with better prompting"""
371
- try:
372
- if not crate.features:
373
- return "No features documented for this crate."
374
-
375
- # Format features with their dependencies
376
- feature_text = ""
377
- for feature_name, deps in list(crate.features.items())[:8]:
378
- deps_str = ", ".join(deps) if deps else "none"
379
- feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
380
-
381
- prompt = (
382
- "<|system|>You are a Rust programming expert analyzing crate "
383
- "features.\n"
384
- f"<|user|>For the Rust crate `{crate.name}`, explain these "
385
- "features and what functionality they provide:\n\n"
386
- f"{feature_text}\n\n"
387
- "Provide a concise explanation of each feature's purpose and "
388
- "when a developer would enable it.\n"
389
- "<|end|>"
390
- )
391
-
392
- # Use moderate temperature for informative but natural explanation
393
- result = self.run_llama(prompt, temp=0.2, max_tokens=350)
394
- return result or "Feature summary not available."
395
- except Exception as e:
396
- logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
397
- return "Feature summary not available."
398
-
399
- def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
400
- """Classify the use case of a crate with rich context"""
401
- try:
402
- # Calculate available tokens for prompt
403
- available_prompt_tokens = self.config.model_token_limit - 200
404
-
405
- joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
406
- key_deps = [
407
- dep.get("crate_id")
408
- for dep in crate.dependencies[:5]
409
- if dep.get("kind") == "normal" and dep.get("crate_id")
410
- ]
411
- key_deps_str = (
412
- ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
413
- )
414
-
415
- # Adaptively truncate different sections based on importance
416
- token_budget = available_prompt_tokens - 400
417
-
418
- # Allocate different percentages to each section
419
- desc_tokens = int(token_budget * 0.2)
420
- readme_tokens = int(token_budget * 0.6)
421
-
422
- desc = self.truncate_content(crate.description, desc_tokens)
423
- readme_summary = self.smart_truncate(readme_summary, readme_tokens)
424
-
425
- # Few-shot prompting with examples
426
- prompt = (
427
- "<|system|>You are a Rust expert classifying crates into the "
428
- "most appropriate category.\n"
429
- "<|user|>\n"
430
- "# Example 1\n"
431
- "Crate: `tokio`\n"
432
- "Description: An asynchronous runtime for the Rust programming "
433
- "language\n"
434
- "Keywords: async, runtime, futures\n"
435
- "Key Dependencies: mio, bytes, parking_lot\n"
436
- "Category: Networking\n\n"
437
- "# Example 2\n"
438
- "Crate: `serde`\n"
439
- "Description: A generic serialization/deserialization framework\n"
440
- "Keywords: serde, serialization\n"
441
- "Key Dependencies: serde_derive\n"
442
- "Category: Serialization\n\n"
443
- "# Crate to Classify\n"
444
- f"Crate: `{crate.name}`\n"
445
- f"Description: {desc}\n"
446
- f"Keywords: {joined}\n"
447
- f"README Summary: {readme_summary}\n"
448
- f"Key Dependencies: {key_deps_str}\n\n"
449
- "Category (pick only one): [AI, Database, Web Framework, "
450
- "Networking, Serialization, Utilities, DevTools, ML, "
451
- "Cryptography, Unknown]\n"
452
- "<|end|>"
453
- )
454
- # Validate classification with retry - more generous parameters
455
- result = self.validate_and_retry(
456
- prompt,
457
- validation_func=self.validate_classification,
458
- temp=0.2,
459
- max_tokens=50,
460
- )
461
-
462
- return result or "Unknown"
463
- except Exception as e:
464
- logging.error(f"Classification failed for {crate.name}: {str(e)}")
465
- return "Unknown"
466
-
467
- def generate_factual_pairs(self, crate: CrateMetadata) -> str:
468
- """Generate factual/counterfactual pairs with retry and validation"""
469
- try:
470
- desc = self.truncate_content(crate.description, 300)
471
- readme_summary = self.truncate_content(
472
- getattr(crate, "readme_summary", "") or "", 300
473
- )
474
- features = ", ".join(list(crate.features.keys())[:5])
475
-
476
- prompt = (
477
- "<|system|>Create exactly 5 factual/counterfactual pairs for "
478
- "the Rust crate. Factual statements must be true. "
479
- "Counterfactuals should be plausible but incorrect - make them "
480
- "subtle and convincing rather than simple negations.\n"
481
- "<|user|>\n"
482
- f"Crate: {crate.name}\n"
483
- f"Description: {desc}\n"
484
- f"Repo: {crate.repository}\n"
485
- f"README Summary: {readme_summary}\n"
486
- f"Key Features: {features}\n\n"
487
- "Format each pair as:\n"
488
- " Factual: [true statement about the crate]\n"
489
- " Counterfactual: [plausible but false statement]\n\n"
490
- "Create exactly 5 pairs.\n"
491
- "<|end|>"
492
- )
493
- # Use validation for retry - more generous parameters
494
- result = self.validate_and_retry(
495
- prompt,
496
- validation_func=self.validate_factual_pairs,
497
- temp=0.7,
498
- max_tokens=800,
499
- )
500
-
501
- return result or "Factual pairs generation failed."
502
- except Exception as e:
503
- logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
504
- return "Factual pairs generation failed."
505
-
506
- def score_crate(self, crate: CrateMetadata) -> float:
507
- """Calculate a score for the crate based on various metrics"""
508
- score = (crate.downloads / 1000) + (crate.github_stars * 10)
509
- score += len(self.truncate_content(crate.readme, 1000)) / 500
510
- return round(score, 2)
511
-
512
- def batch_process_prompts(
513
- self, prompts: list[tuple[str, float, int]], batch_size: int = 4
514
- ) -> list[Union[str, None]]:
515
- """
516
- L4 GPU-optimized batch processing for multiple prompts.
517
- Processes prompts in batches to maximize GPU utilization.
518
-
519
- Args:
520
- prompts: List of (prompt, temperature, max_tokens) tuples
521
- batch_size: Number of prompts to process simultaneously
522
- """
523
- results: list[Union[str, None]] = []
524
-
525
- # Process in batches optimized for L4's capabilities
526
- for i in range(0, len(prompts), batch_size):
527
- batch = prompts[i : i + batch_size]
528
- batch_results: list[Union[str, None]] = []
529
-
530
- for prompt, temp, max_tokens in batch:
531
- try:
532
- # Prepare prompt with context preservation
533
- if self.estimate_tokens(prompt) > 3500:
534
- prompt = self.smart_truncate(prompt, 3500)
535
-
536
- # Use optimized parameters for L4
537
- output = self.model(
538
- prompt,
539
- max_tokens=max_tokens,
540
- temperature=temp,
541
- top_p=0.95,
542
- repeat_penalty=1.1,
543
- stop=["<|end|>", "<|user|>", "<|system|>"],
544
- echo=False,
545
- stream=False,
546
- )
547
-
548
- # The type checker incorrectly infers a stream response
549
- choice_text: str = output["choices"][0]["text"] # type: ignore
550
- result = self.clean_output(choice_text)
551
- batch_results.append(result)
552
- except Exception as e:
553
- logging.error(f"LLM batch processing error: {e}", exc_info=True)
554
- batch_results.append(None)
555
-
556
- results.extend(batch_results)
557
-
558
- return results
559
-
560
- def smart_context_management(
561
- self, context_history: list[str], new_prompt: str
562
- ) -> str:
563
- """
564
- Intelligent context management for prefix cache optimization.
565
- Maximizes cache hits by preserving common context patterns.
566
- """
567
- # Calculate available tokens for context
568
- base_tokens = self.estimate_tokens(new_prompt)
569
- available_context = 4000 - base_tokens # Leave buffer for response
570
-
571
- if available_context <= 0:
572
- return new_prompt
573
-
574
- # Build context from most recent and most relevant history
575
- context_parts: list[str] = []
576
- tokens_used = 0
577
-
578
- # Prioritize recent context (better cache hits)
579
- for context in reversed(context_history[-5:]): # Last 5 contexts
580
- context_tokens = self.estimate_tokens(context)
581
- if tokens_used + context_tokens <= available_context:
582
- context_parts.insert(0, context)
583
- tokens_used += context_tokens
584
- else:
585
- # Try to fit truncated version
586
- remaining_tokens = available_context - tokens_used
587
- if remaining_tokens > 100: # Only if meaningful space left
588
- truncated = self.smart_truncate(context, remaining_tokens)
589
- if truncated:
590
- context_parts.insert(0, truncated)
591
- break
592
-
593
- # Combine context with new prompt
594
- if context_parts:
595
- full_context = "\n\n---\n\n".join(context_parts)
596
- return f"{full_context}\n\n---\n\n{new_prompt}"
597
-
598
- return new_prompt
1
+ # ai_processing.py
2
+ import re
3
+ import time
4
+ import logging
5
+ import os
6
+ from typing import TypedDict, Union
7
+
8
+ from collections.abc import Callable
9
+
10
+ from .config import PipelineConfig, CrateMetadata, EnrichedCrate
11
+
12
+ # Optional imports with fallbacks
13
+ _ai_dependencies_available = True
14
+ try:
15
+ import tiktoken
16
+ from llama_cpp import Llama
17
+ except ImportError as e:
18
+ logging.warning(f"AI dependencies not available: {e}")
19
+ tiktoken = None # type: ignore[assignment]
20
+ Llama = None # type: ignore[assignment,misc]
21
+ _ai_dependencies_available = False
22
+
23
+
24
+ class Section(TypedDict):
25
+ heading: str
26
+ content: str
27
+ priority: int
28
+
29
+
30
+ class LLMEnricher:
31
+ def __init__(self, config: PipelineConfig) -> None:
32
+ """Initialize LLMEnricher with automatic provider detection"""
33
+ if not _ai_dependencies_available:
34
+ raise ImportError("Cannot load model: AI dependencies not available")
35
+
36
+ self.config = config
37
+ self.tokenizer = tiktoken.get_encoding("cl100k_base") # type: ignore
38
+
39
+ # Auto-detect and configure the appropriate LLM provider
40
+ self.model = self._auto_detect_and_load_model()
41
+
42
+ def _auto_detect_and_load_model(self):
43
+ """Automatically detect and load the appropriate LLM provider"""
44
+
45
+ # Priority 1: Check if Azure OpenAI is configured and available
46
+ if (self.config.use_azure_openai and
47
+ self.config.azure_openai_endpoint and
48
+ self.config.azure_openai_api_key and
49
+ self.config.azure_openai_deployment_name):
50
+
51
+ try:
52
+ # Use the UnifiedLLMProcessor for Azure
53
+ from .unified_llm_processor import create_llm_processor_from_config
54
+ return create_llm_processor_from_config(self.config)
55
+ except Exception as e:
56
+ logging.warning(f"Azure OpenAI setup failed, falling back to local: {e}")
57
+
58
+ # Priority 2: Check if local model file exists
59
+ if os.path.exists(self.config.model_path):
60
+ try:
61
+ return self._load_local_model()
62
+ except Exception as e:
63
+ logging.warning(f"Local model loading failed: {e}")
64
+
65
+ # Priority 3: Check for other local providers (Ollama, LM Studio)
66
+ if self._check_ollama_available():
67
+ try:
68
+ from .unified_llm_processor import LLMConfig, UnifiedLLMProcessor
69
+ llm_config = LLMConfig(
70
+ provider="ollama",
71
+ model="llama2", # Default model
72
+ temperature=0.2,
73
+ max_tokens=self.config.max_tokens,
74
+ timeout=30,
75
+ max_retries=self.config.max_retries
76
+ )
77
+ return UnifiedLLMProcessor(llm_config)
78
+ except Exception as e:
79
+ logging.warning(f"Ollama setup failed: {e}")
80
+
81
+ # Priority 4: Check for LM Studio
82
+ if self._check_lmstudio_available():
83
+ try:
84
+ from .unified_llm_processor import LLMConfig, UnifiedLLMProcessor
85
+ llm_config = LLMConfig(
86
+ provider="lmstudio",
87
+ model="local-model", # Default model
88
+ temperature=0.2,
89
+ max_tokens=self.config.max_tokens,
90
+ timeout=30,
91
+ max_retries=self.config.max_retries
92
+ )
93
+ return UnifiedLLMProcessor(llm_config)
94
+ except Exception as e:
95
+ logging.warning(f"LM Studio setup failed: {e}")
96
+
97
+ # If all else fails, raise a clear error
98
+ raise RuntimeError(
99
+ "No LLM provider available. Please configure one of:\n"
100
+ "1. Azure OpenAI (set use_azure_openai=True and credentials)\n"
101
+ "2. Local model file (set model_path to existing .gguf file)\n"
102
+ "3. Ollama (install and run ollama serve)\n"
103
+ "4. LM Studio (install and run LM Studio server)"
104
+ )
105
+
106
+ def _load_local_model(self):
107
+ """Load local llama.cpp model"""
108
+ return Llama( # type: ignore
109
+ model_path=self.config.model_path,
110
+ n_ctx=4096, # Larger context for L4's 24GB VRAM
111
+ n_batch=1024, # Larger batch size for better throughput
112
+ # Load ALL layers on GPU (L4 has plenty VRAM)
113
+ n_gpu_layers=-1,
114
+ n_threads=4, # Match the 4 vCPUs
115
+ n_threads_batch=4, # Parallel batch processing
116
+ use_mmap=True, # Memory-mapped files for efficiency
117
+ use_mlock=True, # Lock model in memory
118
+ rope_scaling_type=1, # RoPE scaling for longer contexts
119
+ rope_freq_base=10000.0, # Base frequency for RoPE
120
+ flash_attn=True, # Enable flash attention if available
121
+ verbose=False, # Reduce logging overhead
122
+ )
123
+
124
+ def _check_ollama_available(self):
125
+ """Check if Ollama is available"""
126
+ try:
127
+ import requests
128
+ response = requests.get("http://localhost:11434/api/tags", timeout=5)
129
+ return response.status_code == 200
130
+ except Exception:
131
+ return False
132
+
133
+ def _check_lmstudio_available(self):
134
+ """Check if LM Studio is available"""
135
+ try:
136
+ import requests
137
+ response = requests.get("http://localhost:1234/v1/models", timeout=5)
138
+ return response.status_code == 200
139
+ except Exception:
140
+ return False
141
+
142
+ def estimate_tokens(self, text: str) -> int:
143
+ return len(self.tokenizer.encode(text))
144
+
145
+ def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
146
+ """Truncate content to fit within token limit"""
147
+ paragraphs = content.split("\n\n")
148
+ result, current_tokens = "", 0
149
+
150
+ for para in paragraphs:
151
+ tokens = len(self.tokenizer.encode(para))
152
+ if current_tokens + tokens <= max_tokens:
153
+ result += para + "\n\n"
154
+ current_tokens += tokens
155
+ else:
156
+ break
157
+ return result.strip()
158
+
159
+ def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
160
+ """Intelligently truncate content to preserve the most important parts"""
161
+ if not content:
162
+ return ""
163
+
164
+ # If content is short enough, return it all
165
+ if len(self.tokenizer.encode(content)) <= max_tokens:
166
+ return content
167
+
168
+ # Split into sections based on markdown headers
169
+ sections: list[Section] = []
170
+ current_section: Section = {
171
+ "heading": "Introduction",
172
+ "content": "",
173
+ "priority": 10,
174
+ }
175
+
176
+ for line in content.splitlines():
177
+ if re.match(r"^#+\s+", line): # It's a header
178
+ # Save previous section if not empty
179
+ if current_section["content"].strip():
180
+ sections.append(current_section)
181
+
182
+ # Create new section with appropriate priority
183
+ heading = re.sub(r"^#+\s+", "", line)
184
+ priority = 5 # Default priority
185
+
186
+ # Assign priority based on content type
187
+ if re.search(r"\b(usage|example|getting started)\b", heading, re.I):
188
+ priority = 10
189
+ elif re.search(r"\b(feature|overview|about)\b", heading, re.I):
190
+ priority = 9
191
+ elif re.search(r"\b(install|setup|config)\b", heading, re.I):
192
+ priority = 8
193
+ elif re.search(r"\b(api|interface)\b", heading, re.I):
194
+ priority = 7
195
+
196
+ current_section = {
197
+ "heading": heading,
198
+ "content": line + "\n",
199
+ "priority": priority,
200
+ }
201
+ else:
202
+ current_section["content"] += line + "\n"
203
+
204
+ # Boost priority if code block is found
205
+ if "```rust" in line or "```no_run" in line:
206
+ current_section["priority"] = max(current_section["priority"], 8)
207
+
208
+ # Add the last section
209
+ if current_section["content"].strip():
210
+ sections.append(current_section)
211
+
212
+ # Sort sections by priority (highest first)
213
+ sections.sort(key=lambda x: x["priority"], reverse=True)
214
+
215
+ # Build the result, respecting token limits
216
+ result = ""
217
+ tokens_used = 0
218
+
219
+ for section in sections:
220
+ section_text = f'## {section["heading"]}\n{section["content"]}\n'
221
+ section_tokens = len(self.tokenizer.encode(section_text))
222
+
223
+ if tokens_used + section_tokens <= max_tokens:
224
+ result += section_text
225
+ tokens_used += section_tokens
226
+ elif tokens_used < max_tokens - 100: # If we can fit a truncated version
227
+ # Take what we can
228
+ remaining_tokens = max_tokens - tokens_used
229
+ truncated_text = self.tokenizer.decode(
230
+ self.tokenizer.encode(section_text)[:remaining_tokens]
231
+ )
232
+ result += truncated_text
233
+ break
234
+
235
+ return result
236
+
237
+ def clean_output(self, output: str, task: str = "general") -> str:
238
+ """Task-specific output cleaning"""
239
+ if not output:
240
+ return ""
241
+
242
+ # Remove any remaining prompt artifacts
243
+ output = output.split("<|end|>")[0].strip()
244
+
245
+ if task == "classification":
246
+ # For classification tasks, extract just the category
247
+ categories = [
248
+ "AI",
249
+ "Database",
250
+ "Web Framework",
251
+ "Networking",
252
+ "Serialization",
253
+ "Utilities",
254
+ "DevTools",
255
+ "ML",
256
+ "Cryptography",
257
+ "Unknown",
258
+ ]
259
+ for category in categories:
260
+ if re.search(
261
+ r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
262
+ ):
263
+ return category
264
+ return "Unknown"
265
+
266
+ elif task == "factual_pairs":
267
+ # For factual pairs, ensure proper formatting
268
+ pairs: list[str] = []
269
+ facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
270
+ counterfacts = re.findall(
271
+ r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
272
+ )
273
+
274
+ # Pair them up
275
+ for i in range(min(len(facts), len(counterfacts))):
276
+ pairs.append(
277
+ f"✅ Factual: {facts[i].strip()}\n"
278
+ f"❌ Counterfactual: {counterfacts[i].strip()}"
279
+ )
280
+
281
+ return "\n\n".join(pairs)
282
+
283
+ else:
284
+ # General cleaning - more permissive than before
285
+ lines = [line.strip() for line in output.splitlines() if line.strip()]
286
+ return "\n".join(lines)
287
+
288
+ def run_llama(
289
+ self, prompt: str, temp: float = 0.2, max_tokens: int = 256
290
+ ) -> Union[str, None]:
291
+ """Run the LLM with customizable parameters per task"""
292
+ try:
293
+ token_count = self.estimate_tokens(prompt)
294
+ if token_count > self.config.prompt_token_margin:
295
+ logging.warning(f"Prompt too long ({token_count} tokens). Truncating.")
296
+ prompt = self.truncate_content(
297
+ prompt, self.config.prompt_token_margin - 100
298
+ )
299
+
300
+ # Handle different model types
301
+ from .unified_llm_processor import UnifiedLLMProcessor
302
+ if isinstance(self.model, UnifiedLLMProcessor):
303
+ # UnifiedLLMProcessor
304
+ return self.model.call_llm(prompt, temp, max_tokens)
305
+ else:
306
+ # Local Llama model
307
+ output = self.model(
308
+ prompt,
309
+ max_tokens=max_tokens,
310
+ temperature=temp,
311
+ # Stop at these tokens
312
+ stop=["<|end|>", "<|user|>", "<|system|>"],
313
+ )
314
+
315
+ raw_text: str = output["choices"][0]["text"] # type: ignore
316
+ return self.clean_output(raw_text)
317
+ except Exception as e:
318
+ logging.error(f"Model inference failed: {str(e)}")
319
+ raise
320
+
321
+ def validate_and_retry(
322
+ self,
323
+ prompt: str,
324
+ validation_func: Callable[[str], bool],
325
+ temp: float = 0.2,
326
+ max_tokens: int = 256,
327
+ retries: int = 4, # Increased from 2 to 4 for better success rates
328
+ ) -> Union[str, None]:
329
+ """Run LLM with validation and automatic retry on failure"""
330
+ result = None
331
+ for attempt in range(retries):
332
+ try:
333
+ # More generous temperature adjustment for better variety
334
+ # 20% increases instead of 10%
335
+ adjusted_temp = temp * (1 + (attempt * 0.2))
336
+ result = self.run_llama(
337
+ prompt, temp=adjusted_temp, max_tokens=max_tokens
338
+ )
339
+
340
+ # Validate the result
341
+ if result and validation_func(result):
342
+ return result
343
+
344
+ # If we get here, validation failed - use debug level for early
345
+ # attempts
346
+ if attempt == retries - 1:
347
+ logging.debug(
348
+ f"All {retries} validation attempts failed, "
349
+ "using last available result."
350
+ )
351
+ else:
352
+ logging.debug(
353
+ f"Validation failed on attempt {attempt + 1}/{retries}. "
354
+ f"Retrying with adjusted temp={adjusted_temp:.2f}"
355
+ )
356
+
357
+ # Only simplify prompt on later attempts (attempt 2+)
358
+ if attempt >= 2:
359
+ prompt = self.simplify_prompt(prompt)
360
+
361
+ except Exception as e:
362
+ logging.error(f"Generation error on attempt {attempt + 1}: {str(e)}")
363
+
364
+ # More generous backoff - give the model more time
365
+ time.sleep(2.0 + (attempt * 1.0)) # 2s, 3s, 4s, 5s delays
366
+
367
+ # If we exhausted all retries, return the last result even if not
368
+ # perfect
369
+ return result if "result" in locals() else None
370
+
371
+ def simplify_prompt(self, prompt: str) -> str:
372
+ """Simplify a prompt by removing examples and reducing context"""
373
+ # Remove few-shot examples
374
+ prompt = re.sub(
375
+ r"# Example [0-9].*?(?=# Crate to Classify|\Z)",
376
+ "",
377
+ prompt,
378
+ flags=re.DOTALL,
379
+ )
380
+
381
+ # Make instructions more direct
382
+ prompt = re.sub(
383
+ r"<\|system\|>.*?<\|user\|>",
384
+ "<|system|>Be concise.\n<|user|>",
385
+ prompt,
386
+ flags=re.DOTALL,
387
+ )
388
+
389
+ return prompt
390
+
391
+ def validate_classification(self, result: str) -> bool:
392
+ """Ensure a valid category was returned"""
393
+ if not result:
394
+ return False
395
+ valid_categories = [
396
+ "AI",
397
+ "Database",
398
+ "Web Framework",
399
+ "Networking",
400
+ "Serialization",
401
+ "Utilities",
402
+ "DevTools",
403
+ "ML",
404
+ "Cryptography",
405
+ "Unknown",
406
+ ]
407
+ return any(
408
+ category.lower() == result.strip().lower() for category in valid_categories
409
+ )
410
+
411
+ def validate_factual_pairs(self, result: str) -> bool:
412
+ """Ensure at least 3 factual/counterfactual pairs exist"""
413
+ if not result:
414
+ return False
415
+
416
+ facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", result, re.DOTALL)
417
+ counterfacts = re.findall(
418
+ r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", result, re.DOTALL
419
+ )
420
+
421
+ return len(facts) >= 3 and len(counterfacts) >= 3 # At least 3 pairs
422
+
423
+ def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
424
+ """Apply all AI enrichments to a crate"""
425
+ # Convert CrateMetadata to EnrichedCrate
426
+ enriched_dict = crate.__dict__.copy()
427
+ enriched = EnrichedCrate(**enriched_dict)
428
+
429
+ try:
430
+ # Generate README summary first
431
+ if crate.readme:
432
+ readme_content = self.smart_truncate(crate.readme, 2000)
433
+ prompt = (
434
+ "<|system|>Extract key features from README.\n"
435
+ "<|user|>Summarize key aspects of this Rust crate from its "
436
+ f"README:\n{readme_content}\n"
437
+ "<|end|>"
438
+ )
439
+ enriched.readme_summary = self.validate_and_retry(
440
+ prompt, lambda x: len(x) > 50, temp=0.3, max_tokens=300
441
+ )
442
+
443
+ # Generate other enrichments
444
+ enriched.feature_summary = self.summarize_features(crate)
445
+ enriched.use_case = self.classify_use_case(
446
+ crate, enriched.readme_summary or ""
447
+ )
448
+ enriched.score = self.score_crate(crate)
449
+ enriched.factual_counterfactual = self.generate_factual_pairs(crate)
450
+
451
+ return enriched
452
+ except Exception as e:
453
+ logging.error(f"Failed to enrich {crate.name}: {str(e)}")
454
+ return enriched
455
+
456
+ def summarize_features(self, crate: CrateMetadata) -> str:
457
+ """Generate summaries for crate features with better prompting"""
458
+ try:
459
+ if not crate.features:
460
+ return "No features documented for this crate."
461
+
462
+ # Handle both dict and list feature formats
463
+ feature_text = ""
464
+ if isinstance(crate.features, dict):
465
+ # Format features with their dependencies
466
+ for feature_name, deps in list(crate.features.items())[:8]:
467
+ deps_str = ", ".join(deps) if deps else "none"
468
+ feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
469
+ elif isinstance(crate.features, list):
470
+ # Handle list format - assume each item is a feature name
471
+ for feature in crate.features[:8]:
472
+ if isinstance(feature, str):
473
+ feature_text += f"- {feature} (dependencies: none)\n"
474
+ elif isinstance(feature, dict):
475
+ # If feature is a dict, try to extract name and deps
476
+ feature_name = feature.get('name', str(feature))
477
+ deps = feature.get('dependencies', [])
478
+ deps_str = ", ".join(deps) if deps else "none"
479
+ feature_text += f"- {feature_name} (dependencies: {deps_str})\n"
480
+ else:
481
+ feature_text += f"- {str(feature)} (dependencies: none)\n"
482
+ else:
483
+ return "Features format not recognized."
484
+
485
+ prompt = (
486
+ "<|system|>You are a Rust programming expert analyzing crate "
487
+ "features.\n"
488
+ f"<|user|>For the Rust crate `{crate.name}`, explain these "
489
+ "features and what functionality they provide:\n\n"
490
+ f"{feature_text}\n\n"
491
+ "Provide a concise explanation of each feature's purpose and "
492
+ "when a developer would enable it.\n"
493
+ "<|end|>"
494
+ )
495
+
496
+ # Use moderate temperature for informative but natural explanation
497
+ result = self.run_llama(prompt, temp=0.2, max_tokens=350)
498
+ return result or "Feature summary not available."
499
+ except Exception as e:
500
+ logging.warning(f"Feature summarization failed for {crate.name}: {str(e)}")
501
+ return "Feature summary not available."
502
+
503
+ def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
504
+ """Classify the use case of a crate with rich context"""
505
+ try:
506
+ # Calculate available tokens for prompt
507
+ available_prompt_tokens = self.config.model_token_limit - 200
508
+
509
+ joined = ", ".join(crate.keywords[:10]) if crate.keywords else "None"
510
+ key_deps = [
511
+ dep.get("crate_id")
512
+ for dep in crate.dependencies[:5]
513
+ if dep.get("kind") == "normal" and dep.get("crate_id")
514
+ ]
515
+ key_deps_str = (
516
+ ", ".join(str(dep) for dep in key_deps) if key_deps else "None"
517
+ )
518
+
519
+ # Adaptively truncate different sections based on importance
520
+ token_budget = available_prompt_tokens - 400
521
+
522
+ # Allocate different percentages to each section
523
+ desc_tokens = int(token_budget * 0.2)
524
+ readme_tokens = int(token_budget * 0.6)
525
+
526
+ desc = self.truncate_content(crate.description, desc_tokens)
527
+ readme_summary = self.smart_truncate(readme_summary, readme_tokens)
528
+
529
+ # Few-shot prompting with examples
530
+ prompt = (
531
+ "<|system|>You are a Rust expert classifying crates into the "
532
+ "most appropriate category.\n"
533
+ "<|user|>\n"
534
+ "# Example 1\n"
535
+ "Crate: `tokio`\n"
536
+ "Description: An asynchronous runtime for the Rust programming "
537
+ "language\n"
538
+ "Keywords: async, runtime, futures\n"
539
+ "Key Dependencies: mio, bytes, parking_lot\n"
540
+ "Category: Networking\n\n"
541
+ "# Example 2\n"
542
+ "Crate: `serde`\n"
543
+ "Description: A generic serialization/deserialization framework\n"
544
+ "Keywords: serde, serialization\n"
545
+ "Key Dependencies: serde_derive\n"
546
+ "Category: Serialization\n\n"
547
+ "# Crate to Classify\n"
548
+ f"Crate: `{crate.name}`\n"
549
+ f"Description: {desc}\n"
550
+ f"Keywords: {joined}\n"
551
+ f"README Summary: {readme_summary}\n"
552
+ f"Key Dependencies: {key_deps_str}\n\n"
553
+ "Category (pick only one): [AI, Database, Web Framework, "
554
+ "Networking, Serialization, Utilities, DevTools, ML, "
555
+ "Cryptography, Unknown]\n"
556
+ "<|end|>"
557
+ )
558
+ # Validate classification with retry - more generous parameters
559
+ result = self.validate_and_retry(
560
+ prompt,
561
+ validation_func=self.validate_classification,
562
+ temp=0.2,
563
+ max_tokens=50,
564
+ )
565
+
566
+ return result or "Unknown"
567
+ except Exception as e:
568
+ logging.error(f"Classification failed for {crate.name}: {str(e)}")
569
+ return "Unknown"
570
+
571
+ def generate_factual_pairs(self, crate: CrateMetadata) -> str:
572
+ """Generate factual/counterfactual pairs with retry and validation"""
573
+ try:
574
+ desc = self.truncate_content(crate.description, 300)
575
+ readme_summary = self.truncate_content(
576
+ getattr(crate, "readme_summary", "") or "", 300
577
+ )
578
+
579
+ # Handle both dict and list feature formats
580
+ if isinstance(crate.features, dict):
581
+ features = ", ".join(list(crate.features.keys())[:5])
582
+ elif isinstance(crate.features, list):
583
+ feature_names = []
584
+ for feature in crate.features[:5]:
585
+ if isinstance(feature, str):
586
+ feature_names.append(feature)
587
+ elif isinstance(feature, dict):
588
+ feature_name = feature.get('name', str(feature))
589
+ feature_names.append(feature_name)
590
+ else:
591
+ feature_names.append(str(feature))
592
+ features = ", ".join(feature_names)
593
+ else:
594
+ features = ""
595
+
596
+ prompt = (
597
+ "<|system|>Create exactly 5 factual/counterfactual pairs for "
598
+ "the Rust crate. Factual statements must be true. "
599
+ "Counterfactuals should be plausible but incorrect - make them "
600
+ "subtle and convincing rather than simple negations.\n"
601
+ "<|user|>\n"
602
+ f"Crate: {crate.name}\n"
603
+ f"Description: {desc}\n"
604
+ f"Repo: {crate.repository}\n"
605
+ f"README Summary: {readme_summary}\n"
606
+ f"Key Features: {features}\n\n"
607
+ "Format each pair as:\n"
608
+ "✅ Factual: [true statement about the crate]\n"
609
+ "❌ Counterfactual: [plausible but false statement]\n\n"
610
+ "Create exactly 5 pairs.\n"
611
+ "<|end|>"
612
+ )
613
+ # Use validation for retry - more generous parameters
614
+ result = self.validate_and_retry(
615
+ prompt,
616
+ validation_func=self.validate_factual_pairs,
617
+ temp=0.7,
618
+ max_tokens=800,
619
+ )
620
+
621
+ return result or "Factual pairs generation failed."
622
+ except Exception as e:
623
+ logging.error(f"Exception in factual_pairs for {crate.name}: {str(e)}")
624
+ return "Factual pairs generation failed."
625
+
626
+ def score_crate(self, crate: CrateMetadata) -> float:
627
+ """Calculate a score for the crate based on various metrics"""
628
+ score = (crate.downloads / 1000) + (crate.github_stars * 10)
629
+ score += len(self.truncate_content(crate.readme, 1000)) / 500
630
+ return round(score, 2)
631
+
632
+ def batch_process_prompts(
633
+ self, prompts: list[tuple[str, float, int]], batch_size: int = 4
634
+ ) -> list[Union[str, None]]:
635
+ """
636
+ L4 GPU-optimized batch processing for multiple prompts.
637
+ Processes prompts in batches to maximize GPU utilization.
638
+
639
+ Args:
640
+ prompts: List of (prompt, temperature, max_tokens) tuples
641
+ batch_size: Number of prompts to process simultaneously
642
+ """
643
+ results: list[Union[str, None]] = []
644
+
645
+ # Process in batches optimized for L4's capabilities
646
+ for i in range(0, len(prompts), batch_size):
647
+ batch = prompts[i : i + batch_size]
648
+ batch_results: list[Union[str, None]] = []
649
+
650
+ for prompt, temp, max_tokens in batch:
651
+ try:
652
+ # Prepare prompt with context preservation
653
+ if self.estimate_tokens(prompt) > 3500:
654
+ prompt = self.smart_truncate(prompt, 3500)
655
+
656
+ # Use optimized parameters for L4
657
+ output = self.model(
658
+ prompt,
659
+ max_tokens=max_tokens,
660
+ temperature=temp,
661
+ top_p=0.95,
662
+ repeat_penalty=1.1,
663
+ stop=["<|end|>", "<|user|>", "<|system|>"],
664
+ echo=False,
665
+ stream=False,
666
+ )
667
+
668
+ # The type checker incorrectly infers a stream response
669
+ choice_text: str = output["choices"][0]["text"] # type: ignore
670
+ result = self.clean_output(choice_text)
671
+ batch_results.append(result)
672
+ except Exception as e:
673
+ logging.error(f"LLM batch processing error: {e}", exc_info=True)
674
+ batch_results.append(None)
675
+
676
+ results.extend(batch_results)
677
+
678
+ return results
679
+
680
+ def smart_context_management(
681
+ self, context_history: list[str], new_prompt: str
682
+ ) -> str:
683
+ """
684
+ Intelligent context management for prefix cache optimization.
685
+ Maximizes cache hits by preserving common context patterns.
686
+ """
687
+ # Calculate available tokens for context
688
+ base_tokens = self.estimate_tokens(new_prompt)
689
+ available_context = 4000 - base_tokens # Leave buffer for response
690
+
691
+ if available_context <= 0:
692
+ return new_prompt
693
+
694
+ # Build context from most recent and most relevant history
695
+ context_parts: list[str] = []
696
+ tokens_used = 0
697
+
698
+ # Prioritize recent context (better cache hits)
699
+ for context in reversed(context_history[-5:]): # Last 5 contexts
700
+ context_tokens = self.estimate_tokens(context)
701
+ if tokens_used + context_tokens <= available_context:
702
+ context_parts.insert(0, context)
703
+ tokens_used += context_tokens
704
+ else:
705
+ # Try to fit truncated version
706
+ remaining_tokens = available_context - tokens_used
707
+ if remaining_tokens > 100: # Only if meaningful space left
708
+ truncated = self.smart_truncate(context, remaining_tokens)
709
+ if truncated:
710
+ context_parts.insert(0, truncated)
711
+ break
712
+
713
+ # Combine context with new prompt
714
+ if context_parts:
715
+ full_context = "\n\n---\n\n".join(context_parts)
716
+ return f"{full_context}\n\n---\n\n{new_prompt}"
717
+
718
+ return new_prompt
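
The 1.4.1 additions above replace the single llama.cpp loader with a provider ladder: Azure OpenAI first, then a local .gguf file, then an Ollama server, then LM Studio, raising RuntimeError when none is reachable. Below is a minimal standalone sketch of that selection order. The probe() and pick_provider() helpers and their return labels are illustrative only and are not part of the package API; the two localhost endpoints and the priority order are taken directly from _auto_detect_and_load_model.

import os

import requests


def probe(url: str, timeout: float = 5.0) -> bool:
    """Return True when a local HTTP provider answers 200, mirroring the new availability checks."""
    try:
        return requests.get(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False


def pick_provider(model_path: str, azure_configured: bool = False) -> str:
    """Reproduce the 1.4.1 fallback order: Azure OpenAI -> local .gguf -> Ollama -> LM Studio."""
    if azure_configured:
        return "azure-openai"  # UnifiedLLMProcessor path in the real code
    if os.path.exists(model_path):
        return "llama-cpp"  # _load_local_model path
    if probe("http://localhost:11434/api/tags"):
        return "ollama"
    if probe("http://localhost:1234/v1/models"):
        return "lmstudio"
    raise RuntimeError("No LLM provider available")


if __name__ == "__main__":
    # Hypothetical model path; with no file and no local servers this raises RuntimeError,
    # matching the error raised at the end of _auto_detect_and_load_model.
    print(pick_provider("./models/llama.gguf", azure_configured=False))

As in the enricher itself, a caller that wants graceful degradation should catch RuntimeError (and ImportError for missing tiktoken/llama_cpp) around construction rather than around every enrichment call.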