rust-crate-pipeline 1.3.6__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,453 +1,462 @@
1
- # azure_ai_processing.py
2
- import re
3
- import time
4
- import logging
5
- import json
6
- from typing import TypedDict, Union, Optional
7
- from collections.abc import Callable
8
-
9
- import requests # type: ignore # May lack stubs in some environments
10
- from .config import PipelineConfig, CrateMetadata, EnrichedCrate # Ensure these are defined and correct
11
-
12
-
13
class Section(TypedDict, total=True):
    """One README section collected while splitting markdown on header lines."""

    # Header text with the leading '#' markers stripped.
    heading: str
    # Text accumulated for the section, one line per "\n"-terminated entry.
    content: str
    # Ranking weight; sections are emitted highest priority first.
    priority: int
17
-
18
-
19
class AzureOpenAIEnricher:
    """Enrich crate metadata with text generated by an Azure OpenAI chat deployment.

    The class owns a ``requests`` session aimed at a single chat-completions
    deployment and provides helpers to trim README text, build prompts,
    validate and clean model output, and retry when validation fails.
    """

    # Canonical use-case labels recognised in model output.
    _CATEGORIES = (
        "AI",
        "Database",
        "Web Framework",
        "Networking",
        "Serialization",
        "Utilities",
        "DevTools",
        "ML",
        "Cryptography",
        "Unknown",
    )

    # System prompt used when the caller does not supply one.
    _DEFAULT_SYSTEM_MESSAGE = (
        "You are a helpful AI assistant that analyzes Rust crates and provides insights."
    )

    # A markdown header line: one or more '#' followed by whitespace.
    _HEADER_RE = re.compile(r"^#+\s+")

    # (pattern, priority) pairs tried in order against a section heading.
    _HEADING_PRIORITIES = (
        (re.compile(r"\b(usage|example|getting started)\b", re.I), 10),
        (re.compile(r"\b(feature|overview|about)\b", re.I), 9),
        (re.compile(r"\b(install|setup|config)\b", re.I), 8),
        (re.compile(r"\b(api|interface)\b", re.I), 7),
    )

    def __init__(self, config: "PipelineConfig") -> None:
        """Store ``config`` and prepare the HTTP session and request URL.

        Args:
            config: Pipeline settings. The ``azure_openai_api_key``,
                ``azure_openai_endpoint``, ``azure_openai_deployment_name``
                and ``azure_openai_api_version`` attributes are read.
        """
        self.config = config
        self.session = requests.Session()  # type: ignore[attr-defined]
        self.session.headers.update(
            {
                "Content-Type": "application/json",
                "api-key": config.azure_openai_api_key,
            }
        )
        self.api_url = self._build_api_url(config)

    @staticmethod
    def _build_api_url(config: "PipelineConfig") -> str:
        """Return the chat-completions URL for the configured deployment.

        Trailing slashes on the endpoint are dropped before joining, so the
        URL is well-formed whether or not the configured endpoint ends in "/".
        """
        base = str(config.azure_openai_endpoint).rstrip("/")
        return (
            f"{base}/openai/deployments/{config.azure_openai_deployment_name}"
            f"/chat/completions?api-version={config.azure_openai_api_version}"
        )

    @classmethod
    def _match_category(cls, text: str) -> Optional[str]:
        """Return the known category mentioned earliest in ``text``, if any.

        Matching is case-insensitive and anchored on word boundaries so that
        short labels such as "AI" or "ML" do not match inside other words.
        """
        best_pos: Optional[int] = None
        best: Optional[str] = None
        for category in cls._CATEGORIES:
            found = re.search(
                r"\b" + re.escape(category) + r"\b", text, re.IGNORECASE
            )
            if found and (best_pos is None or found.start() < best_pos):
                best_pos, best = found.start(), category
        return best

    @classmethod
    def _heading_priority(cls, heading: str) -> int:
        """Return the ranking weight for a section heading (5 when no rule matches)."""
        for pattern, priority in cls._HEADING_PRIORITIES:
            if pattern.search(heading):
                return priority
        return 5

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token count, assuming four characters per token."""
        return len(text) // 4

    def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
        """Keep leading paragraphs of ``content`` while they fit in ``max_tokens``.

        Paragraphs are separated by blank lines. Collection stops at the first
        paragraph that would exceed the budget, so an oversized first paragraph
        yields an empty string.
        """
        kept: list[str] = []
        used = 0
        for paragraph in content.split("\n\n"):
            cost = self.estimate_tokens(paragraph)
            if used + cost > max_tokens:
                break
            kept.append(paragraph)
            used += cost
        return "\n\n".join(kept).strip()

    def _split_sections(self, content: str) -> "list[Section]":
        """Split markdown into sections, starting a new one at every header line.

        Text before the first header is collected under the heading
        "Introduction" with priority 10. A section's content keeps its own
        header line as the first line.
        """
        sections = []
        current = {"heading": "Introduction", "content": "", "priority": 10}
        for line in content.splitlines():
            if self._HEADER_RE.match(line):
                if current["content"].strip():
                    sections.append(current)
                heading = self._HEADER_RE.sub("", line)
                current = {
                    "heading": heading,
                    "content": line + "\n",
                    "priority": self._heading_priority(heading),
                }
            else:
                current["content"] += line + "\n"
                # A Rust code fence raises the section to at least priority 8.
                if "```rust" in line or "```no_run" in line:
                    current["priority"] = max(current["priority"], 8)
        if current["content"].strip():
            sections.append(current)
        return sections

    def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
        """Shrink markdown to roughly ``max_tokens``, keeping high-priority sections.

        Content that already fits is returned unchanged. Otherwise the text is
        split on header lines, sections are ranked by heading keywords, and
        sections are emitted highest priority first until the budget is used.

        Note: each emitted section is prefixed with ``## <heading>`` and the
        section content still starts with its original header line.
        """
        if not content:
            return ""
        if self.estimate_tokens(content) <= max_tokens:
            return content

        sections = self._split_sections(content)
        # list.sort is stable, so equal-priority sections keep document order.
        sections.sort(key=lambda section: section["priority"], reverse=True)

        parts: list[str] = []
        tokens_used = 0
        for section in sections:
            section_text = f'## {section["heading"]}\n{section["content"]}\n'
            section_tokens = self.estimate_tokens(section_text)
            if tokens_used + section_tokens <= max_tokens:
                parts.append(section_text)
                tokens_used += section_tokens
            elif tokens_used < max_tokens - 100:
                # More than 100 tokens remain: add a character-truncated
                # piece of this section and stop. The section is known to be
                # longer than the remaining budget at this point.
                max_chars = (max_tokens - tokens_used) * 4
                parts.append(section_text[:max_chars] + "...")
                break
        return "".join(parts)

    def clean_output(self, output: str, task: str = "general") -> str:
        """Normalise raw model output for the given ``task``.

        Args:
            output: Raw model text; anything after ``<|end|>`` is discarded.
            task: "classification" returns a canonical category name (or
                "Unknown"); "factual_pairs" returns re-formatted ✅/❌ pairs;
                any other value returns the stripped text.
        """
        if not output:
            return ""

        output = output.split("<|end|>")[0].strip()

        if task == "classification":
            return self._match_category(output) or "Unknown"

        if task == "factual_pairs":
            facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
            counterfacts = re.findall(
                r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
            )
            # zip stops at the shorter list, so unpaired statements are dropped.
            return "\n\n".join(
                f"✅ Factual: {fact.strip()}\n"
                f"❌ Counterfactual: {counter.strip()}"
                for fact, counter in zip(facts, counterfacts)
            )

        return output

    def call_azure_openai(
        self,
        prompt: str,
        temperature: float = 0.2,
        max_tokens: int = 256,
        system_message: str = _DEFAULT_SYSTEM_MESSAGE,
    ) -> Optional[str]:
        """Send one chat-completion request and return the reply text.

        Returns:
            The first choice's message content, or ``None`` when the response
            status is not 200 or any exception occurs (both cases are logged).
        """
        payload = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": 1.0,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
        }
        try:
            response = self.session.post(self.api_url, json=payload, timeout=60)
            if response.status_code != 200:
                logging.error(
                    "Azure OpenAI API error: %s - %s",
                    response.status_code,
                    response.text,
                )
                return None
            return response.json()["choices"][0]["message"]["content"]
        except Exception as exc:
            # Best-effort boundary: callers treat None as "no answer".
            logging.error("Error calling Azure OpenAI: %s", exc)
            return None

    def validate_and_retry(
        self,
        prompt: str,
        validation_func: Callable[[str], bool],
        temperature: float = 0.2,
        max_tokens: int = 256,
        retries: int = 4,
        system_message: str = _DEFAULT_SYSTEM_MESSAGE,
    ) -> Optional[str]:
        """Call the model until ``validation_func`` accepts the reply.

        After a rejected or empty reply the temperature is raised by 0.1
        (capped at 0.8) and the next attempt follows a one-second pause. If an
        attempt raises, the pause is ``2 ** attempt`` seconds instead.

        Returns:
            The first accepted reply, or ``None`` after ``retries`` attempts.
        """
        for attempt in range(retries):
            has_more = attempt < retries - 1
            try:
                result = self.call_azure_openai(
                    prompt, temperature, max_tokens, system_message
                )
                accepted = bool(result) and validation_func(result)
            except Exception as exc:
                logging.warning("Attempt %s failed: %s", attempt + 1, exc)
                if has_more:
                    time.sleep(2 ** attempt)  # Exponential backoff
                continue
            if accepted:
                return result
            if has_more:
                temperature = min(0.8, temperature + 0.1)
                time.sleep(1)
        return None

    def simplify_prompt(self, prompt: str) -> str:
        """Collapse redundant whitespace and cap the prompt at 8000 characters."""
        prompt = re.sub(r'\n\s*\n', '\n\n', prompt)
        prompt = re.sub(r' +', ' ', prompt)
        if len(prompt) > 8000:
            prompt = prompt[:8000] + "..."
        return prompt.strip()

    def validate_classification(self, result: str) -> bool:
        """Return True when ``result`` mentions a known category as a whole word."""
        return self._match_category(result) is not None

    def validate_factual_pairs(self, result: str) -> bool:
        """Return True when ``result`` contains both the ✅ and ❌ markers."""
        return "✅" in result and "❌" in result

    def enrich_crate(self, crate: "CrateMetadata") -> "EnrichedCrate":
        """Build an ``EnrichedCrate`` from ``crate`` plus model-generated fields.

        The README summary and use case are only produced when the crate has a
        README; factual pairs and the score are always requested.
        """
        enriched = EnrichedCrate(**crate.__dict__)

        if crate.readme:
            readme_content = self.smart_truncate(crate.readme, 2000)
            prompt = f"""Summarize this Rust crate's README in 2-3 sentences:

{readme_content}

Summary:"""
            enriched.readme_summary = self.call_azure_openai(
                prompt, temperature=0.3, max_tokens=150
            )
            enriched.use_case = self.classify_use_case(
                crate, enriched.readme_summary or ""
            )

        enriched.factual_counterfactual = self.generate_factual_pairs(crate)
        enriched.score = self.score_crate(crate)
        return enriched

    @staticmethod
    def _format_features(features: object) -> Optional[str]:
        """Render features as a bullet list, or return None for unknown shapes.

        A mapping is rendered as ``- name: dep1, dep2``; a list or tuple is
        rendered as one bullet per entry.
        """
        if isinstance(features, dict):
            rows = []
            for name, deps in features.items():
                if deps is None:
                    deps = ()
                elif isinstance(deps, str):
                    deps = (deps,)
                rows.append(f"- {name}: {', '.join(map(str, deps))}")
            return "\n".join(rows)
        if isinstance(features, (list, tuple)):
            return "\n".join(f"- {feature}" for feature in features)
        return None

    def summarize_features(self, crate: "CrateMetadata") -> str:
        """Ask the model for a short summary of the crate's feature flags.

        Returns a fixed fallback string when there are no features, when the
        features value is neither a mapping nor a list/tuple, or when the
        model call yields nothing.
        """
        if not crate.features:
            return "No specific features documented."

        features_text = self._format_features(crate.features)
        if features_text is None:
            return "Features format not recognized."

        prompt = f"""Summarize the key features of this Rust crate in 2-3 sentences:

{features_text}

Summary:"""

        result = self.call_azure_openai(prompt, temperature=0.3, max_tokens=150)
        return result or "Features analysis unavailable."

    def classify_use_case(self, crate: "CrateMetadata", readme_summary: str) -> str:
        """Return the canonical category the model assigns to ``crate``.

        The reply is reduced to one of the known category names; "Unknown" is
        returned when no validated reply is obtained.
        """
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Summary: {readme_summary}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Classify this Rust crate into one of these categories:
- AI: Machine learning, AI, neural networks
- Database: Database drivers, ORMs, data storage
- Web Framework: Web servers, HTTP, REST APIs
- Networking: Network protocols, communication
- Serialization: Data formats, JSON, binary
- Utilities: General utilities, helpers
- DevTools: Development tools, debugging
- ML: Machine learning, statistics
- Cryptography: Security, encryption, hashing
- Unknown: Doesn't fit other categories

{context}

Category:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_classification,
            temperature=0.1,
            max_tokens=50,
        )
        if not result:
            return "Unknown"
        # Return the bare category name rather than the model's full sentence.
        return self._match_category(result) or "Unknown"

    def generate_factual_pairs(self, crate: "CrateMetadata") -> str:
        """Ask the model for factual/counterfactual statement pairs about ``crate``."""
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Generate 2-3 factual statements about this Rust crate, followed by their counterfactual opposites.

Format each pair as:
✅ Factual: [true statement about the crate]
❌ Counterfactual: [opposite/incorrect statement]

{context}

Factual/Counterfactual pairs:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_factual_pairs,
            temperature=0.4,
            max_tokens=300,
        )
        return result or "Factual analysis unavailable."

    def score_crate(self, crate: "CrateMetadata") -> float:
        """Return a model-assigned quality score clamped to the range 1.0-10.0.

        The first number found in the reply is used; 5.0 is returned when the
        call yields nothing or the reply contains no number.
        """
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Downloads: {crate.downloads}
GitHub Stars: {crate.github_stars}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Rate this Rust crate on a scale of 1-10 based on:
- Popularity (downloads, stars)
- Documentation quality
- Usefulness and relevance
- Community adoption

{context}

Score (1-10):"""

        result = self.call_azure_openai(prompt, temperature=0.1, max_tokens=10)
        found = re.search(r'(\d+(?:\.\d+)?)', result) if result else None
        if found is None:
            return 5.0  # Default score
        return min(10.0, max(1.0, float(found.group(1))))

    def batch_process_prompts(
        self,
        prompts: "list[tuple[str, float, int]]",
        batch_size: int = 4,
    ) -> "list[Optional[str]]":
        """Run ``(prompt, temperature, max_tokens)`` tuples one after another.

        Prompts are walked in groups of ``batch_size``, but every request is
        still issued sequentially with a 0.1 s pause after each one.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results: "list[Optional[str]]" = []
        for start in range(0, len(prompts), batch_size):
            for prompt, temp, max_tokens in prompts[start:start + batch_size]:
                results.append(self.call_azure_openai(prompt, temp, max_tokens))
                time.sleep(0.1)  # Rate limiting
        return results

    def smart_context_management(
        self, context_history: "list[str]", new_prompt: str
    ) -> str:
        """Join history and ``new_prompt``, dropping old history when too long.

        When the combined text is estimated above 6000 tokens, only the two
        most recent history entries are kept in front of the new prompt.
        """
        max_context_tokens = 6000
        total_context = "\n".join(context_history) + "\n" + new_prompt
        if self.estimate_tokens(total_context) <= max_context_tokens:
            return total_context
        return "\n".join(context_history[-2:]) + "\n" + new_prompt
1
+ # azure_ai_processing.py
2
+ import re
3
+ import time
4
+ import logging
5
+ import json
6
+ from typing import TypedDict, Union, Optional
7
+ from collections.abc import Callable
8
+
9
+ import requests # type: ignore # May lack stubs in some environments
10
+ from .config import PipelineConfig, CrateMetadata, EnrichedCrate # Ensure these are defined and correct
11
+
12
+
13
class Section(TypedDict, total=True):
    """One README section collected while splitting markdown on header lines."""

    # Header text with the leading '#' markers stripped.
    heading: str
    # Text accumulated for the section, one line per "\n"-terminated entry.
    content: str
    # Ranking weight; sections are emitted highest priority first.
    priority: int
17
+
18
+
19
class AzureOpenAIEnricher:
    """Enrich crate metadata with text generated by an Azure OpenAI chat deployment.

    The class owns a ``requests`` session aimed at a single chat-completions
    deployment and provides helpers to trim README text, build prompts,
    validate and clean model output, and retry when validation fails.
    """

    # Canonical use-case labels recognised in model output.
    _CATEGORIES = (
        "AI",
        "Database",
        "Web Framework",
        "Networking",
        "Serialization",
        "Utilities",
        "DevTools",
        "ML",
        "Cryptography",
        "Unknown",
    )

    # System prompt used when the caller does not supply one.
    _DEFAULT_SYSTEM_MESSAGE = (
        "You are a helpful AI assistant that analyzes Rust crates and provides insights."
    )

    # A markdown header line: one or more '#' followed by whitespace.
    _HEADER_RE = re.compile(r"^#+\s+")

    # (pattern, priority) pairs tried in order against a section heading.
    _HEADING_PRIORITIES = (
        (re.compile(r"\b(usage|example|getting started)\b", re.I), 10),
        (re.compile(r"\b(feature|overview|about)\b", re.I), 9),
        (re.compile(r"\b(install|setup|config)\b", re.I), 8),
        (re.compile(r"\b(api|interface)\b", re.I), 7),
    )

    def __init__(self, config: "PipelineConfig") -> None:
        """Store ``config`` and prepare the HTTP session and request URL.

        Args:
            config: Pipeline settings. The ``azure_openai_api_key``,
                ``azure_openai_endpoint``, ``azure_openai_deployment_name``
                and ``azure_openai_api_version`` attributes are read.
        """
        self.config = config
        self.session = requests.Session()  # type: ignore[attr-defined]
        self.session.headers.update(
            {
                "Content-Type": "application/json",
                "api-key": config.azure_openai_api_key,
            }
        )
        self.api_url = self._build_api_url(config)

    @staticmethod
    def _build_api_url(config: "PipelineConfig") -> str:
        """Return the chat-completions URL for the configured deployment.

        Trailing slashes on the endpoint are dropped before joining, so the
        URL is well-formed whether or not the configured endpoint ends in "/".
        """
        base = str(config.azure_openai_endpoint).rstrip("/")
        return (
            f"{base}/openai/deployments/{config.azure_openai_deployment_name}"
            f"/chat/completions?api-version={config.azure_openai_api_version}"
        )

    @classmethod
    def _match_category(cls, text: str) -> Optional[str]:
        """Return the known category mentioned earliest in ``text``, if any.

        Matching is case-insensitive and anchored on word boundaries so that
        short labels such as "AI" or "ML" do not match inside other words.
        """
        best_pos: Optional[int] = None
        best: Optional[str] = None
        for category in cls._CATEGORIES:
            found = re.search(
                r"\b" + re.escape(category) + r"\b", text, re.IGNORECASE
            )
            if found and (best_pos is None or found.start() < best_pos):
                best_pos, best = found.start(), category
        return best

    @classmethod
    def _heading_priority(cls, heading: str) -> int:
        """Return the ranking weight for a section heading (5 when no rule matches)."""
        for pattern, priority in cls._HEADING_PRIORITIES:
            if pattern.search(heading):
                return priority
        return 5

    @staticmethod
    def _extract_factual_pairs(text: str) -> "list[tuple[str, str]]":
        """Return ``(fact, counterfact)`` pairs found in ``text``.

        Statements marked with ✅/❌ are preferred. When that form yields no
        complete pair, plain ``Factual:`` / ``Counterfactual:`` labels (the
        format the current prompt asks for) are parsed instead.
        """
        facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", text, re.DOTALL)
        counterfacts = re.findall(
            r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", text, re.DOTALL
        )
        if not (facts and counterfacts):
            facts = re.findall(
                r"\bFactual:\s*(.*?)(?=\bCounterfactual:|\Z)", text, re.DOTALL
            )
            counterfacts = re.findall(
                r"\bCounterfactual:\s*(.*?)(?=\bFactual:|\Z)", text, re.DOTALL
            )
        # zip stops at the shorter list, so unpaired statements are dropped.
        return [
            (fact.strip(), counter.strip())
            for fact, counter in zip(facts, counterfacts)
        ]

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token count, assuming four characters per token."""
        return len(text) // 4

    def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
        """Keep leading paragraphs of ``content`` while they fit in ``max_tokens``.

        Paragraphs are separated by blank lines. Collection stops at the first
        paragraph that would exceed the budget, so an oversized first paragraph
        yields an empty string.
        """
        kept: list[str] = []
        used = 0
        for paragraph in content.split("\n\n"):
            cost = self.estimate_tokens(paragraph)
            if used + cost > max_tokens:
                break
            kept.append(paragraph)
            used += cost
        return "\n\n".join(kept).strip()

    def _split_sections(self, content: str) -> "list[Section]":
        """Split markdown into sections, starting a new one at every header line.

        Text before the first header is collected under the heading
        "Introduction" with priority 10. A section's content keeps its own
        header line as the first line.
        """
        sections = []
        current = {"heading": "Introduction", "content": "", "priority": 10}
        for line in content.splitlines():
            if self._HEADER_RE.match(line):
                if current["content"].strip():
                    sections.append(current)
                heading = self._HEADER_RE.sub("", line)
                current = {
                    "heading": heading,
                    "content": line + "\n",
                    "priority": self._heading_priority(heading),
                }
            else:
                current["content"] += line + "\n"
                # A Rust code fence raises the section to at least priority 8.
                if "```rust" in line or "```no_run" in line:
                    current["priority"] = max(current["priority"], 8)
        if current["content"].strip():
            sections.append(current)
        return sections

    def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
        """Shrink markdown to roughly ``max_tokens``, keeping high-priority sections.

        Content that already fits is returned unchanged. Otherwise the text is
        split on header lines, sections are ranked by heading keywords, and
        sections are emitted highest priority first until the budget is used.

        Note: each emitted section is prefixed with ``## <heading>`` and the
        section content still starts with its original header line.
        """
        if not content:
            return ""
        if self.estimate_tokens(content) <= max_tokens:
            return content

        sections = self._split_sections(content)
        # list.sort is stable, so equal-priority sections keep document order.
        sections.sort(key=lambda section: section["priority"], reverse=True)

        parts: list[str] = []
        tokens_used = 0
        for section in sections:
            section_text = f'## {section["heading"]}\n{section["content"]}\n'
            section_tokens = self.estimate_tokens(section_text)
            if tokens_used + section_tokens <= max_tokens:
                parts.append(section_text)
                tokens_used += section_tokens
            elif tokens_used < max_tokens - 100:
                # More than 100 tokens remain: add a character-truncated
                # piece of this section and stop. The section is known to be
                # longer than the remaining budget at this point.
                max_chars = (max_tokens - tokens_used) * 4
                parts.append(section_text[:max_chars] + "...")
                break
        return "".join(parts)

    def clean_output(self, output: str, task: str = "general") -> str:
        """Normalise raw model output for the given ``task``.

        Args:
            output: Raw model text; anything after ``<|end|>`` is discarded.
            task: "classification" returns a canonical category name (or
                "Unknown"); "factual_pairs" returns re-formatted ✅/❌ pairs;
                any other value returns the stripped text.
        """
        if not output:
            return ""

        output = output.split("<|end|>")[0].strip()

        if task == "classification":
            return self._match_category(output) or "Unknown"

        if task == "factual_pairs":
            return "\n\n".join(
                f"✅ Factual: {fact}\n❌ Counterfactual: {counter}"
                for fact, counter in self._extract_factual_pairs(output)
            )

        return output

    def call_azure_openai(
        self,
        prompt: str,
        temperature: float = 0.2,
        max_tokens: int = 256,
        system_message: str = _DEFAULT_SYSTEM_MESSAGE,
    ) -> Optional[str]:
        """Send one chat-completion request and return the reply text.

        Returns:
            The first choice's message content, or ``None`` when the response
            status is not 200 or any exception occurs (both cases are logged).
        """
        payload = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": 1.0,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
        }
        try:
            response = self.session.post(self.api_url, json=payload, timeout=60)
            if response.status_code != 200:
                logging.error(
                    "Azure OpenAI API error: %s - %s",
                    response.status_code,
                    response.text,
                )
                return None
            return response.json()["choices"][0]["message"]["content"]
        except Exception as exc:
            # Best-effort boundary: callers treat None as "no answer".
            logging.error("Error calling Azure OpenAI: %s", exc)
            return None

    def validate_and_retry(
        self,
        prompt: str,
        validation_func: Callable[[str], bool],
        temperature: float = 0.2,
        max_tokens: int = 256,
        retries: int = 4,
        system_message: str = _DEFAULT_SYSTEM_MESSAGE,
    ) -> Optional[str]:
        """Call the model until ``validation_func`` accepts the reply.

        After a rejected or empty reply the temperature is raised by 0.1
        (capped at 0.8) and the next attempt follows a one-second pause. If an
        attempt raises, the pause is ``2 ** attempt`` seconds instead.

        Returns:
            The first accepted reply, or ``None`` after ``retries`` attempts.
        """
        for attempt in range(retries):
            has_more = attempt < retries - 1
            try:
                result = self.call_azure_openai(
                    prompt, temperature, max_tokens, system_message
                )
                accepted = bool(result) and validation_func(result)
            except Exception as exc:
                logging.warning("Attempt %s failed: %s", attempt + 1, exc)
                if has_more:
                    time.sleep(2 ** attempt)  # Exponential backoff
                continue
            if accepted:
                return result
            if has_more:
                temperature = min(0.8, temperature + 0.1)
                time.sleep(1)
        return None

    def simplify_prompt(self, prompt: str) -> str:
        """Collapse redundant whitespace and cap the prompt at 8000 characters."""
        prompt = re.sub(r'\n\s*\n', '\n\n', prompt)
        prompt = re.sub(r' +', ' ', prompt)
        if len(prompt) > 8000:
            prompt = prompt[:8000] + "..."
        return prompt.strip()

    def validate_classification(self, result: str) -> bool:
        """Return True when ``result`` mentions a known category as a whole word."""
        return self._match_category(result) is not None

    def validate_factual_pairs(self, result: str) -> bool:
        """Return True when ``result`` holds a factual and a counterfactual part.

        Both the ✅/❌ marker form and the plain ``Factual:`` /
        ``Counterfactual:`` label form requested by the prompt are accepted.
        """
        if "✅" in result and "❌" in result:
            return True
        return bool(
            re.search(r"\bFactual:", result)
            and re.search(r"\bCounterfactual:", result)
        )

    def enrich_crate(self, crate: "CrateMetadata") -> "EnrichedCrate":
        """Build an ``EnrichedCrate`` from ``crate`` plus model-generated fields.

        The README summary and use case are only produced when the crate has a
        README; factual pairs and the score are always requested.
        """
        enriched = EnrichedCrate(**crate.__dict__)

        if crate.readme:
            readme_content = self.smart_truncate(crate.readme, 2000)
            prompt = f"""Summarize this Rust crate's README in 2-3 sentences:

{readme_content}

Summary:"""
            enriched.readme_summary = self.call_azure_openai(
                prompt, temperature=0.3, max_tokens=150
            )
            enriched.use_case = self.classify_use_case(
                crate, enriched.readme_summary or ""
            )

        enriched.factual_counterfactual = self.generate_factual_pairs(crate)
        enriched.score = self.score_crate(crate)
        return enriched

    @staticmethod
    def _format_features(features: object) -> Optional[str]:
        """Render features as a bullet list, or return None for unknown shapes.

        A mapping is rendered as ``- name: dep1, dep2``; a list or tuple is
        rendered as one bullet per entry.
        """
        if isinstance(features, dict):
            rows = []
            for name, deps in features.items():
                if deps is None:
                    deps = ()
                elif isinstance(deps, str):
                    deps = (deps,)
                rows.append(f"- {name}: {', '.join(map(str, deps))}")
            return "\n".join(rows)
        if isinstance(features, (list, tuple)):
            return "\n".join(f"- {feature}" for feature in features)
        return None

    def summarize_features(self, crate: "CrateMetadata") -> str:
        """Ask the model for a short summary of the crate's feature flags.

        Returns a fixed fallback string when there are no features, when the
        features value is neither a mapping nor a list/tuple, or when the
        model call yields nothing.
        """
        if not crate.features:
            return "No specific features documented."

        features_text = self._format_features(crate.features)
        if features_text is None:
            return "Features format not recognized."

        prompt = f"""Summarize the key features of this Rust crate in 2-3 sentences:

{features_text}

Summary:"""

        result = self.call_azure_openai(prompt, temperature=0.3, max_tokens=150)
        return result or "Features analysis unavailable."

    def classify_use_case(self, crate: "CrateMetadata", readme_summary: str) -> str:
        """Return the canonical category the model assigns to ``crate``.

        The reply is reduced to one of the known category names; "Unknown" is
        returned when no validated reply is obtained.
        """
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Summary: {readme_summary}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Classify this Rust crate into one of these categories:
- AI: Machine learning, AI, neural networks
- Database: Database drivers, ORMs, data storage
- Web Framework: Web servers, HTTP, REST APIs
- Networking: Network protocols, communication
- Serialization: Data formats, JSON, binary
- Utilities: General utilities, helpers
- DevTools: Development tools, debugging
- ML: Machine learning, statistics
- Cryptography: Security, encryption, hashing
- Unknown: Doesn't fit other categories

{context}

Category:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_classification,
            temperature=0.1,
            max_tokens=50,
        )
        if not result:
            return "Unknown"
        # Return the bare category name rather than the model's full sentence.
        return self._match_category(result) or "Unknown"

    def generate_factual_pairs(self, crate: "CrateMetadata") -> str:
        """Ask the model for factual/counterfactual statement pairs about ``crate``."""
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Generate 2-3 factual statements about this Rust crate, followed by their counterfactual opposites.

Format each pair as:
Factual: [true statement about the crate]
Counterfactual: [opposite/incorrect statement]

{context}

Factual/Counterfactual pairs:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_factual_pairs,
            temperature=0.4,
            max_tokens=300,
        )
        return result or "Factual analysis unavailable."

    def score_crate(self, crate: "CrateMetadata") -> float:
        """Return a model-assigned quality score clamped to the range 1.0-10.0.

        The first number found in the reply is used; 5.0 is returned when the
        call yields nothing or the reply contains no number.
        """
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Downloads: {crate.downloads}
GitHub Stars: {crate.github_stars}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Rate this Rust crate on a scale of 1-10 based on:
- Popularity (downloads, stars)
- Documentation quality
- Usefulness and relevance
- Community adoption

{context}

Score (1-10):"""

        result = self.call_azure_openai(prompt, temperature=0.1, max_tokens=10)
        found = re.search(r'(\d+(?:\.\d+)?)', result) if result else None
        if found is None:
            return 5.0  # Default score
        return min(10.0, max(1.0, float(found.group(1))))

    def batch_process_prompts(
        self,
        prompts: "list[tuple[str, float, int]]",
        batch_size: int = 4,
    ) -> "list[Optional[str]]":
        """Run ``(prompt, temperature, max_tokens)`` tuples one after another.

        Prompts are walked in groups of ``batch_size``, but every request is
        still issued sequentially with a 0.1 s pause after each one.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results: "list[Optional[str]]" = []
        for start in range(0, len(prompts), batch_size):
            for prompt, temp, max_tokens in prompts[start:start + batch_size]:
                results.append(self.call_azure_openai(prompt, temp, max_tokens))
                time.sleep(0.1)  # Rate limiting
        return results

    def smart_context_management(
        self, context_history: "list[str]", new_prompt: str
    ) -> str:
        """Join history and ``new_prompt``, dropping old history when too long.

        When the combined text is estimated above 6000 tokens, only the two
        most recent history entries are kept in front of the new prompt.
        """
        max_context_tokens = 6000
        total_context = "\n".join(context_history) + "\n" + new_prompt
        if self.estimate_tokens(total_context) <= max_context_tokens:
            return total_context
        return "\n".join(context_history[-2:]) + "\n" + new_prompt