rust-crate-pipeline: rust_crate_pipeline-1.4.0-py3-none-any.whl → rust_crate_pipeline-1.4.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/azure_ai_processing.py ADDED

@@ -0,0 +1,462 @@
# azure_ai_processing.py
import re
import time
import logging
import json
from typing import TypedDict, Union, Optional
from collections.abc import Callable

import requests  # type: ignore # May lack stubs in some environments
from .config import PipelineConfig, CrateMetadata, EnrichedCrate  # Ensure these are defined and correct


class Section(TypedDict, total=True):
    heading: str
    content: str
    priority: int


class AzureOpenAIEnricher:
    def __init__(self, config: PipelineConfig) -> None:
        self.config = config
        self.session = requests.Session()  # type: ignore[attr-defined]
        self.session.headers.update({
            "Content-Type": "application/json",
            "api-key": config.azure_openai_api_key
        })

        # Construct the Azure OpenAI API URL
        self.api_url = f"{config.azure_openai_endpoint}openai/deployments/{config.azure_openai_deployment_name}/chat/completions"
        self.api_url += f"?api-version={config.azure_openai_api_version}"

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation (4 characters per token)"""
        return len(text) // 4

    def truncate_content(self, content: str, max_tokens: int = 1000) -> str:
        """Truncate content to fit within token limit"""
        paragraphs = content.split("\n\n")
        result, current_tokens = "", 0

        for para in paragraphs:
            tokens = self.estimate_tokens(para)
            if current_tokens + tokens <= max_tokens:
                result += para + "\n\n"
                current_tokens += tokens
            else:
                break
        return result.strip()

    def smart_truncate(self, content: str, max_tokens: int = 1000) -> str:
        """Intelligently truncate content to preserve the most important parts"""
        if not content:
            return ""

        # If content is short enough, return it all
        if self.estimate_tokens(content) <= max_tokens:
            return content

        # Split into sections based on markdown headers
        sections: list[Section] = []
        current_section: Section = {
            "heading": "Introduction",
            "content": "",
            "priority": 10,
        }

        for line in content.splitlines():
            if re.match(r"^#+\s+", line):  # It's a header
                # Save previous section if not empty
                if current_section["content"].strip():
                    sections.append(current_section)

                # Create new section with appropriate priority
                heading = re.sub(r"^#+\s+", "", line)
                priority = 5  # Default priority

                # Assign priority based on content type
                if re.search(r"\b(usage|example|getting started)\b", heading, re.I):
                    priority = 10
                elif re.search(r"\b(feature|overview|about)\b", heading, re.I):
                    priority = 9
                elif re.search(r"\b(install|setup|config)\b", heading, re.I):
                    priority = 8
                elif re.search(r"\b(api|interface)\b", heading, re.I):
                    priority = 7

                current_section = {
                    "heading": heading,
                    "content": line + "\n",
                    "priority": priority,
                }
            else:
                current_section["content"] += line + "\n"

                # Boost priority if code block is found
                if "```rust" in line or "```no_run" in line:
                    current_section["priority"] = max(current_section["priority"], 8)

        # Add the last section
        if current_section["content"].strip():
            sections.append(current_section)

        # Sort sections by priority (highest first)
        sections.sort(key=lambda x: x["priority"], reverse=True)

        # Build the result, respecting token limits
        result = ""
        tokens_used = 0

        for section in sections:
            section_text = f'## {section["heading"]}\n{section["content"]}\n'
            section_tokens = self.estimate_tokens(section_text)

            if tokens_used + section_tokens <= max_tokens:
                result += section_text
                tokens_used += section_tokens
            elif tokens_used < max_tokens - 100:  # If we can fit a truncated version
                # Take what we can
                remaining_tokens = max_tokens - tokens_used
                # Simple truncation by characters
                max_chars = remaining_tokens * 4
                if len(section_text) > max_chars:
                    result += section_text[:max_chars] + "..."
                else:
                    result += section_text
                break

        return result

    def clean_output(self, output: str, task: str = "general") -> str:
        """Task-specific output cleaning"""
        if not output:
            return ""

        # Remove any remaining prompt artifacts
        output = output.split("<|end|>")[0].strip()

        if task == "classification":
            # For classification tasks, extract just the category
            categories = [
                "AI",
                "Database",
                "Web Framework",
                "Networking",
                "Serialization",
                "Utilities",
                "DevTools",
                "ML",
                "Cryptography",
                "Unknown",
            ]
            for category in categories:
                if re.search(
                    r"\b" + re.escape(category) + r"\b", output, re.IGNORECASE
                ):
                    return category
            return "Unknown"

        elif task == "factual_pairs":
            # For factual pairs, ensure proper formatting
            pairs: list[str] = []
            facts = re.findall(r"✅\s*Factual:?\s*(.*?)(?=❌|\Z)", output, re.DOTALL)
            counterfacts = re.findall(
                r"❌\s*Counterfactual:?\s*(.*?)(?=✅|\Z)", output, re.DOTALL
            )

            # Pair them up
            for i in range(min(len(facts), len(counterfacts))):
                pairs.append(
                    f"✅ Factual: {facts[i].strip()}\n"
                    f"❌ Counterfactual: {counterfacts[i].strip()}"
                )

            return "\n\n".join(pairs)

        return output

    def call_azure_openai(
        self,
        prompt: str,
        temperature: float = 0.2,
        max_tokens: int = 256,
        system_message: str = "You are a helpful AI assistant that analyzes Rust crates and provides insights."
    ) -> Optional[str]:
        """Call Azure OpenAI API"""
        try:
            payload = {
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                "temperature": temperature,
                "max_tokens": max_tokens,
                "top_p": 1.0,
                "frequency_penalty": 0.0,
                "presence_penalty": 0.0
            }

            response = self.session.post(
                self.api_url,
                json=payload,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                return result["choices"][0]["message"]["content"]
            else:
                logging.error(f"Azure OpenAI API error: {response.status_code} - {response.text}")
                return None

        except Exception as e:
            logging.error(f"Error calling Azure OpenAI: {e}")
            return None

    def validate_and_retry(
        self,
        prompt: str,
        validation_func: Callable[[str], bool],
        temperature: float = 0.2,
        max_tokens: int = 256,
        retries: int = 4,
        system_message: str = "You are a helpful AI assistant that analyzes Rust crates and provides insights."
    ) -> Optional[str]:
        """Run prompt with validation and retry logic"""
        for attempt in range(retries):
            try:
                result = self.call_azure_openai(prompt, temperature, max_tokens, system_message)

                if result and validation_func(result):
                    return result

                # If validation failed, try with a different temperature
                if attempt < retries - 1:
                    temperature = min(0.8, temperature + 0.1)
                    time.sleep(1)  # Brief delay between retries

            except Exception as e:
                logging.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff

        return None

    def simplify_prompt(self, prompt: str) -> str:
        """Simplify complex prompts for better Azure OpenAI performance"""
        # Remove excessive whitespace and newlines
        prompt = re.sub(r'\n\s*\n', '\n\n', prompt)
        prompt = re.sub(r' +', ' ', prompt)

        # Truncate if too long (Azure OpenAI has limits)
        if len(prompt) > 8000:  # Conservative limit
            prompt = prompt[:8000] + "..."

        return prompt.strip()

    def validate_classification(self, result: str) -> bool:
        """Validate classification output"""
        valid_categories = [
            "AI", "Database", "Web Framework", "Networking",
            "Serialization", "Utilities", "DevTools", "ML",
            "Cryptography", "Unknown"
        ]
        return any(cat.lower() in result.lower() for cat in valid_categories)

    def validate_factual_pairs(self, result: str) -> bool:
        """Validate factual pairs output"""
        return "✅" in result and "❌" in result and len(result.split("✅")) > 1

    def enrich_crate(self, crate: CrateMetadata) -> EnrichedCrate:
        """Enrich crate with AI-generated insights using Azure OpenAI"""
        enriched = EnrichedCrate(**crate.__dict__)

        # Generate readme summary
        if crate.readme:
            readme_content = self.smart_truncate(crate.readme, 2000)
            prompt = f"""Summarize this Rust crate's README in 2-3 sentences:

{readme_content}

Summary:"""

            enriched.readme_summary = self.call_azure_openai(
                prompt, temperature=0.3, max_tokens=150
            )

        # Classify use case
        if crate.readme:
            enriched.use_case = self.classify_use_case(crate, enriched.readme_summary or "")

        # Generate factual pairs
        enriched.factual_counterfactual = self.generate_factual_pairs(crate)

        # Score the crate
        enriched.score = self.score_crate(crate)

        return enriched

    def summarize_features(self, crate: CrateMetadata) -> str:
        """Summarize crate features using Azure OpenAI"""
        if not crate.features:
            return "No specific features documented."

        # Handle both dict and list feature formats
        if isinstance(crate.features, dict):
            features_text = "\n".join([
                f"- {feature}: {', '.join(versions)}"
                for feature, versions in crate.features.items()
            ])
        elif isinstance(crate.features, list):
            features_text = "\n".join([
                f"- {feature}" if isinstance(feature, str) else f"- {str(feature)}"
                for feature in crate.features
            ])
        else:
            return "Features format not recognized."

        prompt = f"""Summarize the key features of this Rust crate in 2-3 sentences:

{features_text}

Summary:"""

        result = self.call_azure_openai(prompt, temperature=0.3, max_tokens=150)
        return result or "Features analysis unavailable."

    def classify_use_case(self, crate: CrateMetadata, readme_summary: str) -> str:
        """Classify crate use case using Azure OpenAI"""
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Summary: {readme_summary}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Classify this Rust crate into one of these categories:
- AI: Machine learning, AI, neural networks
- Database: Database drivers, ORMs, data storage
- Web Framework: Web servers, HTTP, REST APIs
- Networking: Network protocols, communication
- Serialization: Data formats, JSON, binary
- Utilities: General utilities, helpers
- DevTools: Development tools, debugging
- ML: Machine learning, statistics
- Cryptography: Security, encryption, hashing
- Unknown: Doesn't fit other categories

{context}

Category:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_classification,
            temperature=0.1,
            max_tokens=50
        )

        return result or "Unknown"

    def generate_factual_pairs(self, crate: CrateMetadata) -> str:
        """Generate factual/counterfactual pairs using Azure OpenAI"""
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Generate 2-3 factual statements about this Rust crate, followed by their counterfactual opposites.

Format each pair as:
✅ Factual: [true statement about the crate]
❌ Counterfactual: [opposite/incorrect statement]

{context}

Factual/Counterfactual pairs:"""

        result = self.validate_and_retry(
            prompt,
            self.validate_factual_pairs,
            temperature=0.4,
            max_tokens=300
        )

        return result or "Factual analysis unavailable."

    def score_crate(self, crate: CrateMetadata) -> float:
        """Score crate quality using Azure OpenAI"""
        context = f"""
Crate: {crate.name}
Description: {crate.description}
Downloads: {crate.downloads}
GitHub Stars: {crate.github_stars}
Keywords: {', '.join(crate.keywords)}
Categories: {', '.join(crate.categories)}
"""

        prompt = f"""Rate this Rust crate on a scale of 1-10 based on:
- Popularity (downloads, stars)
- Documentation quality
- Usefulness and relevance
- Community adoption

{context}

Score (1-10):"""

        result = self.call_azure_openai(prompt, temperature=0.1, max_tokens=10)

        if result:
            # Extract numeric score
            score_match = re.search(r'(\d+(?:\.\d+)?)', result)
            if score_match:
                try:
                    score = float(score_match.group(1))
                    return min(10.0, max(1.0, score))  # Clamp between 1-10
                except ValueError:
                    pass

        return 5.0  # Default score

    def batch_process_prompts(
        self,
        prompts: "list[tuple[str, float, int]]",
        batch_size: int = 4
    ) -> "list[Optional[str]]":
        """Process multiple prompts in batches"""
        results: "list[Optional[str]]" = []

        for i in range(0, len(prompts), batch_size):
            batch = prompts[i:i + batch_size]
            batch_results: "list[Optional[str]]" = []

            for prompt_tuple in batch:
                prompt, temp, max_tokens = prompt_tuple
                result = self.call_azure_openai(prompt, temp, max_tokens)
                batch_results.append(result)
                time.sleep(0.1)  # Rate limiting

            results.extend(batch_results)

        return results

    def smart_context_management(
        self, context_history: "list[str]", new_prompt: str
    ) -> str:
        """Manage context for long conversations"""
        # For Azure OpenAI, we can be more generous with context
        # but still need to manage it carefully

        total_context = "\n".join(context_history) + "\n" + new_prompt
        max_context_tokens = 6000  # Conservative limit for Azure OpenAI

        if self.estimate_tokens(total_context) <= max_context_tokens:
            return total_context

        # If too long, keep most recent context
        recent_context = context_history[-2:] if len(context_history) >= 2 else context_history
        return "\n".join(recent_context) + "\n" + new_prompt
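For orientation, here is a minimal usage sketch of the new enricher, assuming the `PipelineConfig`, `CrateMetadata`, and `EnrichedCrate` definitions from the config.py changes below; the crate values are made-up placeholders, and the sketch itself is not part of the package:

# Hypothetical usage sketch (not shipped in the wheel).
from rust_crate_pipeline.config import PipelineConfig, CrateMetadata
from rust_crate_pipeline.azure_ai_processing import AzureOpenAIEnricher

config = PipelineConfig()  # Azure endpoint/key/deployment fall back to the dataclass defaults
enricher = AzureOpenAIEnricher(config)

crate = CrateMetadata(
    name="example-crate",  # placeholder metadata, not real registry data
    version="0.1.0",
    description="An example crate",
    repository="https://example.invalid/example-crate",
    keywords=["example"],
    categories=["utilities"],
    readme="# example-crate\n\n## Usage\nSome usage notes.",
    downloads=1234,
)

enriched = enricher.enrich_crate(crate)  # EnrichedCrate with readme_summary, use_case, score
print(enriched.use_case, enriched.score)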
rust_crate_pipeline/config.py CHANGED
@@ -1,21 +1,27 @@
 # config.py
 import os
 import warnings
-from dataclasses import dataclass, field
-from typing import
+from dataclasses import dataclass, field, asdict
+from typing import Any, Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Dict, List

 # Filter Pydantic deprecation warnings from dependencies
 # Rule Zero Compliance: Suppress third-party warnings while maintaining awareness
-warnings.filterwarnings(
-
-
-
+warnings.filterwarnings(
+    "ignore",
+    message=".*Support for class-based `config` is deprecated.*",
+    category=DeprecationWarning,
+    module="pydantic._internal._config",
+)


 @dataclass
 class PipelineConfig:
     model_path: str = os.path.expanduser(
-        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+    )
     max_tokens: int = 256
     model_token_limit: int = 4096
     prompt_token_margin: int = 3000
@@ -24,11 +30,20 @@ class PipelineConfig:
     github_token: str = os.getenv("GITHUB_TOKEN", "")
     cache_ttl: int = 3600  # 1 hour
     batch_size: int = 10
-    n_workers: int = 4
-    # Enhanced scraping configuration
+    n_workers: int = 4  # Enhanced scraping configuration
     enable_crawl4ai: bool = True
-    crawl4ai_model: str =
+    crawl4ai_model: str = os.path.expanduser(
+        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+    )
     crawl4ai_timeout: int = 30
+    output_path: str = "output"
+
+    # Azure OpenAI Configuration
+    use_azure_openai: bool = True
+    azure_openai_endpoint: str = "https://david-mc08tirc-eastus2.services.ai.azure.com/"
+    azure_openai_api_key: str = "2hw0jjqwjtKke7DMGiJSPtlj6GhuLCNdQWPXoDGN2I3JMvzp4PmGJQQJ99BFACHYHv6XJ3w3AAAAACOGFPYA"
+    azure_openai_deployment_name: str = "gpt-4o"  # or your specific deployment name
+    azure_openai_api_version: str = "2024-02-15-preview"


 @dataclass
@@ -37,30 +52,33 @@ class CrateMetadata:
     version: str
     description: str
     repository: str
-    keywords: List[str]
-    categories: List[str]
+    keywords: "List[str]"
+    categories: "List[str]"
     readme: str
     downloads: int
     github_stars: int = 0
-    dependencies: List[Dict[str, Any]] = field(default_factory=list)
-    features:
-    code_snippets: List[str] = field(default_factory=list)
-    readme_sections: Dict[str, str] = field(default_factory=dict)
-    librs_downloads:
+    dependencies: "List[Dict[str, Any]]" = field(default_factory=list)
+    features: "Dict[str, List[str]]" = field(default_factory=dict)
+    code_snippets: "List[str]" = field(default_factory=list)
+    readme_sections: "Dict[str, str]" = field(default_factory=dict)
+    librs_downloads: Union[int, None] = None
     source: str = "crates.io"
     # Enhanced scraping fields
-    enhanced_scraping: Dict[str, Any] = field(default_factory=dict)
-    enhanced_features: List[str] = field(default_factory=list)
-    enhanced_dependencies: List[str] = field(default_factory=list)
+    enhanced_scraping: "Dict[str, Any]" = field(default_factory=dict)
+    enhanced_features: "List[str]" = field(default_factory=list)
+    enhanced_dependencies: "List[str]" = field(default_factory=list)
+
+    def to_dict(self) -> "Dict[str, Any]":
+        return asdict(self)


 @dataclass
 class EnrichedCrate(CrateMetadata):
-    readme_summary:
-    feature_summary:
-    use_case:
-    score:
-    factual_counterfactual:
-    source_analysis:
-    user_behavior:
-    security:
+    readme_summary: Union[str, None] = None
+    feature_summary: Union[str, None] = None
+    use_case: Union[str, None] = None
+    score: Union[float, None] = None
+    factual_counterfactual: Union[str, None] = None
+    source_analysis: Union["Dict[str, Any]", None] = None
+    user_behavior: Union["Dict[str, Any]", None] = None
+    security: Union["Dict[str, Any]", None] = None
rust_crate_pipeline/core/__init__.py ADDED

@@ -0,0 +1,19 @@
"""
Core Sigil Protocol Components

This module contains the foundational components for the Sigil Protocol
implementation, providing shared functionality across all pipeline variants.
"""

from .sacred_chain import SacredChainBase, SacredChainTrace, TrustVerdict
from .canon_registry import CanonRegistry, CanonEntry
from .irl_engine import IRLEngine

__all__ = [
    "SacredChainBase",
    "SacredChainTrace",
    "TrustVerdict",
    "CanonRegistry",
    "CanonEntry",
    "IRLEngine",
]
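With these re-exports, downstream code can import the core components from `rust_crate_pipeline.core` directly instead of reaching into the submodules. A one-line sketch:

# Sketch: these names resolve via the re-exports in core/__init__.py above.
from rust_crate_pipeline.core import IRLEngine, CanonRegistry, TrustVerdict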