rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/scraping/unified_scraper.py
@@ -0,0 +1,259 @@
+import asyncio
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+from pathlib import Path
+import time
+
+try:
+    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, BrowserConfig, LLMConfig
+    CRAWL4AI_AVAILABLE = True
+except ImportError:
+    CRAWL4AI_AVAILABLE = False
+    AsyncWebCrawler = None
+    CrawlerRunConfig = None
+    LLMExtractionStrategy = None
+    BrowserConfig = None
+    LLMConfig = None
+
+
+class ScrapingError(Exception):
+    pass
+
+
+@dataclass
+class ScrapingResult:
+    url: str
+    title: str
+    content: str
+    structured_data: Dict[str, Any] = field(default_factory=dict)
+    quality_score: float = 0.0
+    extraction_method: str = "unknown"
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: str = field(default_factory=lambda: str(asyncio.get_event_loop().time() if asyncio.get_event_loop().is_running() else time.time()))
+
+    def __post_init__(self) -> None:
+        if self.timestamp == "0":
+            import time
+            self.timestamp = str(time.time())
+
+
+class UnifiedScraper:
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        self.config = config or {}
+        self.logger = logging.getLogger(__name__)
+        self.crawler: Optional[Any] = None
+        self.browser_config: Optional[Any] = None
+        self._initialize_crawler()
+
+    def _initialize_crawler(self) -> None:
+        if not CRAWL4AI_AVAILABLE:
+            self.logger.warning("Crawl4AI not available - using basic scraping mode")
+            return
+
+        try:
+            # Configure browser for headless operation
+            self.browser_config = BrowserConfig(
+                headless=self.config.get("headless", True),
+                browser_type=self.config.get("browser_type", "chromium"),
+                verbose=self.config.get("verbose", False)
+            )
+
+            self.crawler = AsyncWebCrawler(config=self.browser_config)
+            self.logger.info("✅ Crawl4AI crawler initialized successfully")
+        except Exception as e:
+            self.logger.error(f"❌ Failed to initialize Crawl4AI: {e}")
+            self.crawler = None
+
+    async def __aenter__(self) -> "UnifiedScraper":
+        if self.crawler and hasattr(self.crawler, 'start'):
+            try:
+                await self.crawler.start()
+            except Exception as e:
+                self.logger.warning(f"Failed to start crawler: {e}")
+        return self
+
+    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error stopping crawler: {e}")
+
+    async def scrape_url(
+        self,
+        url: str,
+        doc_type: str = "general",
+        extraction_schema: Optional[Dict[str, Any]] = None
+    ) -> ScrapingResult:
+        if not self.crawler:
+            raise ScrapingError("No crawler backend available")
+
+        try:
+            # Configure crawler run parameters
+            config_params: Dict[str, Any] = {
+                "word_count_threshold": self.config.get("word_count_threshold", 10),
+                "screenshot": self.config.get("screenshot", False),
+            }
+
+            # Add CSS selectors based on document type
+            if doc_type == "docs":
+                config_params["css_selector"] = "main"
+            elif doc_type == "readme":
+                config_params["css_selector"] = "article, .readme, main"
+
+            # Update with any additional crawl config
+            config_params.update(self.config.get("crawl_config", {}))
+
+            crawl_config = CrawlerRunConfig(**config_params)
+
+            # Set up extraction strategy if schema provided
+            extraction_strategy = None
+            if extraction_schema and CRAWL4AI_AVAILABLE:
+                # Get LLM configuration from config or use defaults
+                llm_provider = self.config.get("llm_provider", "ollama")
+                llm_api_base = self.config.get("llm_api_base", "http://localhost:11434")
+                llm_model = self.config.get("llm_model", "deepseek-coder:6.7b")
+                llm_api_token = self.config.get("llm_api_token", "no-token-needed")
+
+                # Create LLM config
+                llm_config = LLMConfig(
+                    provider=llm_provider,
+                    api_token=llm_api_token,
+                    api_base=llm_api_base,
+                    model=llm_model,
+                    max_tokens=self.config.get("max_tokens", 2048),
+                    temperature=self.config.get("temperature", 0.7)
+                )
+
+                extraction_strategy = LLMExtractionStrategy(
+                    llm_config=llm_config,
+                    schema=extraction_schema,
+                    extraction_type="schema",
+                    instruction=f"Extract structured data from this {doc_type} content according to the provided schema."
+                )
+
+            # Run the crawl
+            result = await self.crawler.arun(
+                url=url,
+                config=crawl_config,
+                extraction_strategy=extraction_strategy
+            )
+
+            # Handle result (Crawl4AI returns direct result, not container)
+            if not result:
+                raise ScrapingError("Crawl returned no result")
+
+            if not result.success:
+                error_message = getattr(result, 'error_message', 'Crawl was not successful')
+                raise ScrapingError(f"Crawl failed: {error_message}")
+
+            markdown_content = getattr(result, 'markdown', '') or ""
+            extracted_content = getattr(result, 'extracted_content', None)
+
+            structured_data = self._process_extracted_content(extracted_content)
+            quality_score = self._calculate_quality_score(markdown_content, structured_data)
+
+            return ScrapingResult(
+                url=url,
+                title=self._extract_title(markdown_content),
+                content=markdown_content,
+                structured_data=structured_data,
+                quality_score=quality_score,
+                extraction_method="crawl4ai",
+                metadata={
+                    "doc_type": doc_type,
+                    "content_length": len(markdown_content),
+                    "has_structured_data": bool(structured_data),
+                    "crawl_success": result.success,
+                }
+            )
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {e}")
+            raise ScrapingError(f"Failed to scrape {url}: {str(e)}")
+
+    async def scrape_crate_documentation(self, crate_name: str) -> Dict[str, ScrapingResult]:
+        results: Dict[str, ScrapingResult] = {}
+
+        urls = {
+            "crates_io": f"https://crates.io/crates/{crate_name}",
+            "docs_rs": f"https://docs.rs/{crate_name}",
+            "lib_rs": f"https://lib.rs/crates/{crate_name}",
+        }
+
+        for source, url in urls.items():
+            try:
+                result = await self.scrape_url(url, doc_type="docs")
+                results[source] = result
+            except ScrapingError as e:
+                self.logger.warning(f"Failed to scrape {source} for {crate_name}: {e}")
+                results[source] = ScrapingResult(
+                    url=url,
+                    title=f"{crate_name} - {source}",
+                    content="",
+                    error=str(e),
+                    extraction_method="failed"
+                )
+
+        return results
+
+    def _process_extracted_content(
+        self, content: Optional[Union[str, Dict[str, Any]]]
+    ) -> Dict[str, Any]:
+        if not content:
+            return {}
+
+        if isinstance(content, str):
+            try:
+                return json.loads(content)
+            except json.JSONDecodeError:
+                return {"raw_content": content}
+
+        return content if isinstance(content, dict) else {}
+
+    def _calculate_quality_score(
+        self, content: str, structured_data: Dict[str, Any]
+    ) -> float:
+        if not content:
+            return 0.0
+
+        score = 0.0
+
+        content_length = len(content)
+        if content_length > 1000:
+            score += 3.0
+        elif content_length > 500:
+            score += 2.0
+        elif content_length > 100:
+            score += 1.0
+
+        if structured_data:
+            score += 2.0
+
+        if "title" in content.lower() or "description" in content.lower():
+            score += 1.0
+
+        return min(score, 10.0)
+
+    def _extract_title(self, markdown: str) -> str:
+        lines = markdown.split('\n')
+        for line in lines[:5]:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled"
+
+    async def close(self) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error closing crawler: {e}")
+
+
+async def quick_scrape(url: str, **kwargs: Any) -> ScrapingResult:
+    async with UnifiedScraper() as scraper:
+        return await scraper.scrape_url(url, **kwargs)