rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. rust_crate_pipeline/__init__.py +18 -27
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +718 -596
  4. rust_crate_pipeline/analysis.py +330 -363
  5. rust_crate_pipeline/azure_ai_processing.py +462 -0
  6. rust_crate_pipeline/config.py +46 -28
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +108 -112
  14. rust_crate_pipeline/main.py +329 -109
  15. rust_crate_pipeline/network.py +317 -308
  16. rust_crate_pipeline/pipeline.py +300 -375
  17. rust_crate_pipeline/production_config.py +24 -27
  18. rust_crate_pipeline/progress_monitor.py +334 -0
  19. rust_crate_pipeline/scraping/__init__.py +13 -0
  20. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  21. rust_crate_pipeline/unified_llm_processor.py +637 -0
  22. rust_crate_pipeline/unified_pipeline.py +548 -0
  23. rust_crate_pipeline/utils/file_utils.py +32 -5
  24. rust_crate_pipeline/utils/logging_utils.py +21 -16
  25. rust_crate_pipeline/version.py +76 -47
  26. rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
  27. rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
  28. rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
  29. rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
  30. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
  31. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
  32. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
  33. {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/scraping/unified_scraper.py
@@ -0,0 +1,259 @@
+ import asyncio
+ import json
+ import logging
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Union
+ from pathlib import Path
+ import time
+
+ try:
+     from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, BrowserConfig, LLMConfig
+     CRAWL4AI_AVAILABLE = True
+ except ImportError:
+     CRAWL4AI_AVAILABLE = False
+     AsyncWebCrawler = None
+     CrawlerRunConfig = None
+     LLMExtractionStrategy = None
+     BrowserConfig = None
+     LLMConfig = None
+
+
+ class ScrapingError(Exception):
+     pass
+
+
+ @dataclass
+ class ScrapingResult:
+     url: str
+     title: str
+     content: str
+     structured_data: Dict[str, Any] = field(default_factory=dict)
+     quality_score: float = 0.0
+     extraction_method: str = "unknown"
+     error: Optional[str] = None
+     metadata: Dict[str, Any] = field(default_factory=dict)
+     timestamp: str = field(default_factory=lambda: str(asyncio.get_event_loop().time() if asyncio.get_event_loop().is_running() else time.time()))
+
+     def __post_init__(self) -> None:
+         if self.timestamp == "0":
+             import time
+             self.timestamp = str(time.time())
+
+
+ class UnifiedScraper:
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+         self.config = config or {}
+         self.logger = logging.getLogger(__name__)
+         self.crawler: Optional[Any] = None
+         self.browser_config: Optional[Any] = None
+         self._initialize_crawler()
+
+     def _initialize_crawler(self) -> None:
+         if not CRAWL4AI_AVAILABLE:
+             self.logger.warning("Crawl4AI not available - using basic scraping mode")
+             return
+
+         try:
+             # Configure browser for headless operation
+             self.browser_config = BrowserConfig(
+                 headless=self.config.get("headless", True),
+                 browser_type=self.config.get("browser_type", "chromium"),
+                 verbose=self.config.get("verbose", False)
+             )
+
+             self.crawler = AsyncWebCrawler(config=self.browser_config)
+             self.logger.info("✅ Crawl4AI crawler initialized successfully")
+         except Exception as e:
+             self.logger.error(f"❌ Failed to initialize Crawl4AI: {e}")
+             self.crawler = None
+
+     async def __aenter__(self) -> "UnifiedScraper":
+         if self.crawler and hasattr(self.crawler, 'start'):
+             try:
+                 await self.crawler.start()
+             except Exception as e:
+                 self.logger.warning(f"Failed to start crawler: {e}")
+         return self
+
+     async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
+         if self.crawler and hasattr(self.crawler, 'stop'):
+             try:
+                 await self.crawler.stop()
+             except Exception as e:
+                 self.logger.warning(f"Error stopping crawler: {e}")
+
+     async def scrape_url(
+         self,
+         url: str,
+         doc_type: str = "general",
+         extraction_schema: Optional[Dict[str, Any]] = None
+     ) -> ScrapingResult:
+         if not self.crawler:
+             raise ScrapingError("No crawler backend available")
+
+         try:
+             # Configure crawler run parameters
+             config_params: Dict[str, Any] = {
+                 "word_count_threshold": self.config.get("word_count_threshold", 10),
+                 "screenshot": self.config.get("screenshot", False),
+             }
+
+             # Add CSS selectors based on document type
+             if doc_type == "docs":
+                 config_params["css_selector"] = "main"
+             elif doc_type == "readme":
+                 config_params["css_selector"] = "article, .readme, main"
+
+             # Update with any additional crawl config
+             config_params.update(self.config.get("crawl_config", {}))
+
+             crawl_config = CrawlerRunConfig(**config_params)
+
+             # Set up extraction strategy if schema provided
+             extraction_strategy = None
+             if extraction_schema and CRAWL4AI_AVAILABLE:
+                 # Get LLM configuration from config or use defaults
+                 llm_provider = self.config.get("llm_provider", "ollama")
+                 llm_api_base = self.config.get("llm_api_base", "http://localhost:11434")
+                 llm_model = self.config.get("llm_model", "deepseek-coder:6.7b")
+                 llm_api_token = self.config.get("llm_api_token", "no-token-needed")
+
+                 # Create LLM config
+                 llm_config = LLMConfig(
+                     provider=llm_provider,
+                     api_token=llm_api_token,
+                     api_base=llm_api_base,
+                     model=llm_model,
+                     max_tokens=self.config.get("max_tokens", 2048),
+                     temperature=self.config.get("temperature", 0.7)
+                 )
+
+                 extraction_strategy = LLMExtractionStrategy(
+                     llm_config=llm_config,
+                     schema=extraction_schema,
+                     extraction_type="schema",
+                     instruction=f"Extract structured data from this {doc_type} content according to the provided schema."
+                 )
+
+             # Run the crawl
+             result = await self.crawler.arun(
+                 url=url,
+                 config=crawl_config,
+                 extraction_strategy=extraction_strategy
+             )
+
+             # Handle result (Crawl4AI returns direct result, not container)
+             if not result:
+                 raise ScrapingError("Crawl returned no result")
+
+             if not result.success:
+                 error_message = getattr(result, 'error_message', 'Crawl was not successful')
+                 raise ScrapingError(f"Crawl failed: {error_message}")
+
+             markdown_content = getattr(result, 'markdown', '') or ""
+             extracted_content = getattr(result, 'extracted_content', None)
+
+             structured_data = self._process_extracted_content(extracted_content)
+             quality_score = self._calculate_quality_score(markdown_content, structured_data)
+
+             return ScrapingResult(
+                 url=url,
+                 title=self._extract_title(markdown_content),
+                 content=markdown_content,
+                 structured_data=structured_data,
+                 quality_score=quality_score,
+                 extraction_method="crawl4ai",
+                 metadata={
+                     "doc_type": doc_type,
+                     "content_length": len(markdown_content),
+                     "has_structured_data": bool(structured_data),
+                     "crawl_success": result.success,
+                 }
+             )
+
+         except Exception as e:
+             self.logger.error(f"Scraping error for {url}: {e}")
+             raise ScrapingError(f"Failed to scrape {url}: {str(e)}")
+
+     async def scrape_crate_documentation(self, crate_name: str) -> Dict[str, ScrapingResult]:
+         results: Dict[str, ScrapingResult] = {}
+
+         urls = {
+             "crates_io": f"https://crates.io/crates/{crate_name}",
+             "docs_rs": f"https://docs.rs/{crate_name}",
+             "lib_rs": f"https://lib.rs/crates/{crate_name}",
+         }
+
+         for source, url in urls.items():
+             try:
+                 result = await self.scrape_url(url, doc_type="docs")
+                 results[source] = result
+             except ScrapingError as e:
+                 self.logger.warning(f"Failed to scrape {source} for {crate_name}: {e}")
+                 results[source] = ScrapingResult(
+                     url=url,
+                     title=f"{crate_name} - {source}",
+                     content="",
+                     error=str(e),
+                     extraction_method="failed"
+                 )
+
+         return results
+
+     def _process_extracted_content(
+         self, content: Optional[Union[str, Dict[str, Any]]]
+     ) -> Dict[str, Any]:
+         if not content:
+             return {}
+
+         if isinstance(content, str):
+             try:
+                 return json.loads(content)
+             except json.JSONDecodeError:
+                 return {"raw_content": content}
+
+         return content if isinstance(content, dict) else {}
+
+     def _calculate_quality_score(
+         self, content: str, structured_data: Dict[str, Any]
+     ) -> float:
+         if not content:
+             return 0.0
+
+         score = 0.0
+
+         content_length = len(content)
+         if content_length > 1000:
+             score += 3.0
+         elif content_length > 500:
+             score += 2.0
+         elif content_length > 100:
+             score += 1.0
+
+         if structured_data:
+             score += 2.0
+
+         if "title" in content.lower() or "description" in content.lower():
+             score += 1.0
+
+         return min(score, 10.0)
+
+     def _extract_title(self, markdown: str) -> str:
+         lines = markdown.split('\n')
+         for line in lines[:5]:
+             if line.startswith('# '):
+                 return line[2:].strip()
+         return "Untitled"
+
+     async def close(self) -> None:
+         if self.crawler and hasattr(self.crawler, 'stop'):
+             try:
+                 await self.crawler.stop()
+             except Exception as e:
+                 self.logger.warning(f"Error closing crawler: {e}")
+
+
+ async def quick_scrape(url: str, **kwargs: Any) -> ScrapingResult:
+     async with UnifiedScraper() as scraper:
+         return await scraper.scrape_url(url, **kwargs)
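
For context, a minimal usage sketch of the scraping API this release adds (the UnifiedScraper class and quick_scrape helper shown in the diff above). It assumes rust-crate-pipeline 1.4.1 is installed together with the optional crawl4ai dependency; the crate name "serde" and the config values are illustrative only. Without crawl4ai, scrape_crate_documentation returns per-source results with the error field set, and quick_scrape raises ScrapingError.

import asyncio

from rust_crate_pipeline.scraping.unified_scraper import (
    ScrapingError,
    UnifiedScraper,
    quick_scrape,
)


async def main() -> None:
    # The async context manager starts and stops the underlying Crawl4AI crawler.
    async with UnifiedScraper(config={"headless": True, "verbose": False}) as scraper:
        # Fetches the crates.io, docs.rs, and lib.rs pages for one crate;
        # per-source failures are recorded in ScrapingResult.error rather than raised.
        docs = await scraper.scrape_crate_documentation("serde")
        for source, result in docs.items():
            print(source, result.quality_score, result.error or result.title)

    # One-shot helper; extra keyword arguments are forwarded to scrape_url.
    try:
        page = await quick_scrape("https://docs.rs/serde", doc_type="docs")
        print(page.extraction_method, len(page.content))
    except ScrapingError as exc:
        print(f"scrape failed: {exc}")


if __name__ == "__main__":
    asyncio.run(main())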