rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +53 -2
- rust_crate_pipeline-1.3.1.dist-info/METADATA +357 -0
- rust_crate_pipeline-1.3.1.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.1.dist-info}/top_level.txt +0 -0
--- a/rust_crate_pipeline/production_config.py
+++ b/rust_crate_pipeline/production_config.py
@@ -7,61 +7,64 @@ and improve the user experience in production environments.
 
 import logging
 import os
+from typing import Any
 
 # Production logging configuration
-
+
+
+def configure_production_logging() -> None:
     """Configure logging for production to reduce verbose warnings"""
-
+
     # Don't use basicConfig here - let main.py handle it
     # Just set specific loggers to less verbose levels
-    logging.getLogger(
-    logging.getLogger(
-    logging.getLogger(
-
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests_cache").setLevel(logging.WARNING)
+
     # If PRODUCTION environment variable is set, be even quieter
-    if os.getenv(
+    if os.getenv("PRODUCTION", "false").lower() == "true":
         logging.getLogger().setLevel(logging.WARNING)
-        logging.getLogger(
+        logging.getLogger("rust_crate_pipeline").setLevel(logging.INFO)
+
 
 # Production-optimized settings
-PRODUCTION_SETTINGS = {
+PRODUCTION_SETTINGS: "dict[str, Any]" = {
     # Reduced retries to minimize warnings
-
-
-
+    "max_retries": 2,
+    "validation_retries": 2,
     # GitHub API management
-
-
-
-
-
-    'llm_max_attempts': 2,
-
+    "github_rate_limit_threshold": 100,
+    "github_critical_threshold": 50,
+    # LLM settings
+    "llm_timeout": 30,
+    "llm_max_attempts": 2,
     # Logging preferences
-
-
-
+    "quiet_mode": True,
+    "log_level": "INFO",
     # Performance settings
-
-
-
+    "batch_size": 10,
+    "checkpoint_interval": 10,
+    "cache_ttl": 3600,
 }
 
-
+
+def get_production_config() -> "dict[str, Any]":
     """Get production configuration dictionary"""
     return PRODUCTION_SETTINGS.copy()
 
-
+
+def is_production() -> bool:
     """Check if running in production mode"""
-    return os.getenv(
+    return os.getenv("PRODUCTION", "false").lower() == "true"
+
 
-def setup_production_environment():
+def setup_production_environment() -> "dict[str, Any]":
     """Set up the complete production environment"""
     configure_production_logging()
-
+
     # Set environment variables for quieter operation
-    os.environ.setdefault(
-
+    os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning")
+
     if is_production():
         print("🚀 Production mode enabled - optimized for minimal warnings")
         return get_production_config()
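For orientation, here is a minimal sketch of how the reworked `production_config.py` helpers fit together. The function names, settings keys, and the `PRODUCTION` environment variable come straight from the hunk above; the call site itself is illustrative, not part of the package.

```python
import os

from rust_crate_pipeline.production_config import (
    PRODUCTION_SETTINGS,
    setup_production_environment,
)

# Illustrative only: opt in to production behaviour before starting the pipeline.
os.environ["PRODUCTION"] = "true"

# Configures quieter logging, sets PYTHONWARNINGS, and (in production mode)
# returns a copy of the settings dict, now annotated as dict[str, Any].
config = setup_production_environment()

# The module-level defaults remain available as a typed dict.
print(config["max_retries"], PRODUCTION_SETTINGS["batch_size"])
```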
--- /dev/null
+++ b/rust_crate_pipeline/scraping/__init__.py
@@ -0,0 +1,13 @@
+"""
+Unified Scraping Module
+
+This module provides a unified interface for all web scraping operations,
+consolidating Crawl4AI integration and other scraping capabilities.
+"""
+
+from .unified_scraper import UnifiedScraper, ScrapingResult
+
+__all__ = [
+    "UnifiedScraper",
+    "ScrapingResult",
+]
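The new `scraping/__init__.py` only re-exports the scraper API, so downstream code can import from the package instead of the module. A hedged sketch, assuming the 1.3.1 wheel is installed:

```python
# Package-level names are thin re-exports of the module-level ones.
from rust_crate_pipeline.scraping import UnifiedScraper, ScrapingResult
from rust_crate_pipeline.scraping.unified_scraper import UnifiedScraper as _ModuleLevel

assert UnifiedScraper is _ModuleLevel
```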
--- /dev/null
+++ b/rust_crate_pipeline/scraping/unified_scraper.py
@@ -0,0 +1,259 @@
+import asyncio
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+from pathlib import Path
+import time
+
+try:
+    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, BrowserConfig, LLMConfig
+    CRAWL4AI_AVAILABLE = True
+except ImportError:
+    CRAWL4AI_AVAILABLE = False
+    AsyncWebCrawler = None
+    CrawlerRunConfig = None
+    LLMExtractionStrategy = None
+    BrowserConfig = None
+    LLMConfig = None
+
+
+class ScrapingError(Exception):
+    pass
+
+
+@dataclass
+class ScrapingResult:
+    url: str
+    title: str
+    content: str
+    structured_data: Dict[str, Any] = field(default_factory=dict)
+    quality_score: float = 0.0
+    extraction_method: str = "unknown"
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: str = field(default_factory=lambda: str(asyncio.get_event_loop().time() if asyncio.get_event_loop().is_running() else time.time()))
+
+    def __post_init__(self) -> None:
+        if self.timestamp == "0":
+            import time
+            self.timestamp = str(time.time())
+
+
+class UnifiedScraper:
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        self.config = config or {}
+        self.logger = logging.getLogger(__name__)
+        self.crawler: Optional[Any] = None
+        self.browser_config: Optional[Any] = None
+        self._initialize_crawler()
+
+    def _initialize_crawler(self) -> None:
+        if not CRAWL4AI_AVAILABLE:
+            self.logger.warning("Crawl4AI not available - using basic scraping mode")
+            return
+
+        try:
+            # Configure browser for headless operation
+            self.browser_config = BrowserConfig(
+                headless=self.config.get("headless", True),
+                browser_type=self.config.get("browser_type", "chromium"),
+                verbose=self.config.get("verbose", False)
+            )
+
+            self.crawler = AsyncWebCrawler(config=self.browser_config)
+            self.logger.info("✅ Crawl4AI crawler initialized successfully")
+        except Exception as e:
+            self.logger.error(f"❌ Failed to initialize Crawl4AI: {e}")
+            self.crawler = None
+
+    async def __aenter__(self) -> "UnifiedScraper":
+        if self.crawler and hasattr(self.crawler, 'start'):
+            try:
+                await self.crawler.start()
+            except Exception as e:
+                self.logger.warning(f"Failed to start crawler: {e}")
+        return self
+
+    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error stopping crawler: {e}")
+
+    async def scrape_url(
+        self,
+        url: str,
+        doc_type: str = "general",
+        extraction_schema: Optional[Dict[str, Any]] = None
+    ) -> ScrapingResult:
+        if not self.crawler:
+            raise ScrapingError("No crawler backend available")
+
+        try:
+            # Configure crawler run parameters
+            config_params: Dict[str, Any] = {
+                "word_count_threshold": self.config.get("word_count_threshold", 10),
+                "screenshot": self.config.get("screenshot", False),
+            }
+
+            # Add CSS selectors based on document type
+            if doc_type == "docs":
+                config_params["css_selector"] = "main"
+            elif doc_type == "readme":
+                config_params["css_selector"] = "article, .readme, main"
+
+            # Update with any additional crawl config
+            config_params.update(self.config.get("crawl_config", {}))
+
+            crawl_config = CrawlerRunConfig(**config_params)
+
+            # Set up extraction strategy if schema provided
+            extraction_strategy = None
+            if extraction_schema and CRAWL4AI_AVAILABLE:
+                # Get LLM configuration from config or use defaults
+                llm_provider = self.config.get("llm_provider", "ollama")
+                llm_api_base = self.config.get("llm_api_base", "http://localhost:11434")
+                llm_model = self.config.get("llm_model", "deepseek-coder:6.7b")
+                llm_api_token = self.config.get("llm_api_token", "no-token-needed")
+
+                # Create LLM config
+                llm_config = LLMConfig(
+                    provider=llm_provider,
+                    api_token=llm_api_token,
+                    api_base=llm_api_base,
+                    model=llm_model,
+                    max_tokens=self.config.get("max_tokens", 2048),
+                    temperature=self.config.get("temperature", 0.7)
+                )
+
+                extraction_strategy = LLMExtractionStrategy(
+                    llm_config=llm_config,
+                    schema=extraction_schema,
+                    extraction_type="schema",
+                    instruction=f"Extract structured data from this {doc_type} content according to the provided schema."
+                )
+
+            # Run the crawl
+            result = await self.crawler.arun(
+                url=url,
+                config=crawl_config,
+                extraction_strategy=extraction_strategy
+            )
+
+            # Handle result (Crawl4AI returns direct result, not container)
+            if not result:
+                raise ScrapingError("Crawl returned no result")
+
+            if not result.success:
+                error_message = getattr(result, 'error_message', 'Crawl was not successful')
+                raise ScrapingError(f"Crawl failed: {error_message}")
+
+            markdown_content = getattr(result, 'markdown', '') or ""
+            extracted_content = getattr(result, 'extracted_content', None)
+
+            structured_data = self._process_extracted_content(extracted_content)
+            quality_score = self._calculate_quality_score(markdown_content, structured_data)
+
+            return ScrapingResult(
+                url=url,
+                title=self._extract_title(markdown_content),
+                content=markdown_content,
+                structured_data=structured_data,
+                quality_score=quality_score,
+                extraction_method="crawl4ai",
+                metadata={
+                    "doc_type": doc_type,
+                    "content_length": len(markdown_content),
+                    "has_structured_data": bool(structured_data),
+                    "crawl_success": result.success,
+                }
+            )
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {e}")
+            raise ScrapingError(f"Failed to scrape {url}: {str(e)}")
+
+    async def scrape_crate_documentation(self, crate_name: str) -> Dict[str, ScrapingResult]:
+        results: Dict[str, ScrapingResult] = {}
+
+        urls = {
+            "crates_io": f"https://crates.io/crates/{crate_name}",
+            "docs_rs": f"https://docs.rs/{crate_name}",
+            "lib_rs": f"https://lib.rs/crates/{crate_name}",
+        }
+
+        for source, url in urls.items():
+            try:
+                result = await self.scrape_url(url, doc_type="docs")
+                results[source] = result
+            except ScrapingError as e:
+                self.logger.warning(f"Failed to scrape {source} for {crate_name}: {e}")
+                results[source] = ScrapingResult(
+                    url=url,
+                    title=f"{crate_name} - {source}",
+                    content="",
+                    error=str(e),
+                    extraction_method="failed"
+                )
+
+        return results
+
+    def _process_extracted_content(
+        self, content: Optional[Union[str, Dict[str, Any]]]
+    ) -> Dict[str, Any]:
+        if not content:
+            return {}
+
+        if isinstance(content, str):
+            try:
+                return json.loads(content)
+            except json.JSONDecodeError:
+                return {"raw_content": content}
+
+        return content if isinstance(content, dict) else {}
+
+    def _calculate_quality_score(
+        self, content: str, structured_data: Dict[str, Any]
+    ) -> float:
+        if not content:
+            return 0.0
+
+        score = 0.0
+
+        content_length = len(content)
+        if content_length > 1000:
+            score += 3.0
+        elif content_length > 500:
+            score += 2.0
+        elif content_length > 100:
+            score += 1.0
+
+        if structured_data:
+            score += 2.0
+
+        if "title" in content.lower() or "description" in content.lower():
+            score += 1.0
+
+        return min(score, 10.0)
+
+    def _extract_title(self, markdown: str) -> str:
+        lines = markdown.split('\n')
+        for line in lines[:5]:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled"
+
+    async def close(self) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error closing crawler: {e}")
+
+
+async def quick_scrape(url: str, **kwargs: Any) -> ScrapingResult:
+    async with UnifiedScraper() as scraper:
+        return await scraper.scrape_url(url, **kwargs)
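A hedged usage sketch for the new scraper, based only on the signatures in the hunk above. It assumes the optional `crawl4ai` dependency is installed (otherwise `scrape_url` raises `ScrapingError`); the crate name and URL are arbitrary examples.

```python
import asyncio

from rust_crate_pipeline.scraping import UnifiedScraper, ScrapingResult
from rust_crate_pipeline.scraping.unified_scraper import quick_scrape, ScrapingError


async def main() -> None:
    # One-off fetch via the convenience helper.
    try:
        result: ScrapingResult = await quick_scrape("https://docs.rs/serde", doc_type="docs")
        print(result.title, result.quality_score)
    except ScrapingError as exc:
        print(f"scrape failed: {exc}")

    # Fetch the crates.io, docs.rs, and lib.rs pages for one crate;
    # per-source failures are returned as ScrapingResult entries with error set.
    async with UnifiedScraper(config={"headless": True}) as scraper:
        pages = await scraper.scrape_crate_documentation("serde")
        for source, page in pages.items():
            print(source, page.extraction_method, page.error)


if __name__ == "__main__":
    asyncio.run(main())
```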