rust-crate-pipeline 1.2.6-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (32)
  1. rust_crate_pipeline/__init__.py +25 -25
  2. rust_crate_pipeline/__main__.py +1 -0
  3. rust_crate_pipeline/ai_processing.py +309 -200
  4. rust_crate_pipeline/analysis.py +304 -368
  5. rust_crate_pipeline/azure_ai_processing.py +453 -0
  6. rust_crate_pipeline/config.py +57 -19
  7. rust_crate_pipeline/core/__init__.py +19 -0
  8. rust_crate_pipeline/core/canon_registry.py +133 -0
  9. rust_crate_pipeline/core/irl_engine.py +256 -0
  10. rust_crate_pipeline/core/sacred_chain.py +117 -0
  11. rust_crate_pipeline/crate_analysis.py +54 -0
  12. rust_crate_pipeline/crate_list.txt +424 -0
  13. rust_crate_pipeline/github_token_checker.py +42 -36
  14. rust_crate_pipeline/main.py +386 -102
  15. rust_crate_pipeline/network.py +153 -133
  16. rust_crate_pipeline/pipeline.py +340 -264
  17. rust_crate_pipeline/production_config.py +35 -32
  18. rust_crate_pipeline/scraping/__init__.py +13 -0
  19. rust_crate_pipeline/scraping/unified_scraper.py +259 -0
  20. rust_crate_pipeline/unified_llm_processor.py +637 -0
  21. rust_crate_pipeline/unified_pipeline.py +548 -0
  22. rust_crate_pipeline/utils/file_utils.py +45 -14
  23. rust_crate_pipeline/utils/logging_utils.py +34 -17
  24. rust_crate_pipeline/version.py +47 -2
  25. rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
  26. rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
  27. rust_crate_pipeline-1.2.6.dist-info/METADATA +0 -573
  28. rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
  29. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
  30. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
  31. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
  32. {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/production_config.py
@@ -7,61 +7,64 @@ and improve the user experience in production environments.
 
 import logging
 import os
+from typing import Any
 
 # Production logging configuration
-def configure_production_logging():
+
+
+def configure_production_logging() -> None:
     """Configure logging for production to reduce verbose warnings"""
-
+
     # Don't use basicConfig here - let main.py handle it
     # Just set specific loggers to less verbose levels
-    logging.getLogger('requests').setLevel(logging.WARNING)
-    logging.getLogger('urllib3').setLevel(logging.WARNING)
-    logging.getLogger('requests_cache').setLevel(logging.WARNING)
-
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests_cache").setLevel(logging.WARNING)
+
     # If PRODUCTION environment variable is set, be even quieter
-    if os.getenv('PRODUCTION', 'false').lower() == 'true':
+    if os.getenv("PRODUCTION", "false").lower() == "true":
         logging.getLogger().setLevel(logging.WARNING)
-        logging.getLogger('rust_crate_pipeline').setLevel(logging.INFO)
+        logging.getLogger("rust_crate_pipeline").setLevel(logging.INFO)
+
 
 # Production-optimized settings
-PRODUCTION_SETTINGS = {
+PRODUCTION_SETTINGS: dict[str, Any] = {
     # Reduced retries to minimize warnings
-    'max_retries': 2,
-    'validation_retries': 2,
-
+    "max_retries": 2,
+    "validation_retries": 2,
     # GitHub API management
-    'github_rate_limit_threshold': 100,
-    'github_critical_threshold': 50,
-
-    # LLM settings
-    'llm_timeout': 30,
-    'llm_max_attempts': 2,
-
+    "github_rate_limit_threshold": 100,
+    "github_critical_threshold": 50,
+    # LLM settings
+    "llm_timeout": 30,
+    "llm_max_attempts": 2,
     # Logging preferences
-    'quiet_mode': True,
-    'log_level': 'INFO',
-
+    "quiet_mode": True,
+    "log_level": "INFO",
     # Performance settings
-    'batch_size': 10,
-    'checkpoint_interval': 10,
-    'cache_ttl': 3600,
+    "batch_size": 10,
+    "checkpoint_interval": 10,
+    "cache_ttl": 3600,
 }
 
-def get_production_config():
+
+def get_production_config() -> dict[str, Any]:
     """Get production configuration dictionary"""
     return PRODUCTION_SETTINGS.copy()
 
-def is_production():
+
+def is_production() -> bool:
     """Check if running in production mode"""
-    return os.getenv('PRODUCTION', 'false').lower() == 'true'
+    return os.getenv("PRODUCTION", "false").lower() == "true"
+
 
-def setup_production_environment():
+def setup_production_environment() -> dict[str, Any]:
     """Set up the complete production environment"""
     configure_production_logging()
-
+
     # Set environment variables for quieter operation
-    os.environ.setdefault('PYTHONWARNINGS', 'ignore::UserWarning')
-
+    os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning")
+
     if is_production():
         print("🚀 Production mode enabled - optimized for minimal warnings")
     return get_production_config()
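
For orientation, a minimal sketch of how the retyped helpers above might be called from application code (a sketch only: the import path mirrors the wheel layout in the file list, and setting PRODUCTION=true here is just an illustrative assumption):

    import os

    from rust_crate_pipeline.production_config import (
        is_production,
        setup_production_environment,
    )

    # Opt in to production mode for this illustration.
    os.environ["PRODUCTION"] = "true"

    # Quiets third-party loggers and returns a copy of PRODUCTION_SETTINGS.
    settings = setup_production_environment()

    if is_production():
        # Keys come straight from the settings dict shown in the diff above.
        print(settings["max_retries"], settings["batch_size"], settings["cache_ttl"])
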
rust_crate_pipeline/scraping/__init__.py
@@ -0,0 +1,13 @@
+"""
+Unified Scraping Module
+
+This module provides a unified interface for all web scraping operations,
+consolidating Crawl4AI integration and other scraping capabilities.
+"""
+
+from .unified_scraper import UnifiedScraper, ScrapingResult
+
+__all__ = [
+    "UnifiedScraper",
+    "ScrapingResult",
+]
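
Because __init__.py re-exports the public names, downstream code can import the scraper API from the subpackage rather than the submodule; a small sketch (import paths follow the file list above):

    # Both forms resolve to the same classes; the first relies on the
    # re-export in rust_crate_pipeline/scraping/__init__.py shown above.
    from rust_crate_pipeline.scraping import ScrapingResult, UnifiedScraper
    from rust_crate_pipeline.scraping.unified_scraper import UnifiedScraper as SameClass
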
rust_crate_pipeline/scraping/unified_scraper.py
@@ -0,0 +1,259 @@
+import asyncio
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+from pathlib import Path
+import time
+
+try:
+    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, BrowserConfig, LLMConfig
+    CRAWL4AI_AVAILABLE = True
+except ImportError:
+    CRAWL4AI_AVAILABLE = False
+    AsyncWebCrawler = None
+    CrawlerRunConfig = None
+    LLMExtractionStrategy = None
+    BrowserConfig = None
+    LLMConfig = None
+
+
+class ScrapingError(Exception):
+    pass
+
+
+@dataclass
+class ScrapingResult:
+    url: str
+    title: str
+    content: str
+    structured_data: Dict[str, Any] = field(default_factory=dict)
+    quality_score: float = 0.0
+    extraction_method: str = "unknown"
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: str = field(default_factory=lambda: str(asyncio.get_event_loop().time() if asyncio.get_event_loop().is_running() else time.time()))
+
+    def __post_init__(self) -> None:
+        if self.timestamp == "0":
+            import time
+            self.timestamp = str(time.time())
+
+
+class UnifiedScraper:
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        self.config = config or {}
+        self.logger = logging.getLogger(__name__)
+        self.crawler: Optional[Any] = None
+        self.browser_config: Optional[Any] = None
+        self._initialize_crawler()
+
+    def _initialize_crawler(self) -> None:
+        if not CRAWL4AI_AVAILABLE:
+            self.logger.warning("Crawl4AI not available - using basic scraping mode")
+            return
+
+        try:
+            # Configure browser for headless operation
+            self.browser_config = BrowserConfig(
+                headless=self.config.get("headless", True),
+                browser_type=self.config.get("browser_type", "chromium"),
+                verbose=self.config.get("verbose", False)
+            )
+
+            self.crawler = AsyncWebCrawler(config=self.browser_config)
+            self.logger.info("✅ Crawl4AI crawler initialized successfully")
+        except Exception as e:
+            self.logger.error(f"❌ Failed to initialize Crawl4AI: {e}")
+            self.crawler = None
+
+    async def __aenter__(self) -> "UnifiedScraper":
+        if self.crawler and hasattr(self.crawler, 'start'):
+            try:
+                await self.crawler.start()
+            except Exception as e:
+                self.logger.warning(f"Failed to start crawler: {e}")
+        return self
+
+    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error stopping crawler: {e}")
+
+    async def scrape_url(
+        self,
+        url: str,
+        doc_type: str = "general",
+        extraction_schema: Optional[Dict[str, Any]] = None
+    ) -> ScrapingResult:
+        if not self.crawler:
+            raise ScrapingError("No crawler backend available")
+
+        try:
+            # Configure crawler run parameters
+            config_params: Dict[str, Any] = {
+                "word_count_threshold": self.config.get("word_count_threshold", 10),
+                "screenshot": self.config.get("screenshot", False),
+            }
+
+            # Add CSS selectors based on document type
+            if doc_type == "docs":
+                config_params["css_selector"] = "main"
+            elif doc_type == "readme":
+                config_params["css_selector"] = "article, .readme, main"
+
+            # Update with any additional crawl config
+            config_params.update(self.config.get("crawl_config", {}))
+
+            crawl_config = CrawlerRunConfig(**config_params)
+
+            # Set up extraction strategy if schema provided
+            extraction_strategy = None
+            if extraction_schema and CRAWL4AI_AVAILABLE:
+                # Get LLM configuration from config or use defaults
+                llm_provider = self.config.get("llm_provider", "ollama")
+                llm_api_base = self.config.get("llm_api_base", "http://localhost:11434")
+                llm_model = self.config.get("llm_model", "deepseek-coder:6.7b")
+                llm_api_token = self.config.get("llm_api_token", "no-token-needed")
+
+                # Create LLM config
+                llm_config = LLMConfig(
+                    provider=llm_provider,
+                    api_token=llm_api_token,
+                    api_base=llm_api_base,
+                    model=llm_model,
+                    max_tokens=self.config.get("max_tokens", 2048),
+                    temperature=self.config.get("temperature", 0.7)
+                )
+
+                extraction_strategy = LLMExtractionStrategy(
+                    llm_config=llm_config,
+                    schema=extraction_schema,
+                    extraction_type="schema",
+                    instruction=f"Extract structured data from this {doc_type} content according to the provided schema."
+                )
+
+            # Run the crawl
+            result = await self.crawler.arun(
+                url=url,
+                config=crawl_config,
+                extraction_strategy=extraction_strategy
+            )
+
+            # Handle result (Crawl4AI returns direct result, not container)
+            if not result:
+                raise ScrapingError("Crawl returned no result")
+
+            if not result.success:
+                error_message = getattr(result, 'error_message', 'Crawl was not successful')
+                raise ScrapingError(f"Crawl failed: {error_message}")
+
+            markdown_content = getattr(result, 'markdown', '') or ""
+            extracted_content = getattr(result, 'extracted_content', None)
+
+            structured_data = self._process_extracted_content(extracted_content)
+            quality_score = self._calculate_quality_score(markdown_content, structured_data)
+
+            return ScrapingResult(
+                url=url,
+                title=self._extract_title(markdown_content),
+                content=markdown_content,
+                structured_data=structured_data,
+                quality_score=quality_score,
+                extraction_method="crawl4ai",
+                metadata={
+                    "doc_type": doc_type,
+                    "content_length": len(markdown_content),
+                    "has_structured_data": bool(structured_data),
+                    "crawl_success": result.success,
+                }
+            )
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {e}")
+            raise ScrapingError(f"Failed to scrape {url}: {str(e)}")
+
+    async def scrape_crate_documentation(self, crate_name: str) -> Dict[str, ScrapingResult]:
+        results: Dict[str, ScrapingResult] = {}
+
+        urls = {
+            "crates_io": f"https://crates.io/crates/{crate_name}",
+            "docs_rs": f"https://docs.rs/{crate_name}",
+            "lib_rs": f"https://lib.rs/crates/{crate_name}",
+        }
+
+        for source, url in urls.items():
+            try:
+                result = await self.scrape_url(url, doc_type="docs")
+                results[source] = result
+            except ScrapingError as e:
+                self.logger.warning(f"Failed to scrape {source} for {crate_name}: {e}")
+                results[source] = ScrapingResult(
+                    url=url,
+                    title=f"{crate_name} - {source}",
+                    content="",
+                    error=str(e),
+                    extraction_method="failed"
+                )
+
+        return results
+
+    def _process_extracted_content(
+        self, content: Optional[Union[str, Dict[str, Any]]]
+    ) -> Dict[str, Any]:
+        if not content:
+            return {}
+
+        if isinstance(content, str):
+            try:
+                return json.loads(content)
+            except json.JSONDecodeError:
+                return {"raw_content": content}
+
+        return content if isinstance(content, dict) else {}
+
+    def _calculate_quality_score(
+        self, content: str, structured_data: Dict[str, Any]
+    ) -> float:
+        if not content:
+            return 0.0
+
+        score = 0.0
+
+        content_length = len(content)
+        if content_length > 1000:
+            score += 3.0
+        elif content_length > 500:
+            score += 2.0
+        elif content_length > 100:
+            score += 1.0
+
+        if structured_data:
+            score += 2.0
+
+        if "title" in content.lower() or "description" in content.lower():
+            score += 1.0
+
+        return min(score, 10.0)
+
+    def _extract_title(self, markdown: str) -> str:
+        lines = markdown.split('\n')
+        for line in lines[:5]:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled"
+
+    async def close(self) -> None:
+        if self.crawler and hasattr(self.crawler, 'stop'):
+            try:
+                await self.crawler.stop()
+            except Exception as e:
+                self.logger.warning(f"Error closing crawler: {e}")
+
+
+async def quick_scrape(url: str, **kwargs: Any) -> ScrapingResult:
+    async with UnifiedScraper() as scraper:
+        return await scraper.scrape_url(url, **kwargs)
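
A hedged usage sketch of the new scraper API follows (a sketch only: it assumes Crawl4AI and a Chromium browser are installed, and "serde" is just an example crate name; without Crawl4AI, scrape_url() raises ScrapingError as shown in the diff):

    import asyncio

    from rust_crate_pipeline.scraping import UnifiedScraper


    async def main() -> None:
        # __aenter__/__aexit__ start and stop the underlying Crawl4AI crawler.
        async with UnifiedScraper(config={"headless": True}) as scraper:
            page = await scraper.scrape_url("https://docs.rs/serde", doc_type="docs")
            print(page.title, page.quality_score, page.extraction_method)

            # Fans out over crates.io, docs.rs and lib.rs; failures come back as
            # ScrapingResult objects with extraction_method="failed" and error set.
            docs = await scraper.scrape_crate_documentation("serde")
            for source, result in docs.items():
                print(source, result.error or len(result.content))


    asyncio.run(main())

For one-off fetches, the module-level quick_scrape(url) coroutine wraps the same flow in a temporary UnifiedScraper context.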