unrealon 1.1.5-py3-none-any.whl → 2.0.4-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- {unrealon-1.1.5.dist-info/licenses → unrealon-2.0.4.dist-info}/LICENSE +1 -1
- unrealon-2.0.4.dist-info/METADATA +491 -0
- unrealon-2.0.4.dist-info/RECORD +129 -0
- {unrealon-1.1.5.dist-info → unrealon-2.0.4.dist-info}/WHEEL +2 -1
- unrealon-2.0.4.dist-info/entry_points.txt +3 -0
- unrealon-2.0.4.dist-info/top_level.txt +3 -0
- unrealon_browser/__init__.py +5 -2
- unrealon_browser/cli/browser_cli.py +18 -9
- unrealon_browser/cli/interactive_mode.py +18 -7
- unrealon_browser/core/browser_manager.py +76 -13
- unrealon_browser/dto/__init__.py +21 -0
- unrealon_browser/dto/bot_detection.py +175 -0
- unrealon_browser/dto/models/config.py +14 -1
- unrealon_browser/managers/__init__.py +4 -1
- unrealon_browser/managers/logger_bridge.py +3 -6
- unrealon_browser/managers/page_wait_manager.py +198 -0
- unrealon_browser/stealth/__init__.py +27 -0
- unrealon_browser/stealth/bypass_techniques.pyc +0 -0
- unrealon_browser/stealth/manager.pyc +0 -0
- unrealon_browser/stealth/nodriver_stealth.pyc +0 -0
- unrealon_browser/stealth/playwright_stealth.pyc +0 -0
- unrealon_browser/stealth/scanner_tester.pyc +0 -0
- unrealon_browser/stealth/undetected_chrome.pyc +0 -0
- unrealon_core/__init__.py +160 -0
- unrealon_core/config/__init__.py +16 -0
- unrealon_core/config/environment.py +98 -0
- unrealon_core/config/urls.py +93 -0
- unrealon_core/enums/__init__.py +24 -0
- unrealon_core/enums/status.py +216 -0
- unrealon_core/enums/types.py +240 -0
- unrealon_core/error_handling/__init__.py +45 -0
- unrealon_core/error_handling/circuit_breaker.py +292 -0
- unrealon_core/error_handling/error_context.py +324 -0
- unrealon_core/error_handling/recovery.py +371 -0
- unrealon_core/error_handling/retry.py +268 -0
- unrealon_core/exceptions/__init__.py +46 -0
- unrealon_core/exceptions/base.py +292 -0
- unrealon_core/exceptions/communication.py +22 -0
- unrealon_core/exceptions/driver.py +11 -0
- unrealon_core/exceptions/proxy.py +11 -0
- unrealon_core/exceptions/task.py +12 -0
- unrealon_core/exceptions/validation.py +17 -0
- unrealon_core/models/__init__.py +98 -0
- unrealon_core/models/arq_context.py +252 -0
- unrealon_core/models/arq_responses.py +125 -0
- unrealon_core/models/base.py +291 -0
- unrealon_core/models/bridge_stats.py +58 -0
- unrealon_core/models/communication.py +39 -0
- unrealon_core/models/config.py +47 -0
- unrealon_core/models/connection_stats.py +47 -0
- unrealon_core/models/driver.py +30 -0
- unrealon_core/models/driver_details.py +98 -0
- unrealon_core/models/logging.py +28 -0
- unrealon_core/models/task.py +21 -0
- unrealon_core/models/typed_responses.py +210 -0
- unrealon_core/models/websocket/__init__.py +91 -0
- unrealon_core/models/websocket/base.py +49 -0
- unrealon_core/models/websocket/config.py +200 -0
- unrealon_core/models/websocket/driver.py +215 -0
- unrealon_core/models/websocket/errors.py +138 -0
- unrealon_core/models/websocket/heartbeat.py +100 -0
- unrealon_core/models/websocket/logging.py +261 -0
- unrealon_core/models/websocket/proxy.py +496 -0
- unrealon_core/models/websocket/tasks.py +275 -0
- unrealon_core/models/websocket/utils.py +153 -0
- unrealon_core/models/websocket_session.py +144 -0
- unrealon_core/monitoring/__init__.py +43 -0
- unrealon_core/monitoring/alerts.py +398 -0
- unrealon_core/monitoring/dashboard.py +307 -0
- unrealon_core/monitoring/health_check.py +354 -0
- unrealon_core/monitoring/metrics.py +352 -0
- unrealon_core/utils/__init__.py +11 -0
- unrealon_core/utils/time.py +61 -0
- unrealon_core/version.py +219 -0
- unrealon_driver/__init__.py +88 -50
- unrealon_driver/core_module/__init__.py +34 -0
- unrealon_driver/core_module/base.py +184 -0
- unrealon_driver/core_module/config.py +30 -0
- unrealon_driver/core_module/event_manager.py +127 -0
- unrealon_driver/core_module/protocols.py +98 -0
- unrealon_driver/core_module/registry.py +146 -0
- unrealon_driver/decorators/__init__.py +15 -0
- unrealon_driver/decorators/retry.py +117 -0
- unrealon_driver/decorators/schedule.py +137 -0
- unrealon_driver/decorators/task.py +61 -0
- unrealon_driver/decorators/timing.py +132 -0
- unrealon_driver/driver/__init__.py +20 -0
- unrealon_driver/driver/communication/__init__.py +10 -0
- unrealon_driver/driver/communication/session.py +203 -0
- unrealon_driver/driver/communication/websocket_client.py +197 -0
- unrealon_driver/driver/core/__init__.py +10 -0
- unrealon_driver/driver/core/config.py +85 -0
- unrealon_driver/driver/core/driver.py +221 -0
- unrealon_driver/driver/factory/__init__.py +9 -0
- unrealon_driver/driver/factory/manager_factory.py +130 -0
- unrealon_driver/driver/lifecycle/__init__.py +11 -0
- unrealon_driver/driver/lifecycle/daemon.py +76 -0
- unrealon_driver/driver/lifecycle/initialization.py +97 -0
- unrealon_driver/driver/lifecycle/shutdown.py +48 -0
- unrealon_driver/driver/monitoring/__init__.py +9 -0
- unrealon_driver/driver/monitoring/health.py +63 -0
- unrealon_driver/driver/utilities/__init__.py +10 -0
- unrealon_driver/driver/utilities/logging.py +51 -0
- unrealon_driver/driver/utilities/serialization.py +61 -0
- unrealon_driver/managers/__init__.py +32 -0
- unrealon_driver/managers/base.py +174 -0
- unrealon_driver/managers/browser.py +98 -0
- unrealon_driver/managers/cache.py +116 -0
- unrealon_driver/managers/http.py +107 -0
- unrealon_driver/managers/logger.py +286 -0
- unrealon_driver/managers/proxy.py +99 -0
- unrealon_driver/managers/registry.py +87 -0
- unrealon_driver/managers/threading.py +54 -0
- unrealon_driver/managers/update.py +107 -0
- unrealon_driver/utils/__init__.py +9 -0
- unrealon_driver/utils/time.py +10 -0
- unrealon/__init__.py +0 -40
- unrealon-1.1.5.dist-info/METADATA +0 -621
- unrealon-1.1.5.dist-info/RECORD +0 -54
- unrealon-1.1.5.dist-info/entry_points.txt +0 -9
- unrealon_browser/managers/stealth.py +0 -388
- unrealon_driver/exceptions.py +0 -33
- unrealon_driver/html_analyzer/__init__.py +0 -32
- unrealon_driver/html_analyzer/cleaner.py +0 -657
- unrealon_driver/html_analyzer/config.py +0 -64
- unrealon_driver/html_analyzer/manager.py +0 -247
- unrealon_driver/html_analyzer/models.py +0 -115
- unrealon_driver/html_analyzer/websocket_analyzer.py +0 -157
- unrealon_driver/models/__init__.py +0 -31
- unrealon_driver/models/websocket.py +0 -98
- unrealon_driver/parser/__init__.py +0 -36
- unrealon_driver/parser/cli_manager.py +0 -142
- unrealon_driver/parser/daemon_manager.py +0 -403
- unrealon_driver/parser/managers/__init__.py +0 -25
- unrealon_driver/parser/managers/config.py +0 -293
- unrealon_driver/parser/managers/error.py +0 -412
- unrealon_driver/parser/managers/result.py +0 -321
- unrealon_driver/parser/parser_manager.py +0 -458
- unrealon_driver/smart_logging/__init__.py +0 -24
- unrealon_driver/smart_logging/models.py +0 -44
- unrealon_driver/smart_logging/smart_logger.py +0 -406
- unrealon_driver/smart_logging/unified_logger.py +0 -525
- unrealon_driver/websocket/__init__.py +0 -31
- unrealon_driver/websocket/client.py +0 -249
- unrealon_driver/websocket/config.py +0 -188
- unrealon_driver/websocket/manager.py +0 -90
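
Taken together, the listing describes a ground-up restructure: a new `unrealon_core` package appears; the top-level `unrealon/__init__.py` shim and the `html_analyzer`, `smart_logging`, `parser`, and `websocket` subsystems under `unrealon_driver` are removed; and the new `unrealon_browser/stealth` implementation modules ship as compiled `.pyc` files. For downstream code written against 1.1.5, a hypothetical import-time guard (only the module path is taken from the listing; the guard itself is illustrative) could flag the breakage early:

```python
# Hypothetical guard for code written against unrealon 1.1.5.
# unrealon_driver.html_analyzer exists in 1.1.5 but is deleted in 2.0.4,
# so this import fails after an unpinned upgrade.
try:
    from unrealon_driver.html_analyzer import cleaner  # removed in 2.0.x
except ImportError as exc:
    raise ImportError(
        "unrealon>=2.0 dropped unrealon_driver.html_analyzer; "
        "pin unrealon==1.1.5 or migrate off the removed API"
    ) from exc
```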
unrealon_driver/html_analyzer/cleaner.py

```diff
@@ -1,657 +0,0 @@
-"""
-Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization.
-
-Intelligent HTML cleaning that removes noise but preserves useful data.
-Optimizes HTML for LLM token efficiency while keeping valuable content.
-"""
-
-import json
-import re
-import asyncio
-import concurrent.futures
-from typing import Optional, List, Dict, Any, Tuple
-from pydantic import BaseModel, Field, ConfigDict
-
-from bs4 import BeautifulSoup, Comment
-from unrealon_driver.smart_logging import create_smart_logger
-
-from .config import HTMLCleaningConfig
-
-
-class HTMLCleaningStats(BaseModel):
-    """HTML cleaning statistics"""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    original_size_bytes: int = Field(ge=0)
-    cleaned_size_bytes: int = Field(ge=0)
-    size_reduction_bytes: int = Field(ge=0)
-    size_reduction_percent: float = Field(ge=0.0, le=100.0)
-    estimated_original_tokens: int = Field(ge=0)
-    estimated_cleaned_tokens: int = Field(ge=0)
-    estimated_token_savings: int = Field(ge=0)
-    estimated_token_savings_percent: float = Field(ge=0.0, le=100.0)
-
-
-class ExtractedJSData(BaseModel):
-    """Extracted JavaScript data structure"""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    ssr_data: Dict[str, Any] = Field(default_factory=dict)
-    structured_data: List[Dict[str, Any]] = Field(default_factory=list)
-    analytics_data: Dict[str, Any] = Field(default_factory=dict)
-    product_data: Dict[str, Any] = Field(default_factory=dict)
-    raw_extracts: List[Dict[str, Any]] = Field(default_factory=list)
-
-
-class HTMLCleaningError(Exception):
-    """Raised when HTML cleaning fails"""
-
-    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
-        self.message = message
-        self.operation = operation
-        self.details = details or {}
-        super().__init__(message)
-
-
-class HTMLCleaner:
-    """
-    🧹 Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization
-
-    Features:
-    - Removes noise (scripts, styles, comments)
-    - Preserves useful JavaScript data (JSON objects, SSR data)
-    - Cleans whitespace and formatting
-    - Maintains semantic structure
-    - Extracts and preserves Next.js/Nuxt.js SSR data
-    - Optimizes for LLM token efficiency
-    """
-
-    def __init__(self, parser_id: str, config: Optional[HTMLCleaningConfig] = None):
-        self.config = config or HTMLCleaningConfig()
-
-        # Initialize smart logger
-        self.parser_id = parser_id
-        self.logger = create_smart_logger(parser_id=self.parser_id)
-
-        # Tags to completely remove
-        self.noise_tags = {"script", "style", "meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
-
-        # Add conditional tags based on config
-        if not self.config.preserve_forms:
-            self.noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
-
-        # Universal noise selectors to remove (for any site)
-        self.universal_noise_selectors = [
-            '[id*="nav"]',
-            '[class*="nav"]',  # Navigation
-            '[id*="menu"]',
-            '[class*="menu"]',  # Menus
-            '[id*="sidebar"]',
-            '[class*="sidebar"]',  # Sidebars
-            '[id*="footer"]',
-            '[class*="footer"]',  # Footers
-            '[id*="header"]',
-            '[class*="header"]',  # Headers
-            '[class*="ads"]',
-            '[class*="advertisement"]',  # Ads
-            '[class*="sponsored"]',
-            '[class*="promo"]',  # Sponsored content
-            '[class*="popup"]',
-            '[class*="modal"]',  # Popups/modals
-            '[class*="overlay"]',
-            '[class*="tooltip"]',  # Overlays
-            '[class*="cookie"]',
-            '[class*="gdpr"]',  # Cookie notices
-            '[class*="newsletter"]',
-            '[class*="subscription"]',  # Email signup
-            '[class*="social"]',
-            '[class*="share"]',  # Social media
-            '[class*="comment"]',
-            '[class*="discussion"]',  # Comments
-            '[class*="tracking"]',
-            '[class*="analytics"]',  # Tracking
-        ]
-
-        # Attributes to keep (semantic ones)
-        self.keep_attributes = {"id", "class", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role", "alt", "title", "href", "src", "action", "name", "value", "placeholder", "type"}
-
-        # Compile regex patterns for performance
-        self._compile_patterns()
-
-    def _compile_patterns(self) -> None:
-        """Compile regex patterns for performance"""
-        # URL patterns to remove or shorten (for tracking/analytics)
-        self.tracking_url_patterns = [
-            r'https://aax-[^\s"]{200,}',  # Amazon tracking URLs over 200 chars
-            r'https://[^\s"]*tracking[^\s"]{100,}',  # General tracking URLs
-            r'https://[^\s"]*analytics[^\s"]{100,}',  # Analytics URLs
-            r'https://[^\s"]*gtm[^\s"]{100,}',  # Google Tag Manager URLs
-        ]
-
-        # Base64 patterns to remove or replace
-        self.base64_patterns = [
-            r"data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}",  # Base64 images over 50 chars
-            r"data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 applications
-            r"data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 text
-        ]
-
-        # Patterns to detect valuable JavaScript data
-        self.useful_js_patterns = [
-            # Next.js/Nuxt.js SSR data
-            r"__NEXT_DATA__\s*=\s*(\{.+?\});?",
-            r"__NUXT__\s*=\s*(\{.+?\});?",
-            r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?",
-            # React/Vue hydration data
-            r"window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?",
-            r"window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?",
-            # E-commerce data
-            r"window\.productData\s*=\s*(\{.+?\});?",
-            r"window\.cartData\s*=\s*(\{.+?\});?",
-            r"dataLayer\s*=\s*(\[.+?\]);?",
-            # Analytics and tracking (structured data)
-            r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
-            # JSON-LD structured data (often in script tags)
-            r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
-            # Generic JSON objects (be more selective)
-            r"(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?",
-        ]
-
-        # Compiled regex patterns for efficiency
-        self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE) for pattern in self.useful_js_patterns]
-
-    # ==========================================
-    # MAIN CLEANING METHODS
-    # ==========================================
-
-    async def clean_html(self, html_content: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, Dict[str, Any]]:
-        """
-        Clean HTML content while preserving valuable data
-
-        Args:
-            html_content: Raw HTML content
-            preserve_js_data: Whether to extract and preserve JS data
-            aggressive_cleaning: Whether to apply more aggressive cleaning
-
-        Returns:
-            Tuple of (cleaned_html, extracted_data)
-        """
-        if not html_content or not html_content.strip():
-            return "", {}
-
-        try:
-            self.logger.info(f"🧹 Cleaning HTML: {len(html_content)} characters")
-
-            # Check size limits
-            if len(html_content) > self.config.max_html_size:
-                self.logger.warning(f"⚠️ HTML size ({len(html_content)}) exceeds limit ({self.config.max_html_size}), truncating")
-                html_content = html_content[: self.config.max_html_size]
-
-            # Parse HTML
-            soup = BeautifulSoup(html_content, "html.parser")
-
-            extracted_data = {}
-
-            # Extract valuable JavaScript data before removing scripts
-            if preserve_js_data:
-                extracted_data = self._extract_js_data(soup)
-
-            # Remove universal noise elements for aggressive cleaning
-            if aggressive_cleaning:
-                self._remove_universal_noise(soup)
-                self._truncate_long_urls(soup)  # Do this before tracking URL cleaning
-                self._clean_tracking_urls(soup)
-                self._clean_base64_data(soup)
-                self._remove_long_attributes(soup)
-                self._remove_html_comments(soup)
-                self._clean_whitespace(soup)
-
-            # Remove noise elements
-            self._remove_noise_elements(soup)
-
-            # Clean attributes
-            self._clean_attributes(soup, aggressive_cleaning)
-
-            # Remove comments
-            self._remove_comments(soup)
-
-            # Clean text and whitespace
-            cleaned_html = self._clean_text_and_whitespace(soup)
-
-            # Final cleanup
-            cleaned_html = self._final_cleanup(cleaned_html)
-
-            # Log results
-            original_size = len(html_content)
-            cleaned_size = len(cleaned_html)
-            reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
-
-            self.logger.info(f"✅ HTML cleaned: {original_size} → {cleaned_size} chars " f"({reduction:.1f}% reduction)")
-
-            return cleaned_html, extracted_data
-
-        except Exception as e:
-            self.logger.error(f"❌ HTML cleaning failed: {e}")
-            raise HTMLCleaningError(message=f"Failed to clean HTML: {e}", operation="clean_html", details={"html_size": str(len(html_content))}) from e
-
-    def clean_html_sync(self, html_content: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
-        """
-        Synchronous HTML cleaning
-
-        Args:
-            html_content: Raw HTML content
-            **kwargs: Cleaning options
-
-        Returns:
-            Tuple of (cleaned_html, extracted_data)
-        """
-        # Handle running event loop
-        try:
-            loop = asyncio.get_running_loop()
-            # If we're in an event loop, create a new thread
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, self.clean_html(html_content, **kwargs))
-                return future.result()
-        except RuntimeError:
-            # No event loop running, safe to use asyncio.run
-            return asyncio.run(self.clean_html(html_content, **kwargs))
-
-    # ==========================================
-    # CLEANING IMPLEMENTATION
-    # ==========================================
-
-    def _standard_cleaning(self, soup: BeautifulSoup) -> None:
-        """Apply standard cleaning"""
-        # Remove noise elements
-        self._remove_noise_elements(soup)
-
-        # Clean attributes
-        self._clean_attributes(soup)
-
-        # Remove comments
-        if self.config.remove_comments:
-            self._remove_comments(soup)
-
-        # Normalize whitespace
-        if self.config.normalize_whitespace:
-            self._normalize_whitespace(soup)
-
-    def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
-        """Apply aggressive cleaning"""
-        # Standard cleaning first
-        self._standard_cleaning(soup)
-
-        # Remove noise selectors
-        self._remove_noise_selectors(soup)
-
-        # Clean tracking URLs
-        if self.config.remove_tracking:
-            self._clean_tracking_urls(soup)
-
-        # Clean base64 data
-        self._clean_base64_data(soup)
-
-        # Truncate long URLs
-        self._truncate_long_urls(soup)
-
-        # Remove long attributes
-        self._remove_long_attributes(soup)
-
-        # Truncate long text
-        self._truncate_long_text(soup)
-
-    def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
-        """Remove noise HTML elements"""
-        # Define noise tags
-        noise_tags = {"meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
-
-        # Add conditional tags
-        if self.config.remove_scripts:
-            noise_tags.add("script")
-        if self.config.remove_styles:
-            noise_tags.add("style")
-        if not self.config.preserve_forms:
-            noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
-
-        # Remove noise tags
-        for tag_name in noise_tags:
-            for tag in soup.find_all(tag_name):
-                tag.decompose()
-
-        # Remove empty elements
-        if self.config.remove_empty_elements:
-            for tag in soup.find_all(["div", "span", "p"]):
-                if not tag.get_text(strip=True) and not tag.find_all():
-                    tag.decompose()
-
-    def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
-        """Remove elements matching noise selectors"""
-        for selector in self.config.noise_selectors:
-            try:
-                elements = soup.select(selector)
-                for element in elements:
-                    element.decompose()
-            except Exception:
-                # Skip invalid selectors
-                continue
-
-    def _clean_attributes(self, soup: BeautifulSoup) -> None:
-        """Clean HTML attributes"""
-        # Attributes to remove
-        noise_attributes = {
-            "style",
-            "onclick",
-            "onload",
-            "onchange",
-            "onmouseover",
-            "onmouseout",
-            "onfocus",
-            "onblur",
-            "onsubmit",
-            "onreset",
-            "onerror",
-            "onabort",
-            "autocomplete",
-            "autofocus",
-            "checked",
-            "defer",
-            "disabled",
-            "hidden",
-            "loop",
-            "multiple",
-            "muted",
-            "open",
-            "readonly",
-            "required",
-            "tabindex",
-            "translate",
-            "draggable",
-            "contenteditable",
-        }
-
-        # Attributes to keep
-        keep_attributes = {"id", "class", "href", "src", "alt", "title", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role"}
-
-        for tag in soup.find_all(True):
-            if hasattr(tag, "attrs"):
-                # Remove unwanted attributes
-                attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
-                for attr in attrs_to_remove:
-                    if attr in noise_attributes:
-                        del tag.attrs[attr]
-
-    def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
-        """Remove or replace tracking URLs"""
-        # Clean href attributes
-        for tag in soup.find_all(["a"], href=True):
-            href = tag.get("href", "")
-            if href:
-                for pattern in self.tracking_url_patterns:
-                    if pattern.match(href):
-                        tag["href"] = "#tracking-url-removed"
-                        break
-
-        # Clean src attributes
-        for tag in soup.find_all(["img"], src=True):
-            src = tag.get("src", "")
-            if src:
-                for pattern in self.tracking_url_patterns:
-                    if pattern.match(src):
-                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                        break
-
-    def _clean_base64_data(self, soup: BeautifulSoup) -> None:
-        """Remove large base64 encoded data"""
-        for tag in soup.find_all(["img"], src=True):
-            src = tag.get("src", "")
-            if src:
-                for pattern in self.base64_patterns:
-                    if pattern.search(src):
-                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                        break
-
-    def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
-        """Truncate URLs longer than max_url_length"""
-        max_length = self.config.max_url_length
-
-        for tag in soup.find_all(["a"], href=True):
-            href = tag.get("href", "")
-            if isinstance(href, str) and len(href) > max_length:
-                tag["href"] = href[:max_length] + "...truncated"
-
-        for tag in soup.find_all(["img"], src=True):
-            src = tag.get("src", "")
-            if isinstance(src, str) and len(src) > max_length and not src.startswith("data:"):
-                tag["src"] = src[:max_length] + "...truncated"
-
-    def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
-        """Remove attributes with extremely long values"""
-        for tag in soup.find_all():
-            attrs_to_remove = []
-            for attr, value in tag.attrs.items():
-                if isinstance(value, str) and len(value) > 800:
-                    attrs_to_remove.append(attr)
-                elif any(tracking in attr.lower() for tracking in ["tracking", "analytics", "gtm", "pixel"]):
-                    attrs_to_remove.append(attr)
-
-            for attr in attrs_to_remove:
-                del tag.attrs[attr]
-
-    def _truncate_long_text(self, soup: BeautifulSoup) -> None:
-        """Truncate text content longer than max_text_length"""
-        max_length = self.config.max_text_length
-
-        for element in soup.find_all(text=True):
-            if element.parent.name not in ["script", "style"]:
-                text_content = str(element).strip()
-                if text_content and len(text_content) > max_length:
-                    truncated_text = text_content[:max_length] + "..."
-                    element.replace_with(truncated_text)
-
-    def _remove_comments(self, soup: BeautifulSoup) -> None:
-        """Remove HTML comments"""
-        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-            comment.extract()
-
-    def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
-        """Normalize whitespace in text content"""
-        for element in soup.find_all(text=True):
-            if element.parent.name not in ["script", "style"]:
-                # Replace multiple spaces with single space
-                cleaned_text = re.sub(r" {3,}", " ", str(element))
-                # Replace multiple newlines with maximum 2
-                cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)
-                # Replace multiple tabs with single space
-                cleaned_text = re.sub(r"\t+", " ", cleaned_text)
-                element.replace_with(cleaned_text)
-
-    def _final_cleanup(self, html: str) -> str:
-        """Final cleanup and optimization"""
-        # Remove empty attributes
-        html = re.sub(r'\s+\w+=""', "", html)
-
-        # Remove extra spaces in attributes
-        html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
-
-        # Normalize quotes
-        html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
-
-        # Remove trailing spaces before closing tags
-        html = re.sub(r"\s+(/?>)", r"\1", html)
-
-        # Advanced whitespace cleanup
-        html = self._advanced_whitespace_cleanup(html)
-
-        return html.strip()
-
-    def _advanced_whitespace_cleanup(self, html: str) -> str:
-        """Advanced whitespace cleanup"""
-        # Remove excessive spaces
-        html = re.sub(r" {3,}", " ", html)
-
-        # Remove excessive newlines
-        html = re.sub(r"\n{3,}", "\n\n", html)
-
-        # Clean space between tags
-        html = re.sub(r">\s{2,}<", "> <", html)
-
-        return html
-
-    # ==========================================
-    # JAVASCRIPT DATA EXTRACTION
-    # ==========================================
-
-    def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
-        """Extract valuable JavaScript data"""
-        extracted_data = ExtractedJSData()
-
-        # Find all script tags
-        script_tags = soup.find_all("script")
-
-        for script in script_tags:
-            if not script.string:
-                continue
-
-            script_content = script.string.strip()
-
-            # Skip empty scripts
-            if len(script_content) < 10:
-                continue
-
-            # Check for JSON-LD structured data
-            if script.get("type") == "application/ld+json":
-                try:
-                    json_data = json.loads(script_content)
-                    # Convert to string dict for Pydantic compliance
-                    str_data = {str(k): str(v) for k, v in json_data.items() if isinstance(k, (str, int, float))}
-                    extracted_data.structured_data.append(str_data)
-                    continue
-                except json.JSONDecodeError:
-                    pass
-
-            # Extract data using patterns
-            self._extract_with_patterns(script_content, extracted_data)
-
-        return extracted_data
-
-    def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
-        """Extract data using compiled regex patterns"""
-        for pattern in self.js_data_patterns:
-            matches = pattern.finditer(script_content)
-            for match in matches:
-                self._try_parse_json(match.group(1), extracted_data)
-
-    def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
-        """Try to parse JSON string and add to extracted data"""
-        try:
-            json_data = json.loads(json_str)
-
-            if isinstance(json_data, dict):
-                # Convert to string dict for Pydantic compliance
-                str_data = {}
-                for k, v in json_data.items():
-                    if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
-                        str_data[str(k)] = str(v)
-
-                if str_data:
-                    extracted_data.ssr_data.update(str_data)
-
-        except json.JSONDecodeError:
-            # Skip invalid JSON
-            pass
-
-    # ==========================================
-    # UTILITY METHODS
-    # ==========================================
-
-    def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
-        """Get statistics about the cleaning process"""
-        original_size = len(original_html)
-        cleaned_size = len(cleaned_html)
-
-        # Estimate token reduction (rough approximation)
-        original_tokens = original_size // 4  # Rough estimate: 4 chars per token
-        cleaned_tokens = cleaned_size // 4
-
-        size_reduction = original_size - cleaned_size
-        size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
-        token_savings = original_tokens - cleaned_tokens
-        token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
-
-        return HTMLCleaningStats(
-            original_size_bytes=original_size,
-            cleaned_size_bytes=cleaned_size,
-            size_reduction_bytes=size_reduction,
-            size_reduction_percent=size_reduction_percent,
-            estimated_original_tokens=original_tokens,
-            estimated_cleaned_tokens=cleaned_tokens,
-            estimated_token_savings=token_savings,
-            estimated_token_savings_percent=token_savings_percent,
-        )
-
-    def update_config(self, **kwargs) -> None:
-        """Update configuration with new values"""
-        current_data = self.config.model_dump()
-        current_data.update(kwargs)
-        self.config = HTMLCleaningConfig.model_validate(current_data)
-
-        # Recompile patterns if needed
-        self._compile_patterns()
-
-
-# ==========================================
-# CONVENIENCE FUNCTIONS
-# ==========================================
-
-
-def create_html_cleaner(parser_id: str, config: Optional[HTMLCleaningConfig] = None) -> HTMLCleaner:
-    """
-    Create an HTML cleaner instance
-
-    Args:
-        config: Optional HTML cleaning configuration
-        parser_id: Parser identifier for logging
-
-    Returns:
-        Configured HTMLCleaner instance
-    """
-    return HTMLCleaner(parser_id=parser_id, config=config)
-
-
-async def quick_clean_html(html: str, parser_id: str, **kwargs) -> str:
-    """
-    Quick HTML cleaning convenience function
-
-    Args:
-        html: Raw HTML content
-        parser_id: Parser identifier for logging
-        **kwargs: Cleaning options
-
-    Returns:
-        Cleaned HTML
-    """
-    config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
-    config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    cleaner = create_html_cleaner(parser_id, config)
-    return await cleaner.clean_html(html, **kwargs)
-
-
-def quick_clean_html_sync(html: str, parser_id: str, **kwargs) -> str:
-    """
-    Quick synchronous HTML cleaning convenience function
-
-    Args:
-        html: Raw HTML content
-        parser_id: Parser identifier for logging
-        **kwargs: Cleaning options
-
-    Returns:
-        Cleaned HTML
-    """
-    config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
-    config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    cleaner = create_html_cleaner(parser_id, config)
-    return cleaner.clean_html_sync(html, **kwargs)
```