unrealon-1.1.6-py3-none-any.whl → unrealon-2.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (144)
  1. {unrealon-1.1.6.dist-info/licenses → unrealon-2.0.5.dist-info}/LICENSE +1 -1
  2. unrealon-2.0.5.dist-info/METADATA +491 -0
  3. unrealon-2.0.5.dist-info/RECORD +128 -0
  4. {unrealon-1.1.6.dist-info → unrealon-2.0.5.dist-info}/WHEEL +2 -1
  5. unrealon-2.0.5.dist-info/entry_points.txt +3 -0
  6. unrealon-2.0.5.dist-info/top_level.txt +3 -0
  7. unrealon_browser/__init__.py +5 -6
  8. unrealon_browser/cli/browser_cli.py +18 -9
  9. unrealon_browser/cli/interactive_mode.py +13 -4
  10. unrealon_browser/core/browser_manager.py +29 -16
  11. unrealon_browser/dto/__init__.py +21 -0
  12. unrealon_browser/dto/bot_detection.py +175 -0
  13. unrealon_browser/dto/models/config.py +9 -3
  14. unrealon_browser/managers/__init__.py +1 -1
  15. unrealon_browser/managers/logger_bridge.py +1 -4
  16. unrealon_browser/stealth/__init__.py +27 -0
  17. unrealon_browser/stealth/bypass_techniques.pyc +0 -0
  18. unrealon_browser/stealth/manager.pyc +0 -0
  19. unrealon_browser/stealth/nodriver_stealth.pyc +0 -0
  20. unrealon_browser/stealth/playwright_stealth.pyc +0 -0
  21. unrealon_browser/stealth/scanner_tester.pyc +0 -0
  22. unrealon_browser/stealth/undetected_chrome.pyc +0 -0
  23. unrealon_core/__init__.py +172 -0
  24. unrealon_core/config/__init__.py +16 -0
  25. unrealon_core/config/environment.py +151 -0
  26. unrealon_core/config/urls.py +94 -0
  27. unrealon_core/enums/__init__.py +24 -0
  28. unrealon_core/enums/status.py +216 -0
  29. unrealon_core/enums/types.py +240 -0
  30. unrealon_core/error_handling/__init__.py +45 -0
  31. unrealon_core/error_handling/circuit_breaker.py +292 -0
  32. unrealon_core/error_handling/error_context.py +324 -0
  33. unrealon_core/error_handling/recovery.py +371 -0
  34. unrealon_core/error_handling/retry.py +268 -0
  35. unrealon_core/exceptions/__init__.py +46 -0
  36. unrealon_core/exceptions/base.py +292 -0
  37. unrealon_core/exceptions/communication.py +22 -0
  38. unrealon_core/exceptions/driver.py +11 -0
  39. unrealon_core/exceptions/proxy.py +11 -0
  40. unrealon_core/exceptions/task.py +12 -0
  41. unrealon_core/exceptions/validation.py +17 -0
  42. unrealon_core/models/__init__.py +79 -0
  43. unrealon_core/models/arq_context.py +252 -0
  44. unrealon_core/models/arq_responses.py +125 -0
  45. unrealon_core/models/base.py +291 -0
  46. unrealon_core/models/bridge_stats.py +58 -0
  47. unrealon_core/models/communication.py +39 -0
  48. unrealon_core/models/connection_stats.py +47 -0
  49. unrealon_core/models/driver.py +30 -0
  50. unrealon_core/models/driver_details.py +98 -0
  51. unrealon_core/models/logging.py +28 -0
  52. unrealon_core/models/task.py +21 -0
  53. unrealon_core/models/typed_responses.py +210 -0
  54. unrealon_core/models/websocket/__init__.py +91 -0
  55. unrealon_core/models/websocket/base.py +49 -0
  56. unrealon_core/models/websocket/config.py +200 -0
  57. unrealon_core/models/websocket/driver.py +215 -0
  58. unrealon_core/models/websocket/errors.py +138 -0
  59. unrealon_core/models/websocket/heartbeat.py +100 -0
  60. unrealon_core/models/websocket/logging.py +261 -0
  61. unrealon_core/models/websocket/proxy.py +496 -0
  62. unrealon_core/models/websocket/tasks.py +275 -0
  63. unrealon_core/models/websocket/utils.py +153 -0
  64. unrealon_core/models/websocket_session.py +144 -0
  65. unrealon_core/monitoring/__init__.py +43 -0
  66. unrealon_core/monitoring/alerts.py +398 -0
  67. unrealon_core/monitoring/dashboard.py +307 -0
  68. unrealon_core/monitoring/health_check.py +354 -0
  69. unrealon_core/monitoring/metrics.py +352 -0
  70. unrealon_core/utils/__init__.py +11 -0
  71. unrealon_core/utils/time.py +61 -0
  72. unrealon_core/version.py +219 -0
  73. unrealon_driver/__init__.py +90 -51
  74. unrealon_driver/core_module/__init__.py +34 -0
  75. unrealon_driver/core_module/base.py +184 -0
  76. unrealon_driver/core_module/config.py +30 -0
  77. unrealon_driver/core_module/event_manager.py +127 -0
  78. unrealon_driver/core_module/protocols.py +98 -0
  79. unrealon_driver/core_module/registry.py +146 -0
  80. unrealon_driver/decorators/__init__.py +15 -0
  81. unrealon_driver/decorators/retry.py +117 -0
  82. unrealon_driver/decorators/schedule.py +137 -0
  83. unrealon_driver/decorators/task.py +61 -0
  84. unrealon_driver/decorators/timing.py +132 -0
  85. unrealon_driver/driver/__init__.py +20 -0
  86. unrealon_driver/driver/communication/__init__.py +10 -0
  87. unrealon_driver/driver/communication/session.py +203 -0
  88. unrealon_driver/driver/communication/websocket_client.py +205 -0
  89. unrealon_driver/driver/core/__init__.py +10 -0
  90. unrealon_driver/driver/core/config.py +175 -0
  91. unrealon_driver/driver/core/driver.py +221 -0
  92. unrealon_driver/driver/factory/__init__.py +9 -0
  93. unrealon_driver/driver/factory/manager_factory.py +130 -0
  94. unrealon_driver/driver/lifecycle/__init__.py +11 -0
  95. unrealon_driver/driver/lifecycle/daemon.py +76 -0
  96. unrealon_driver/driver/lifecycle/initialization.py +97 -0
  97. unrealon_driver/driver/lifecycle/shutdown.py +48 -0
  98. unrealon_driver/driver/monitoring/__init__.py +9 -0
  99. unrealon_driver/driver/monitoring/health.py +63 -0
  100. unrealon_driver/driver/utilities/__init__.py +10 -0
  101. unrealon_driver/driver/utilities/logging.py +51 -0
  102. unrealon_driver/driver/utilities/serialization.py +61 -0
  103. unrealon_driver/managers/__init__.py +32 -0
  104. unrealon_driver/managers/base.py +174 -0
  105. unrealon_driver/managers/browser.py +98 -0
  106. unrealon_driver/managers/cache.py +116 -0
  107. unrealon_driver/managers/http.py +107 -0
  108. unrealon_driver/managers/logger.py +286 -0
  109. unrealon_driver/managers/proxy.py +99 -0
  110. unrealon_driver/managers/registry.py +87 -0
  111. unrealon_driver/managers/threading.py +54 -0
  112. unrealon_driver/managers/update.py +107 -0
  113. unrealon_driver/utils/__init__.py +9 -0
  114. unrealon_driver/utils/time.py +10 -0
  115. unrealon-1.1.6.dist-info/METADATA +0 -625
  116. unrealon-1.1.6.dist-info/RECORD +0 -55
  117. unrealon-1.1.6.dist-info/entry_points.txt +0 -9
  118. unrealon_browser/managers/stealth.py +0 -388
  119. unrealon_driver/README.md +0 -0
  120. unrealon_driver/exceptions.py +0 -33
  121. unrealon_driver/html_analyzer/__init__.py +0 -32
  122. unrealon_driver/html_analyzer/cleaner.py +0 -657
  123. unrealon_driver/html_analyzer/config.py +0 -64
  124. unrealon_driver/html_analyzer/manager.py +0 -247
  125. unrealon_driver/html_analyzer/models.py +0 -115
  126. unrealon_driver/html_analyzer/websocket_analyzer.py +0 -157
  127. unrealon_driver/models/__init__.py +0 -31
  128. unrealon_driver/models/websocket.py +0 -98
  129. unrealon_driver/parser/__init__.py +0 -36
  130. unrealon_driver/parser/cli_manager.py +0 -142
  131. unrealon_driver/parser/daemon_manager.py +0 -403
  132. unrealon_driver/parser/managers/__init__.py +0 -25
  133. unrealon_driver/parser/managers/config.py +0 -293
  134. unrealon_driver/parser/managers/error.py +0 -412
  135. unrealon_driver/parser/managers/result.py +0 -321
  136. unrealon_driver/parser/parser_manager.py +0 -458
  137. unrealon_driver/smart_logging/__init__.py +0 -24
  138. unrealon_driver/smart_logging/models.py +0 -44
  139. unrealon_driver/smart_logging/smart_logger.py +0 -406
  140. unrealon_driver/smart_logging/unified_logger.py +0 -525
  141. unrealon_driver/websocket/__init__.py +0 -31
  142. unrealon_driver/websocket/client.py +0 -249
  143. unrealon_driver/websocket/config.py +0 -188
  144. unrealon_driver/websocket/manager.py +0 -90
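
Taken together, the listing shows a major restructuring: a new unrealon_core package, heavily reworked unrealon_driver internals, and wholesale removal of the html_analyzer, smart_logging, parser, and websocket subpackages. As a hedged sketch of the resulting layout (inferred from the paths above and the three-line top_level.txt; the actual re-exports live in each package's __init__.py, which this diff shows only by line counts), the 2.0.5 wheel's top-level import surface would look like:

# Inferred top-level layout of the 2.0.5 wheel; the package names come from
# the file listing above, and the one-line summaries are assumptions based on
# the subpackage paths, not on inspected source.
import unrealon_browser  # browser manager, CLI, stealth, DTOs
import unrealon_core     # shared config, enums, exceptions, error handling, models, monitoring
import unrealon_driver   # driver core, decorators, managers, communication

The single hunk reproduced below is the full deletion of unrealon_driver/html_analyzer/cleaner.py (item 122, -657 lines).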
unrealon_driver/html_analyzer/cleaner.py
@@ -1,657 +0,0 @@
- """
- Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization.
-
- Intelligent HTML cleaning that removes noise but preserves useful data.
- Optimizes HTML for LLM token efficiency while keeping valuable content.
- """
-
- import json
- import re
- import asyncio
- import concurrent.futures
- from typing import Optional, List, Dict, Any, Tuple
- from pydantic import BaseModel, Field, ConfigDict
-
- from bs4 import BeautifulSoup, Comment
- from unrealon_driver.smart_logging import create_smart_logger
-
- from .config import HTMLCleaningConfig
-
-
- class HTMLCleaningStats(BaseModel):
-     """HTML cleaning statistics"""
-
-     model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-     original_size_bytes: int = Field(ge=0)
-     cleaned_size_bytes: int = Field(ge=0)
-     size_reduction_bytes: int = Field(ge=0)
-     size_reduction_percent: float = Field(ge=0.0, le=100.0)
-     estimated_original_tokens: int = Field(ge=0)
-     estimated_cleaned_tokens: int = Field(ge=0)
-     estimated_token_savings: int = Field(ge=0)
-     estimated_token_savings_percent: float = Field(ge=0.0, le=100.0)
-
-
- class ExtractedJSData(BaseModel):
-     """Extracted JavaScript data structure"""
-
-     model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-     ssr_data: Dict[str, Any] = Field(default_factory=dict)
-     structured_data: List[Dict[str, Any]] = Field(default_factory=list)
-     analytics_data: Dict[str, Any] = Field(default_factory=dict)
-     product_data: Dict[str, Any] = Field(default_factory=dict)
-     raw_extracts: List[Dict[str, Any]] = Field(default_factory=list)
-
-
- class HTMLCleaningError(Exception):
-     """Raised when HTML cleaning fails"""
-
-     def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
-         self.message = message
-         self.operation = operation
-         self.details = details or {}
-         super().__init__(message)
-
-
- class HTMLCleaner:
-     """
-     🧹 Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization
-
-     Features:
-     - Removes noise (scripts, styles, comments)
-     - Preserves useful JavaScript data (JSON objects, SSR data)
-     - Cleans whitespace and formatting
-     - Maintains semantic structure
-     - Extracts and preserves Next.js/Nuxt.js SSR data
-     - Optimizes for LLM token efficiency
-     """
-
-     def __init__(self, parser_id: str, config: Optional[HTMLCleaningConfig] = None):
-         self.config = config or HTMLCleaningConfig()
-
-         # Initialize smart logger
-         self.parser_id = parser_id
-         self.logger = create_smart_logger(parser_id=self.parser_id)
-
-         # Tags to completely remove
-         self.noise_tags = {"script", "style", "meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
-
-         # Add conditional tags based on config
-         if not self.config.preserve_forms:
-             self.noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
-
-         # Universal noise selectors to remove (for any site)
-         self.universal_noise_selectors = [
-             '[id*="nav"]',
-             '[class*="nav"]', # Navigation
-             '[id*="menu"]',
-             '[class*="menu"]', # Menus
-             '[id*="sidebar"]',
-             '[class*="sidebar"]', # Sidebars
-             '[id*="footer"]',
-             '[class*="footer"]', # Footers
-             '[id*="header"]',
-             '[class*="header"]', # Headers
-             '[class*="ads"]',
-             '[class*="advertisement"]', # Ads
-             '[class*="sponsored"]',
-             '[class*="promo"]', # Sponsored content
-             '[class*="popup"]',
-             '[class*="modal"]', # Popups/modals
-             '[class*="overlay"]',
-             '[class*="tooltip"]', # Overlays
-             '[class*="cookie"]',
-             '[class*="gdpr"]', # Cookie notices
-             '[class*="newsletter"]',
-             '[class*="subscription"]', # Email signup
-             '[class*="social"]',
-             '[class*="share"]', # Social media
-             '[class*="comment"]',
-             '[class*="discussion"]', # Comments
-             '[class*="tracking"]',
-             '[class*="analytics"]', # Tracking
-         ]
-
-         # Attributes to keep (semantic ones)
-         self.keep_attributes = {"id", "class", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role", "alt", "title", "href", "src", "action", "name", "value", "placeholder", "type"}
-
-         # Compile regex patterns for performance
-         self._compile_patterns()
-
-     def _compile_patterns(self) -> None:
-         """Compile regex patterns for performance"""
-         # URL patterns to remove or shorten (for tracking/analytics)
-         self.tracking_url_patterns = [
-             r'https://aax-[^\s"]{200,}', # Amazon tracking URLs over 200 chars
-             r'https://[^\s"]*tracking[^\s"]{100,}', # General tracking URLs
-             r'https://[^\s"]*analytics[^\s"]{100,}', # Analytics URLs
-             r'https://[^\s"]*gtm[^\s"]{100,}', # Google Tag Manager URLs
-         ]
-
-         # Base64 patterns to remove or replace
-         self.base64_patterns = [
-             r"data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}", # Base64 images over 50 chars
-             r"data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}", # Base64 applications
-             r"data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}", # Base64 text
-         ]
-
-         # Patterns to detect valuable JavaScript data
-         self.useful_js_patterns = [
-             # Next.js/Nuxt.js SSR data
-             r"__NEXT_DATA__\s*=\s*(\{.+?\});?",
-             r"__NUXT__\s*=\s*(\{.+?\});?",
-             r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?",
-             # React/Vue hydration data
-             r"window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?",
-             r"window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?",
-             # E-commerce data
-             r"window\.productData\s*=\s*(\{.+?\});?",
-             r"window\.cartData\s*=\s*(\{.+?\});?",
-             r"dataLayer\s*=\s*(\[.+?\]);?",
-             # Analytics and tracking (structured data)
-             r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
-             # JSON-LD structured data (often in script tags)
-             r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
-             # Generic JSON objects (be more selective)
-             r"(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?",
-         ]
-
-         # Compiled regex patterns for efficiency
-         self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE) for pattern in self.useful_js_patterns]
-
-     # ==========================================
-     # MAIN CLEANING METHODS
-     # ==========================================
-
-     async def clean_html(self, html_content: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, Dict[str, Any]]:
-         """
-         Clean HTML content while preserving valuable data
-
-         Args:
-             html_content: Raw HTML content
-             preserve_js_data: Whether to extract and preserve JS data
-             aggressive_cleaning: Whether to apply more aggressive cleaning
-
-         Returns:
-             Tuple of (cleaned_html, extracted_data)
-         """
-         if not html_content or not html_content.strip():
-             return "", {}
-
-         try:
-             self.logger.info(f"🧹 Cleaning HTML: {len(html_content)} characters")
-
-             # Check size limits
-             if len(html_content) > self.config.max_html_size:
-                 self.logger.warning(f"⚠️ HTML size ({len(html_content)}) exceeds limit ({self.config.max_html_size}), truncating")
-                 html_content = html_content[: self.config.max_html_size]
-
-             # Parse HTML
-             soup = BeautifulSoup(html_content, "html.parser")
-
-             extracted_data = {}
-
-             # Extract valuable JavaScript data before removing scripts
-             if preserve_js_data:
-                 extracted_data = self._extract_js_data(soup)
-
-             # Remove universal noise elements for aggressive cleaning
-             if aggressive_cleaning:
-                 self._remove_universal_noise(soup)
-                 self._truncate_long_urls(soup) # Do this before tracking URL cleaning
-                 self._clean_tracking_urls(soup)
-                 self._clean_base64_data(soup)
-                 self._remove_long_attributes(soup)
-                 self._remove_html_comments(soup)
-                 self._clean_whitespace(soup)
-
-             # Remove noise elements
-             self._remove_noise_elements(soup)
-
-             # Clean attributes
-             self._clean_attributes(soup, aggressive_cleaning)
-
-             # Remove comments
-             self._remove_comments(soup)
-
-             # Clean text and whitespace
-             cleaned_html = self._clean_text_and_whitespace(soup)
-
-             # Final cleanup
-             cleaned_html = self._final_cleanup(cleaned_html)
-
-             # Log results
-             original_size = len(html_content)
-             cleaned_size = len(cleaned_html)
-             reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
-
-             self.logger.info(f"✅ HTML cleaned: {original_size} → {cleaned_size} chars " f"({reduction:.1f}% reduction)")
-
-             return cleaned_html, extracted_data
-
-         except Exception as e:
-             self.logger.error(f"❌ HTML cleaning failed: {e}")
-             raise HTMLCleaningError(message=f"Failed to clean HTML: {e}", operation="clean_html", details={"html_size": str(len(html_content))}) from e
-
-     def clean_html_sync(self, html_content: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
-         """
-         Synchronous HTML cleaning
-
-         Args:
-             html_content: Raw HTML content
-             **kwargs: Cleaning options
-
-         Returns:
-             Tuple of (cleaned_html, extracted_data)
-         """
-         # Handle running event loop
-         try:
-             loop = asyncio.get_running_loop()
-             # If we're in an event loop, create a new thread
-             with concurrent.futures.ThreadPoolExecutor() as executor:
-                 future = executor.submit(asyncio.run, self.clean_html(html_content, **kwargs))
-                 return future.result()
-         except RuntimeError:
-             # No event loop running, safe to use asyncio.run
-             return asyncio.run(self.clean_html(html_content, **kwargs))
-
-     # ==========================================
-     # CLEANING IMPLEMENTATION
-     # ==========================================
-
-     def _standard_cleaning(self, soup: BeautifulSoup) -> None:
-         """Apply standard cleaning"""
-         # Remove noise elements
-         self._remove_noise_elements(soup)
-
-         # Clean attributes
-         self._clean_attributes(soup)
-
-         # Remove comments
-         if self.config.remove_comments:
-             self._remove_comments(soup)
-
-         # Normalize whitespace
-         if self.config.normalize_whitespace:
-             self._normalize_whitespace(soup)
-
-     def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
-         """Apply aggressive cleaning"""
-         # Standard cleaning first
-         self._standard_cleaning(soup)
-
-         # Remove noise selectors
-         self._remove_noise_selectors(soup)
-
-         # Clean tracking URLs
-         if self.config.remove_tracking:
-             self._clean_tracking_urls(soup)
-
-         # Clean base64 data
-         self._clean_base64_data(soup)
-
-         # Truncate long URLs
-         self._truncate_long_urls(soup)
-
-         # Remove long attributes
-         self._remove_long_attributes(soup)
-
-         # Truncate long text
-         self._truncate_long_text(soup)
-
-     def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
-         """Remove noise HTML elements"""
-         # Define noise tags
-         noise_tags = {"meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
-
-         # Add conditional tags
-         if self.config.remove_scripts:
-             noise_tags.add("script")
-         if self.config.remove_styles:
-             noise_tags.add("style")
-         if not self.config.preserve_forms:
-             noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
-
-         # Remove noise tags
-         for tag_name in noise_tags:
-             for tag in soup.find_all(tag_name):
-                 tag.decompose()
-
-         # Remove empty elements
-         if self.config.remove_empty_elements:
-             for tag in soup.find_all(["div", "span", "p"]):
-                 if not tag.get_text(strip=True) and not tag.find_all():
-                     tag.decompose()
-
-     def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
-         """Remove elements matching noise selectors"""
-         for selector in self.config.noise_selectors:
-             try:
-                 elements = soup.select(selector)
-                 for element in elements:
-                     element.decompose()
-             except Exception:
-                 # Skip invalid selectors
-                 continue
-
-     def _clean_attributes(self, soup: BeautifulSoup) -> None:
-         """Clean HTML attributes"""
-         # Attributes to remove
-         noise_attributes = {
-             "style",
-             "onclick",
-             "onload",
-             "onchange",
-             "onmouseover",
-             "onmouseout",
-             "onfocus",
-             "onblur",
-             "onsubmit",
-             "onreset",
-             "onerror",
-             "onabort",
-             "autocomplete",
-             "autofocus",
-             "checked",
-             "defer",
-             "disabled",
-             "hidden",
-             "loop",
-             "multiple",
-             "muted",
-             "open",
-             "readonly",
-             "required",
-             "tabindex",
-             "translate",
-             "draggable",
-             "contenteditable",
-         }
-
-         # Attributes to keep
-         keep_attributes = {"id", "class", "href", "src", "alt", "title", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role"}
-
-         for tag in soup.find_all(True):
-             if hasattr(tag, "attrs"):
-                 # Remove unwanted attributes
-                 attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
-                 for attr in attrs_to_remove:
-                     if attr in noise_attributes:
-                         del tag.attrs[attr]
-
-     def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
-         """Remove or replace tracking URLs"""
-         # Clean href attributes
-         for tag in soup.find_all(["a"], href=True):
-             href = tag.get("href", "")
-             if href:
-                 for pattern in self.tracking_url_patterns:
-                     if pattern.match(href):
-                         tag["href"] = "#tracking-url-removed"
-                         break
-
-         # Clean src attributes
-         for tag in soup.find_all(["img"], src=True):
-             src = tag.get("src", "")
-             if src:
-                 for pattern in self.tracking_url_patterns:
-                     if pattern.match(src):
-                         tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                         break
-
-     def _clean_base64_data(self, soup: BeautifulSoup) -> None:
-         """Remove large base64 encoded data"""
-         for tag in soup.find_all(["img"], src=True):
-             src = tag.get("src", "")
-             if src:
-                 for pattern in self.base64_patterns:
-                     if pattern.search(src):
-                         tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                         break
-
-     def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
-         """Truncate URLs longer than max_url_length"""
-         max_length = self.config.max_url_length
-
-         for tag in soup.find_all(["a"], href=True):
-             href = tag.get("href", "")
-             if isinstance(href, str) and len(href) > max_length:
-                 tag["href"] = href[:max_length] + "...truncated"
-
-         for tag in soup.find_all(["img"], src=True):
-             src = tag.get("src", "")
-             if isinstance(src, str) and len(src) > max_length and not src.startswith("data:"):
-                 tag["src"] = src[:max_length] + "...truncated"
-
-     def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
-         """Remove attributes with extremely long values"""
-         for tag in soup.find_all():
-             attrs_to_remove = []
-             for attr, value in tag.attrs.items():
-                 if isinstance(value, str) and len(value) > 800:
-                     attrs_to_remove.append(attr)
-                 elif any(tracking in attr.lower() for tracking in ["tracking", "analytics", "gtm", "pixel"]):
-                     attrs_to_remove.append(attr)
-
-             for attr in attrs_to_remove:
-                 del tag.attrs[attr]
-
-     def _truncate_long_text(self, soup: BeautifulSoup) -> None:
-         """Truncate text content longer than max_text_length"""
-         max_length = self.config.max_text_length
-
-         for element in soup.find_all(text=True):
-             if element.parent.name not in ["script", "style"]:
-                 text_content = str(element).strip()
-                 if text_content and len(text_content) > max_length:
-                     truncated_text = text_content[:max_length] + "..."
-                     element.replace_with(truncated_text)
-
-     def _remove_comments(self, soup: BeautifulSoup) -> None:
-         """Remove HTML comments"""
-         for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-             comment.extract()
-
-     def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
-         """Normalize whitespace in text content"""
-         for element in soup.find_all(text=True):
-             if element.parent.name not in ["script", "style"]:
-                 # Replace multiple spaces with single space
-                 cleaned_text = re.sub(r" {3,}", " ", str(element))
-                 # Replace multiple newlines with maximum 2
-                 cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)
-                 # Replace multiple tabs with single space
-                 cleaned_text = re.sub(r"\t+", " ", cleaned_text)
-                 element.replace_with(cleaned_text)
-
-     def _final_cleanup(self, html: str) -> str:
-         """Final cleanup and optimization"""
-         # Remove empty attributes
-         html = re.sub(r'\s+\w+=""', "", html)
-
-         # Remove extra spaces in attributes
-         html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
-
-         # Normalize quotes
-         html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
-
-         # Remove trailing spaces before closing tags
-         html = re.sub(r"\s+(/?>)", r"\1", html)
-
-         # Advanced whitespace cleanup
-         html = self._advanced_whitespace_cleanup(html)
-
-         return html.strip()
-
-     def _advanced_whitespace_cleanup(self, html: str) -> str:
-         """Advanced whitespace cleanup"""
-         # Remove excessive spaces
-         html = re.sub(r" {3,}", " ", html)
-
-         # Remove excessive newlines
-         html = re.sub(r"\n{3,}", "\n\n", html)
-
-         # Clean space between tags
-         html = re.sub(r">\s{2,}<", "> <", html)
-
-         return html
-
-     # ==========================================
-     # JAVASCRIPT DATA EXTRACTION
-     # ==========================================
-
-     def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
-         """Extract valuable JavaScript data"""
-         extracted_data = ExtractedJSData()
-
-         # Find all script tags
-         script_tags = soup.find_all("script")
-
-         for script in script_tags:
-             if not script.string:
-                 continue
-
-             script_content = script.string.strip()
-
-             # Skip empty scripts
-             if len(script_content) < 10:
-                 continue
-
-             # Check for JSON-LD structured data
-             if script.get("type") == "application/ld+json":
-                 try:
-                     json_data = json.loads(script_content)
-                     # Convert to string dict for Pydantic compliance
-                     str_data = {str(k): str(v) for k, v in json_data.items() if isinstance(k, (str, int, float))}
-                     extracted_data.structured_data.append(str_data)
-                     continue
-                 except json.JSONDecodeError:
-                     pass
-
-             # Extract data using patterns
-             self._extract_with_patterns(script_content, extracted_data)
-
-         return extracted_data
-
-     def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
-         """Extract data using compiled regex patterns"""
-         for pattern in self.js_data_patterns:
-             matches = pattern.finditer(script_content)
-             for match in matches:
-                 self._try_parse_json(match.group(1), extracted_data)
-
-     def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
-         """Try to parse JSON string and add to extracted data"""
-         try:
-             json_data = json.loads(json_str)
-
-             if isinstance(json_data, dict):
-                 # Convert to string dict for Pydantic compliance
-                 str_data = {}
-                 for k, v in json_data.items():
-                     if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
-                         str_data[str(k)] = str(v)
-
-                 if str_data:
-                     extracted_data.ssr_data.update(str_data)
-
-         except json.JSONDecodeError:
-             # Skip invalid JSON
-             pass
-
-     # ==========================================
-     # UTILITY METHODS
-     # ==========================================
-
-     def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
-         """Get statistics about the cleaning process"""
-         original_size = len(original_html)
-         cleaned_size = len(cleaned_html)
-
-         # Estimate token reduction (rough approximation)
-         original_tokens = original_size // 4 # Rough estimate: 4 chars per token
-         cleaned_tokens = cleaned_size // 4
-
-         size_reduction = original_size - cleaned_size
-         size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
-         token_savings = original_tokens - cleaned_tokens
-         token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
-
-         return HTMLCleaningStats(
-             original_size_bytes=original_size,
-             cleaned_size_bytes=cleaned_size,
-             size_reduction_bytes=size_reduction,
-             size_reduction_percent=size_reduction_percent,
-             estimated_original_tokens=original_tokens,
-             estimated_cleaned_tokens=cleaned_tokens,
-             estimated_token_savings=token_savings,
-             estimated_token_savings_percent=token_savings_percent,
-         )
-
-     def update_config(self, **kwargs) -> None:
-         """Update configuration with new values"""
-         current_data = self.config.model_dump()
-         current_data.update(kwargs)
-         self.config = HTMLCleaningConfig.model_validate(current_data)
-
-         # Recompile patterns if needed
-         self._compile_patterns()
-
-
- # ==========================================
- # CONVENIENCE FUNCTIONS
- # ==========================================
-
-
- def create_html_cleaner(parser_id: str, config: Optional[HTMLCleaningConfig] = None) -> HTMLCleaner:
-     """
-     Create an HTML cleaner instance
-
-     Args:
-         config: Optional HTML cleaning configuration
-         parser_id: Parser identifier for logging
-
-     Returns:
-         Configured HTMLCleaner instance
-     """
-     return HTMLCleaner(parser_id=parser_id, config=config)
-
-
- async def quick_clean_html(html: str, parser_id: str, **kwargs) -> str:
-     """
-     Quick HTML cleaning convenience function
-
-     Args:
-         html: Raw HTML content
-         parser_id: Parser identifier for logging
-         **kwargs: Cleaning options
-
-     Returns:
-         Cleaned HTML
-     """
-     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
-     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-     cleaner = create_html_cleaner(parser_id, config)
-     return await cleaner.clean_html(html, **kwargs)
-
-
- def quick_clean_html_sync(html: str, parser_id: str, **kwargs) -> str:
-     """
-     Quick synchronous HTML cleaning convenience function
-
-     Args:
-         html: Raw HTML content
-         parser_id: Parser identifier for logging
-         **kwargs: Cleaning options
-
-     Returns:
-         Cleaned HTML
-     """
-     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
-     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-     cleaner = create_html_cleaner(parser_id, config)
-     return cleaner.clean_html_sync(html, **kwargs)
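
For readers assessing this removal, here is a minimal usage sketch (not taken from the package or its docs) of the deleted cleaner.py API as it existed in 1.1.6, reconstructed from the signatures in the hunk above. The parser_id value and the sample HTML are illustrative assumptions; the whole unrealon_driver.html_analyzer package is gone in 2.0.5, so this import no longer resolves there.

import asyncio

# 1.1.6-era import path; the module is removed in 2.0.5.
from unrealon_driver.html_analyzer.cleaner import create_html_cleaner


async def main() -> None:
    # parser_id is an arbitrary logging label (assumed value).
    cleaner = create_html_cleaner(parser_id="demo-parser")

    raw_html = '<html><head><script>window.productData = {"sku": "A1"};</script></head><body><p>Hello</p></body></html>'

    # clean_html returns a (cleaned_html, extracted_data) tuple, per its docstring above.
    cleaned_html, extracted_data = await cleaner.clean_html(raw_html, preserve_js_data=True)

    stats = cleaner.get_cleaning_stats(raw_html, cleaned_html)
    print(cleaned_html)
    print(extracted_data)
    print(f"~{stats.size_reduction_percent:.1f}% smaller")


asyncio.run(main())

The quick_clean_html and quick_clean_html_sync helpers above wrapped this same flow behind a single call.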