unrealon 1.1.1__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. unrealon/__init__.py +16 -6
  2. unrealon-1.1.4.dist-info/METADATA +658 -0
  3. unrealon-1.1.4.dist-info/RECORD +54 -0
  4. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/entry_points.txt +1 -1
  5. unrealon_browser/__init__.py +3 -6
  6. unrealon_browser/core/browser_manager.py +86 -84
  7. unrealon_browser/dto/models/config.py +2 -0
  8. unrealon_browser/managers/captcha.py +165 -185
  9. unrealon_browser/managers/cookies.py +57 -28
  10. unrealon_browser/managers/logger_bridge.py +94 -34
  11. unrealon_browser/managers/profile.py +186 -158
  12. unrealon_browser/managers/stealth.py +58 -47
  13. unrealon_driver/__init__.py +8 -21
  14. unrealon_driver/exceptions.py +5 -0
  15. unrealon_driver/html_analyzer/__init__.py +32 -0
  16. unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
  17. unrealon_driver/html_analyzer/config.py +64 -0
  18. unrealon_driver/html_analyzer/manager.py +247 -0
  19. unrealon_driver/html_analyzer/models.py +115 -0
  20. unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
  21. unrealon_driver/models/__init__.py +31 -0
  22. unrealon_driver/models/websocket.py +98 -0
  23. unrealon_driver/parser/__init__.py +4 -23
  24. unrealon_driver/parser/cli_manager.py +6 -5
  25. unrealon_driver/parser/daemon_manager.py +242 -66
  26. unrealon_driver/parser/managers/__init__.py +0 -21
  27. unrealon_driver/parser/managers/config.py +15 -3
  28. unrealon_driver/parser/parser_manager.py +225 -395
  29. unrealon_driver/smart_logging/__init__.py +24 -0
  30. unrealon_driver/smart_logging/models.py +44 -0
  31. unrealon_driver/smart_logging/smart_logger.py +406 -0
  32. unrealon_driver/smart_logging/unified_logger.py +525 -0
  33. unrealon_driver/websocket/__init__.py +31 -0
  34. unrealon_driver/websocket/client.py +249 -0
  35. unrealon_driver/websocket/config.py +188 -0
  36. unrealon_driver/websocket/manager.py +90 -0
  37. unrealon-1.1.1.dist-info/METADATA +0 -722
  38. unrealon-1.1.1.dist-info/RECORD +0 -82
  39. unrealon_bridge/__init__.py +0 -114
  40. unrealon_bridge/cli.py +0 -316
  41. unrealon_bridge/client/__init__.py +0 -93
  42. unrealon_bridge/client/base.py +0 -78
  43. unrealon_bridge/client/commands.py +0 -89
  44. unrealon_bridge/client/connection.py +0 -90
  45. unrealon_bridge/client/events.py +0 -65
  46. unrealon_bridge/client/health.py +0 -38
  47. unrealon_bridge/client/html_parser.py +0 -146
  48. unrealon_bridge/client/logging.py +0 -139
  49. unrealon_bridge/client/proxy.py +0 -70
  50. unrealon_bridge/client/scheduler.py +0 -450
  51. unrealon_bridge/client/session.py +0 -70
  52. unrealon_bridge/configs/__init__.py +0 -14
  53. unrealon_bridge/configs/bridge_config.py +0 -212
  54. unrealon_bridge/configs/bridge_config.yaml +0 -39
  55. unrealon_bridge/models/__init__.py +0 -138
  56. unrealon_bridge/models/base.py +0 -28
  57. unrealon_bridge/models/command.py +0 -41
  58. unrealon_bridge/models/events.py +0 -40
  59. unrealon_bridge/models/html_parser.py +0 -79
  60. unrealon_bridge/models/logging.py +0 -55
  61. unrealon_bridge/models/parser.py +0 -63
  62. unrealon_bridge/models/proxy.py +0 -41
  63. unrealon_bridge/models/requests.py +0 -95
  64. unrealon_bridge/models/responses.py +0 -88
  65. unrealon_bridge/models/scheduler.py +0 -592
  66. unrealon_bridge/models/session.py +0 -28
  67. unrealon_bridge/server/__init__.py +0 -91
  68. unrealon_bridge/server/base.py +0 -171
  69. unrealon_bridge/server/handlers/__init__.py +0 -23
  70. unrealon_bridge/server/handlers/command.py +0 -110
  71. unrealon_bridge/server/handlers/html_parser.py +0 -139
  72. unrealon_bridge/server/handlers/logging.py +0 -95
  73. unrealon_bridge/server/handlers/parser.py +0 -95
  74. unrealon_bridge/server/handlers/proxy.py +0 -75
  75. unrealon_bridge/server/handlers/scheduler.py +0 -545
  76. unrealon_bridge/server/handlers/session.py +0 -66
  77. unrealon_driver/browser/__init__.py +0 -8
  78. unrealon_driver/browser/config.py +0 -74
  79. unrealon_driver/browser/manager.py +0 -416
  80. unrealon_driver/parser/managers/browser.py +0 -51
  81. unrealon_driver/parser/managers/logging.py +0 -609
  82. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/WHEEL +0 -0
  83. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/licenses/LICENSE +0 -0
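The listing above captures the 1.1.4 restructuring: the entire unrealon_bridge package is deleted along with the old unrealon_driver.browser and unrealon_driver.parser.managers helpers, while new html_analyzer, smart_logging, models, and websocket packages appear under unrealon_driver. As a rough orientation, a hedged import-path sketch of the move; the module paths come from the listing above, the class and factory names from the cleaner.py diff below, and anything beyond that is an assumption:

```python
# 1.1.1 imports that no longer resolve in 1.1.4 (their files are deleted above):
# from unrealon_bridge.client import ...                        # whole package removed
# from unrealon_driver.parser.managers.html import HTMLManager  # renamed, see below

# 1.1.4 equivalents, per the renamed and added files in this diff:
from unrealon_driver.html_analyzer.cleaner import HTMLCleaner, create_html_cleaner
from unrealon_driver.smart_logging import create_smart_logger
```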
unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py}
@@ -1,123 +1,28 @@
 """
-HTML Manager - Smart HTML processing and cleaning with Pydantic v2
+Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization.

-Strict compliance with CRITICAL_REQUIREMENTS.md:
-- No Dict[str, Any] usage
-- Complete type annotations
-- Pydantic v2 models everywhere
-- Custom exception hierarchy
+Intelligent HTML cleaning that removes noise but preserves useful data.
+Optimizes HTML for LLM token efficiency while keeping valuable content.
 """

 import json
 import re
-from typing import Optional, List, Union
-from pathlib import Path
-from pydantic import BaseModel, Field, ConfigDict, field_validator
 import asyncio
 import concurrent.futures
+from typing import Optional, List, Dict, Any, Tuple
+from pydantic import BaseModel, Field, ConfigDict

 from bs4 import BeautifulSoup, Comment
+from unrealon_driver.smart_logging import create_smart_logger

-from unrealon_rpc.logging import get_logger
-
-
-class HTMLCleaningConfig(BaseModel):
-    """HTML cleaning configuration with strict typing"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
-    # Cleaning modes
-    aggressive_cleaning: bool = Field(
-        default=True,
-        description="Enable aggressive cleaning"
-    )
-    preserve_js_data: bool = Field(
-        default=True,
-        description="Preserve JavaScript data during cleaning"
-    )
-
-    # Content preservation
-    preserve_images: bool = Field(
-        default=False,
-        description="Preserve image tags"
-    )
-    preserve_links: bool = Field(
-        default=True,
-        description="Preserve link tags"
-    )
-    preserve_forms: bool = Field(
-        default=False,
-        description="Preserve form elements"
-    )
-
-    # Size limits
-    max_html_size: int = Field(
-        default=1000000,
-        ge=1000,
-        le=10000000,
-        description="Maximum HTML size in characters"
-    )
-    max_text_length: int = Field(
-        default=300,
-        ge=50,
-        le=1000,
-        description="Maximum text content length per element"
-    )
-    max_url_length: int = Field(
-        default=500,
-        ge=100,
-        le=2000,
-        description="Maximum URL length"
-    )
-
-    # Noise removal
-    remove_comments: bool = Field(
-        default=True,
-        description="Remove HTML comments"
-    )
-    remove_scripts: bool = Field(
-        default=True,
-        description="Remove script tags"
-    )
-    remove_styles: bool = Field(
-        default=True,
-        description="Remove style tags"
-    )
-    remove_tracking: bool = Field(
-        default=True,
-        description="Remove tracking URLs and attributes"
-    )
-
-    # Whitespace handling
-    normalize_whitespace: bool = Field(
-        default=True,
-        description="Normalize whitespace"
-    )
-    remove_empty_elements: bool = Field(
-        default=True,
-        description="Remove empty elements"
-    )
-
-    # Custom selectors
-    noise_selectors: List[str] = Field(
-        default_factory=lambda: [
-            '[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]',
-            '[class*="footer"]', '[class*="header"]', '[class*="ads"]',
-            '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'
-        ],
-        description="CSS selectors for noise elements to remove"
-    )
+from .config import HTMLCleaningConfig


 class HTMLCleaningStats(BaseModel):
     """HTML cleaning statistics"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
+
+    model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
     original_size_bytes: int = Field(ge=0)
     cleaned_size_bytes: int = Field(ge=0)
     size_reduction_bytes: int = Field(ge=0)
@@ -130,297 +35,296 @@ class HTMLCleaningStats(BaseModel):

 class ExtractedJSData(BaseModel):
     """Extracted JavaScript data structure"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
-    ssr_data: dict[str, str] = Field(default_factory=dict)
-    structured_data: List[dict[str, str]] = Field(default_factory=list)
-    raw_extracts: List[dict[str, str]] = Field(default_factory=list)
-
-
-class HTMLManagerError(Exception):
-    """Base exception for HTML manager"""
-    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
-        self.message = message
-        self.operation = operation
-        self.details = details or {}
-        super().__init__(message)

+    model_config = ConfigDict(validate_assignment=True, extra="forbid")

-class HTMLParsingError(HTMLManagerError):
-    """Raised when HTML parsing fails"""
-    pass
+    ssr_data: Dict[str, Any] = Field(default_factory=dict)
+    structured_data: List[Dict[str, Any]] = Field(default_factory=list)
+    analytics_data: Dict[str, Any] = Field(default_factory=dict)
+    product_data: Dict[str, Any] = Field(default_factory=dict)
+    raw_extracts: List[Dict[str, Any]] = Field(default_factory=list)


-class HTMLCleaningError(HTMLManagerError):
+class HTMLCleaningError(Exception):
     """Raised when HTML cleaning fails"""
-    pass
+
+    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
+        self.message = message
+        self.operation = operation
+        self.details = details or {}
+        super().__init__(message)


-class HTMLManager:
+class HTMLCleaner:
     """
-    🧹 HTML Manager - Smart HTML processing and cleaning
-
+    🧹 Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization
+
     Features:
-    - LLM Optimized: Removes noise, preserves valuable content
-    - Token Efficient: Reduces HTML size for cost-effective LLM analysis
-    - Smart Extraction: Preserves JavaScript data and structured content
-    - Performance: Fast cleaning with configurable aggressiveness
-    - Safe: Handles malformed HTML gracefully
-    - Type Safety: Full Pydantic v2 compliance
+    - Removes noise (scripts, styles, comments)
+    - Preserves useful JavaScript data (JSON objects, SSR data)
+    - Cleans whitespace and formatting
+    - Maintains semantic structure
+    - Extracts and preserves Next.js/Nuxt.js SSR data
+    - Optimizes for LLM token efficiency
     """
-
-    def __init__(self, config: Optional[HTMLCleaningConfig] = None):
+
+    def __init__(self, parser_id: str, config: Optional[HTMLCleaningConfig] = None):
         self.config = config or HTMLCleaningConfig()
-        self.logger = get_logger()
-
+
+        # Initialize smart logger
+        self.parser_id = parser_id
+        self.logger = create_smart_logger(parser_id=self.parser_id)
+
+        # Tags to completely remove
+        self.noise_tags = {"script", "style", "meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
+
+        # Add conditional tags based on config
+        if not self.config.preserve_forms:
+            self.noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
+
+        # Universal noise selectors to remove (for any site)
+        self.universal_noise_selectors = [
+            '[id*="nav"]',
+            '[class*="nav"]',  # Navigation
+            '[id*="menu"]',
+            '[class*="menu"]',  # Menus
+            '[id*="sidebar"]',
+            '[class*="sidebar"]',  # Sidebars
+            '[id*="footer"]',
+            '[class*="footer"]',  # Footers
+            '[id*="header"]',
+            '[class*="header"]',  # Headers
+            '[class*="ads"]',
+            '[class*="advertisement"]',  # Ads
+            '[class*="sponsored"]',
+            '[class*="promo"]',  # Sponsored content
+            '[class*="popup"]',
+            '[class*="modal"]',  # Popups/modals
+            '[class*="overlay"]',
+            '[class*="tooltip"]',  # Overlays
+            '[class*="cookie"]',
+            '[class*="gdpr"]',  # Cookie notices
+            '[class*="newsletter"]',
+            '[class*="subscription"]',  # Email signup
+            '[class*="social"]',
+            '[class*="share"]',  # Social media
+            '[class*="comment"]',
+            '[class*="discussion"]',  # Comments
+            '[class*="tracking"]',
+            '[class*="analytics"]',  # Tracking
+        ]
+
+        # Attributes to keep (semantic ones)
+        self.keep_attributes = {"id", "class", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role", "alt", "title", "href", "src", "action", "name", "value", "placeholder", "type"}
+
         # Compile regex patterns for performance
         self._compile_patterns()
-
+
     def _compile_patterns(self) -> None:
         """Compile regex patterns for performance"""
-        # Tracking URL patterns
+        # URL patterns to remove or shorten (for tracking/analytics)
         self.tracking_url_patterns = [
-            re.compile(r'https://aax-[^\s"]{200,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*tracking[^\s"]{100,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*analytics[^\s"]{100,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*gtm[^\s"]{100,}', re.IGNORECASE),
+            r'https://aax-[^\s"]{200,}',  # Amazon tracking URLs over 200 chars
+            r'https://[^\s"]*tracking[^\s"]{100,}',  # General tracking URLs
+            r'https://[^\s"]*analytics[^\s"]{100,}',  # Analytics URLs
+            r'https://[^\s"]*gtm[^\s"]{100,}',  # Google Tag Manager URLs
         ]
-
-        # Base64 patterns
+
+        # Base64 patterns to remove or replace
         self.base64_patterns = [
-            re.compile(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}'),
-            re.compile(r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
-            re.compile(r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
+            r"data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}",  # Base64 images over 50 chars
+            r"data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 applications
+            r"data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 text
        ]
-
-        # JavaScript data patterns
-        self.js_data_patterns = [
-            re.compile(r'__NEXT_DATA__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'__NUXT__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'dataLayer\s*=\s*(\[.+?\]);?', re.DOTALL | re.IGNORECASE),
+
+        # Patterns to detect valuable JavaScript data
+        self.useful_js_patterns = [
+            # Next.js/Nuxt.js SSR data
+            r"__NEXT_DATA__\s*=\s*(\{.+?\});?",
+            r"__NUXT__\s*=\s*(\{.+?\});?",
+            r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?",
+            # React/Vue hydration data
+            r"window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?",
+            r"window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?",
+            # E-commerce data
+            r"window\.productData\s*=\s*(\{.+?\});?",
+            r"window\.cartData\s*=\s*(\{.+?\});?",
+            r"dataLayer\s*=\s*(\[.+?\]);?",
+            # Analytics and tracking (structured data)
+            r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
+            # JSON-LD structured data (often in script tags)
+            r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
+            # Generic JSON objects (be more selective)
+            r"(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?",
         ]
-
+
+        # Compiled regex patterns for efficiency
+        self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE) for pattern in self.useful_js_patterns]
+
     # ==========================================
     # MAIN CLEANING METHODS
     # ==========================================
-
-    async def clean_html(
-        self,
-        html: str,
-        aggressive: Optional[bool] = None,
-        preserve_js_data: Optional[bool] = None
-    ) -> str:
+
+    async def clean_html(self, html_content: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, Dict[str, Any]]:
         """
-        Clean HTML content for LLM analysis
-
+        Clean HTML content while preserving valuable data
+
         Args:
-            html: Raw HTML content
-            aggressive: Override aggressive cleaning setting
-            preserve_js_data: Override JS data preservation setting
-
+            html_content: Raw HTML content
+            preserve_js_data: Whether to extract and preserve JS data
+            aggressive_cleaning: Whether to apply more aggressive cleaning
+
         Returns:
-            Cleaned HTML optimized for LLM
+            Tuple of (cleaned_html, extracted_data)
         """
-        if not html or not html.strip():
-            return ""
-
-        # Use config defaults or overrides
-        aggressive_cleaning = aggressive if aggressive is not None else self.config.aggressive_cleaning
-        preserve_js = preserve_js_data if preserve_js_data is not None else self.config.preserve_js_data
-
+        if not html_content or not html_content.strip():
+            return "", {}
+
         try:
-            self.logger.info(f"Cleaning HTML: {len(html)} characters")
-
+            self.logger.info(f"🧹 Cleaning HTML: {len(html_content)} characters")
+
             # Check size limits
-            if len(html) > self.config.max_html_size:
-                self.logger.warning(f"HTML size ({len(html)}) exceeds limit ({self.config.max_html_size})")
-                html = html[:self.config.max_html_size]
-
+            if len(html_content) > self.config.max_html_size:
+                self.logger.warning(f"⚠️ HTML size ({len(html_content)}) exceeds limit ({self.config.max_html_size}), truncating")
+                html_content = html_content[: self.config.max_html_size]
+
             # Parse HTML
-            soup = BeautifulSoup(html, 'html.parser')
-
-            # Extract JavaScript data before cleaning
-            extracted_data = ExtractedJSData()
-            if preserve_js:
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            extracted_data = {}
+
+            # Extract valuable JavaScript data before removing scripts
+            if preserve_js_data:
                 extracted_data = self._extract_js_data(soup)
-
-            # Apply cleaning steps
+
+            # Remove universal noise elements for aggressive cleaning
             if aggressive_cleaning:
-                self._aggressive_cleaning(soup)
-            else:
-                self._standard_cleaning(soup)
-
-            # Get cleaned HTML
-            cleaned_html = str(soup)
-
+                self._remove_universal_noise(soup)
+                self._truncate_long_urls(soup)  # Do this before tracking URL cleaning
+                self._clean_tracking_urls(soup)
+                self._clean_base64_data(soup)
+                self._remove_long_attributes(soup)
+                self._remove_html_comments(soup)
+                self._clean_whitespace(soup)
+
+            # Remove noise elements
+            self._remove_noise_elements(soup)
+
+            # Clean attributes
+            self._clean_attributes(soup, aggressive_cleaning)
+
+            # Remove comments
+            self._remove_comments(soup)
+
+            # Clean text and whitespace
+            cleaned_html = self._clean_text_and_whitespace(soup)
+
             # Final cleanup
             cleaned_html = self._final_cleanup(cleaned_html)
-
+
             # Log results
-            original_size = len(html)
+            original_size = len(html_content)
             cleaned_size = len(cleaned_html)
             reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
-
-            self.logger.info(
-                f"HTML cleaned: {original_size} → {cleaned_size} chars "
-                f"({reduction:.1f}% reduction)"
-            )
-
-            return cleaned_html
-
+
+            self.logger.info(f"✅ HTML cleaned: {original_size} → {cleaned_size} chars " f"({reduction:.1f}% reduction)")
+
+            return cleaned_html, extracted_data
+
         except Exception as e:
-            self.logger.error(f"HTML cleaning failed: {e}")
-            raise HTMLCleaningError(
-                message=f"Failed to clean HTML: {e}",
-                operation="clean_html",
-                details={"html_size": str(len(html))}
-            ) from e
-
-    def clean_html_sync(self, html: str, **kwargs) -> str:
+            self.logger.error(f"HTML cleaning failed: {e}")
+            raise HTMLCleaningError(message=f"Failed to clean HTML: {e}", operation="clean_html", details={"html_size": str(len(html_content))}) from e
+
+    def clean_html_sync(self, html_content: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
         """
         Synchronous HTML cleaning
-
+
         Args:
-            html: Raw HTML content
+            html_content: Raw HTML content
             **kwargs: Cleaning options
-
+
         Returns:
-            Cleaned HTML
+            Tuple of (cleaned_html, extracted_data)
         """
         # Handle running event loop
         try:
             loop = asyncio.get_running_loop()
             # If we're in an event loop, create a new thread
             with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, self.clean_html(html, **kwargs))
+                future = executor.submit(asyncio.run, self.clean_html(html_content, **kwargs))
                 return future.result()
         except RuntimeError:
             # No event loop running, safe to use asyncio.run
-            return asyncio.run(self.clean_html(html, **kwargs))
-
-    async def parse_and_clean_html(
-        self,
-        html: str,
-        schema: Optional[dict[str, str]] = None,
-        instructions: Optional[str] = None,
-        **kwargs
-    ) -> dict[str, str]:
-        """
-        Parse and clean HTML with LLM analysis preparation
-
-        Args:
-            html: Raw HTML content
-            schema: Optional data schema for extraction
-            instructions: Optional parsing instructions
-            **kwargs: Additional options
-
-        Returns:
-            Dictionary with cleaned HTML and metadata
-        """
-        try:
-            # Clean HTML
-            cleaned_html = await self.clean_html(html, **kwargs)
-
-            # Get cleaning stats
-            stats = self.get_cleaning_stats(html, cleaned_html)
-
-            result = {
-                "cleaned_html": cleaned_html,
-                "original_size": str(stats.original_size_bytes),
-                "cleaned_size": str(stats.cleaned_size_bytes),
-                "reduction_percent": f"{stats.size_reduction_percent:.1f}",
-                "estimated_token_savings": str(stats.estimated_token_savings)
-            }
-
-            if schema:
-                result["schema"] = str(schema)
-            if instructions:
-                result["instructions"] = instructions
-
-            return result
-
-        except Exception as e:
-            raise HTMLCleaningError(
-                message=f"Failed to parse and clean HTML: {e}",
-                operation="parse_and_clean_html"
-            ) from e
-
+            return asyncio.run(self.clean_html(html_content, **kwargs))
+
     # ==========================================
     # CLEANING IMPLEMENTATION
     # ==========================================
-
+
     def _standard_cleaning(self, soup: BeautifulSoup) -> None:
         """Apply standard cleaning"""
         # Remove noise elements
         self._remove_noise_elements(soup)
-
+
         # Clean attributes
         self._clean_attributes(soup)
-
+
         # Remove comments
         if self.config.remove_comments:
             self._remove_comments(soup)
-
+
         # Normalize whitespace
         if self.config.normalize_whitespace:
             self._normalize_whitespace(soup)
-
+
     def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
         """Apply aggressive cleaning"""
         # Standard cleaning first
         self._standard_cleaning(soup)
-
+
         # Remove noise selectors
         self._remove_noise_selectors(soup)
-
+
         # Clean tracking URLs
         if self.config.remove_tracking:
             self._clean_tracking_urls(soup)
-
+
         # Clean base64 data
         self._clean_base64_data(soup)
-
+
         # Truncate long URLs
         self._truncate_long_urls(soup)
-
+
         # Remove long attributes
         self._remove_long_attributes(soup)
-
+
         # Truncate long text
         self._truncate_long_text(soup)
-
+
     def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
         """Remove noise HTML elements"""
         # Define noise tags
-        noise_tags = {
-            'meta', 'link', 'base', 'title', 'head', 'noscript',
-            'iframe', 'embed', 'object', 'svg', 'canvas',
-            'audio', 'video', 'source', 'track', 'area', 'map', 'param'
-        }
-
+        noise_tags = {"meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
+
         # Add conditional tags
         if self.config.remove_scripts:
-            noise_tags.add('script')
+            noise_tags.add("script")
         if self.config.remove_styles:
-            noise_tags.add('style')
+            noise_tags.add("style")
         if not self.config.preserve_forms:
-            noise_tags.update({'form', 'input', 'button', 'select', 'textarea', 'fieldset', 'legend'})
-
+            noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
+
         # Remove noise tags
         for tag_name in noise_tags:
             for tag in soup.find_all(tag_name):
                 tag.decompose()
-
+
         # Remove empty elements
         if self.config.remove_empty_elements:
-            for tag in soup.find_all(['div', 'span', 'p']):
+            for tag in soup.find_all(["div", "span", "p"]):
                 if not tag.get_text(strip=True) and not tag.find_all():
                     tag.decompose()
-
+
     def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
         """Remove elements matching noise selectors"""
         for selector in self.config.noise_selectors:
@@ -431,78 +335,96 @@ class HTMLManager:
             except Exception:
                 # Skip invalid selectors
                 continue
-
+
     def _clean_attributes(self, soup: BeautifulSoup) -> None:
         """Clean HTML attributes"""
         # Attributes to remove
         noise_attributes = {
-            'style', 'onclick', 'onload', 'onchange', 'onmouseover',
-            'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
-            'onerror', 'onabort', 'autocomplete', 'autofocus',
-            'checked', 'defer', 'disabled', 'hidden', 'loop',
-            'multiple', 'muted', 'open', 'readonly', 'required',
-            'tabindex', 'translate', 'draggable', 'contenteditable'
+            "style",
+            "onclick",
+            "onload",
+            "onchange",
+            "onmouseover",
+            "onmouseout",
+            "onfocus",
+            "onblur",
+            "onsubmit",
+            "onreset",
+            "onerror",
+            "onabort",
+            "autocomplete",
+            "autofocus",
+            "checked",
+            "defer",
+            "disabled",
+            "hidden",
+            "loop",
+            "multiple",
+            "muted",
+            "open",
+            "readonly",
+            "required",
+            "tabindex",
+            "translate",
+            "draggable",
+            "contenteditable",
         }
-
+
         # Attributes to keep
-        keep_attributes = {
-            'id', 'class', 'href', 'src', 'alt', 'title',
-            'data-testid', 'data-test', 'data-cy',
-            'aria-label', 'aria-labelledby', 'aria-describedby', 'role'
-        }
-
+        keep_attributes = {"id", "class", "href", "src", "alt", "title", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role"}
+
         for tag in soup.find_all(True):
-            if hasattr(tag, 'attrs'):
+            if hasattr(tag, "attrs"):
                 # Remove unwanted attributes
                 attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
                 for attr in attrs_to_remove:
                     if attr in noise_attributes:
                         del tag.attrs[attr]
-
+
     def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
         """Remove or replace tracking URLs"""
         # Clean href attributes
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if href:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(href):
-                        tag['href'] = '#tracking-url-removed'
+                        tag["href"] = "#tracking-url-removed"
                         break
-
+
         # Clean src attributes
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(src):
-                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _clean_base64_data(self, soup: BeautifulSoup) -> None:
         """Remove large base64 encoded data"""
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.base64_patterns:
                     if pattern.search(src):
-                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
         """Truncate URLs longer than max_url_length"""
         max_length = self.config.max_url_length
-
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
+
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if isinstance(href, str) and len(href) > max_length:
-                tag['href'] = href[:max_length] + '...truncated'
-
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
-            if isinstance(src, str) and len(src) > max_length and not src.startswith('data:'):
-                tag['src'] = src[:max_length] + '...truncated'
-
+                tag["href"] = href[:max_length] + "...truncated"
+
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
+            if isinstance(src, str) and len(src) > max_length and not src.startswith("data:"):
+                tag["src"] = src[:max_length] + "...truncated"
+
     def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
         """Remove attributes with extremely long values"""
         for tag in soup.find_all():
@@ -510,96 +432,95 @@ class HTMLManager:
             for attr, value in tag.attrs.items():
                 if isinstance(value, str) and len(value) > 800:
                     attrs_to_remove.append(attr)
-                elif any(tracking in attr.lower() for tracking in
-                         ['tracking', 'analytics', 'gtm', 'pixel']):
+                elif any(tracking in attr.lower() for tracking in ["tracking", "analytics", "gtm", "pixel"]):
                     attrs_to_remove.append(attr)
-
+
             for attr in attrs_to_remove:
                 del tag.attrs[attr]
-
+
     def _truncate_long_text(self, soup: BeautifulSoup) -> None:
         """Truncate text content longer than max_text_length"""
         max_length = self.config.max_text_length
-
+
         for element in soup.find_all(text=True):
-            if element.parent.name not in ['script', 'style']:
+            if element.parent.name not in ["script", "style"]:
                 text_content = str(element).strip()
                 if text_content and len(text_content) > max_length:
-                    truncated_text = text_content[:max_length] + '...'
+                    truncated_text = text_content[:max_length] + "..."
                     element.replace_with(truncated_text)
-
+
     def _remove_comments(self, soup: BeautifulSoup) -> None:
         """Remove HTML comments"""
         for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
             comment.extract()
-
+
     def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
         """Normalize whitespace in text content"""
         for element in soup.find_all(text=True):
-            if element.parent.name not in ['script', 'style']:
+            if element.parent.name not in ["script", "style"]:
                 # Replace multiple spaces with single space
-                cleaned_text = re.sub(r' {3,}', ' ', str(element))
+                cleaned_text = re.sub(r" {3,}", " ", str(element))
                 # Replace multiple newlines with maximum 2
-                cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+                cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)
                 # Replace multiple tabs with single space
-                cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
+                cleaned_text = re.sub(r"\t+", " ", cleaned_text)
                 element.replace_with(cleaned_text)
-
+
     def _final_cleanup(self, html: str) -> str:
         """Final cleanup and optimization"""
         # Remove empty attributes
-        html = re.sub(r'\s+\w+=""', '', html)
-
+        html = re.sub(r'\s+\w+=""', "", html)
+
         # Remove extra spaces in attributes
         html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
-
+
         # Normalize quotes
         html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
-
+
         # Remove trailing spaces before closing tags
-        html = re.sub(r'\s+(/?>)', r'\1', html)
-
+        html = re.sub(r"\s+(/?>)", r"\1", html)
+
         # Advanced whitespace cleanup
         html = self._advanced_whitespace_cleanup(html)
-
+
         return html.strip()
-
+
     def _advanced_whitespace_cleanup(self, html: str) -> str:
         """Advanced whitespace cleanup"""
         # Remove excessive spaces
-        html = re.sub(r' {3,}', ' ', html)
-
+        html = re.sub(r" {3,}", " ", html)
+
         # Remove excessive newlines
-        html = re.sub(r'\n{3,}', '\n\n', html)
-
+        html = re.sub(r"\n{3,}", "\n\n", html)
+
         # Clean space between tags
-        html = re.sub(r'>\s{2,}<', '> <', html)
-
+        html = re.sub(r">\s{2,}<", "> <", html)
+
         return html
-
+
     # ==========================================
     # JAVASCRIPT DATA EXTRACTION
     # ==========================================
-
+
     def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
         """Extract valuable JavaScript data"""
         extracted_data = ExtractedJSData()
-
+
         # Find all script tags
-        script_tags = soup.find_all('script')
-
+        script_tags = soup.find_all("script")
+
         for script in script_tags:
             if not script.string:
                 continue
-
+
             script_content = script.string.strip()
-
+
             # Skip empty scripts
             if len(script_content) < 10:
                 continue
-
+
             # Check for JSON-LD structured data
-            if script.get('type') == 'application/ld+json':
+            if script.get("type") == "application/ld+json":
                 try:
                     json_data = json.loads(script_content)
                     # Convert to string dict for Pydantic compliance
@@ -608,56 +529,56 @@ class HTMLManager:
                     continue
                 except json.JSONDecodeError:
                     pass
-
+
             # Extract data using patterns
             self._extract_with_patterns(script_content, extracted_data)
-
+
         return extracted_data
-
+
     def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
         """Extract data using compiled regex patterns"""
         for pattern in self.js_data_patterns:
             matches = pattern.finditer(script_content)
             for match in matches:
                 self._try_parse_json(match.group(1), extracted_data)
-
+
     def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
         """Try to parse JSON string and add to extracted data"""
         try:
             json_data = json.loads(json_str)
-
+
             if isinstance(json_data, dict):
                 # Convert to string dict for Pydantic compliance
                 str_data = {}
                 for k, v in json_data.items():
                     if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
                         str_data[str(k)] = str(v)
-
+
                 if str_data:
                     extracted_data.ssr_data.update(str_data)
-
+
         except json.JSONDecodeError:
             # Skip invalid JSON
             pass
-
+
     # ==========================================
     # UTILITY METHODS
     # ==========================================
-
+
     def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
         """Get statistics about the cleaning process"""
         original_size = len(original_html)
         cleaned_size = len(cleaned_html)
-
+
         # Estimate token reduction (rough approximation)
         original_tokens = original_size // 4  # Rough estimate: 4 chars per token
         cleaned_tokens = cleaned_size // 4
-
+
         size_reduction = original_size - cleaned_size
         size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
         token_savings = original_tokens - cleaned_tokens
         token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
-
+
         return HTMLCleaningStats(
             original_size_bytes=original_size,
             cleaned_size_bytes=cleaned_size,
@@ -666,15 +587,15 @@ class HTMLManager:
             estimated_original_tokens=original_tokens,
             estimated_cleaned_tokens=cleaned_tokens,
             estimated_token_savings=token_savings,
-            estimated_token_savings_percent=token_savings_percent
+            estimated_token_savings_percent=token_savings_percent,
         )
-
+
     def update_config(self, **kwargs) -> None:
         """Update configuration with new values"""
         current_data = self.config.model_dump()
         current_data.update(kwargs)
         self.config = HTMLCleaningConfig.model_validate(current_data)
-
+
         # Recompile patterns if needed
         self._compile_patterns()

@@ -683,50 +604,54 @@ class HTMLManager:
 # CONVENIENCE FUNCTIONS
 # ==========================================

-def get_html_manager(config: Optional[HTMLCleaningConfig] = None) -> HTMLManager:
+
+def create_html_cleaner(parser_id: str, config: Optional[HTMLCleaningConfig] = None) -> HTMLCleaner:
     """
-    Get an HTML manager instance
-
+    Create an HTML cleaner instance
+
     Args:
         config: Optional HTML cleaning configuration
-
+        parser_id: Parser identifier for logging
+
     Returns:
-        Configured HTMLManager instance
+        Configured HTMLCleaner instance
     """
-    return HTMLManager(config=config)
+    return HTMLCleaner(parser_id=parser_id, config=config)


-async def quick_clean_html(html: str, **kwargs) -> str:
+async def quick_clean_html(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    manager = get_html_manager(config)
-    return await manager.clean_html(html, **kwargs)

+    cleaner = create_html_cleaner(parser_id, config)
+    return await cleaner.clean_html(html, **kwargs)

-def quick_clean_html_sync(html: str, **kwargs) -> str:
+
+def quick_clean_html_sync(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick synchronous HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    manager = get_html_manager(config)
-    return manager.clean_html_sync(html, **kwargs)
+
+    cleaner = create_html_cleaner(parser_id, config)
+    return cleaner.clean_html_sync(html, **kwargs)
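For callers migrating off the 1.1.1 HTMLManager, a minimal usage sketch against the new API exactly as its signatures appear in this diff; the parser_id value and sample HTML are illustrative, and any behavior beyond the shown signatures is an assumption:

```python
import asyncio

from unrealon_driver.html_analyzer.cleaner import create_html_cleaner


async def main() -> None:
    html = "<html><head><script>var a = 1;</script></head><body><p>Hello</p></body></html>"

    # 1.1.1: get_html_manager().clean_html(html) returned a plain string.
    # 1.1.4: the factory requires a parser_id, and clean_html returns a
    # (cleaned_html, extracted_data) tuple instead of a bare string.
    cleaner = create_html_cleaner(parser_id="demo-parser")  # parser_id is illustrative
    cleaned_html, extracted_data = await cleaner.clean_html(html)

    print(cleaned_html)    # noise-stripped markup
    print(extracted_data)  # preserved JS/SSR data, {} when none is found


asyncio.run(main())
```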