unrealon-1.1.1-py3-none-any.whl → unrealon-1.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +16 -6
- unrealon-1.1.5.dist-info/METADATA +621 -0
- unrealon-1.1.5.dist-info/RECORD +54 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/entry_points.txt +1 -1
- unrealon_browser/__init__.py +3 -6
- unrealon_browser/core/browser_manager.py +86 -84
- unrealon_browser/dto/models/config.py +2 -0
- unrealon_browser/managers/captcha.py +165 -185
- unrealon_browser/managers/cookies.py +57 -28
- unrealon_browser/managers/logger_bridge.py +94 -34
- unrealon_browser/managers/profile.py +186 -158
- unrealon_browser/managers/stealth.py +58 -47
- unrealon_driver/__init__.py +8 -21
- unrealon_driver/exceptions.py +5 -0
- unrealon_driver/html_analyzer/__init__.py +32 -0
- unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
- unrealon_driver/html_analyzer/config.py +64 -0
- unrealon_driver/html_analyzer/manager.py +247 -0
- unrealon_driver/html_analyzer/models.py +115 -0
- unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
- unrealon_driver/models/__init__.py +31 -0
- unrealon_driver/models/websocket.py +98 -0
- unrealon_driver/parser/__init__.py +4 -23
- unrealon_driver/parser/cli_manager.py +6 -5
- unrealon_driver/parser/daemon_manager.py +242 -66
- unrealon_driver/parser/managers/__init__.py +0 -21
- unrealon_driver/parser/managers/config.py +15 -3
- unrealon_driver/parser/parser_manager.py +225 -395
- unrealon_driver/smart_logging/__init__.py +24 -0
- unrealon_driver/smart_logging/models.py +44 -0
- unrealon_driver/smart_logging/smart_logger.py +406 -0
- unrealon_driver/smart_logging/unified_logger.py +525 -0
- unrealon_driver/websocket/__init__.py +31 -0
- unrealon_driver/websocket/client.py +249 -0
- unrealon_driver/websocket/config.py +188 -0
- unrealon_driver/websocket/manager.py +90 -0
- unrealon-1.1.1.dist-info/METADATA +0 -722
- unrealon-1.1.1.dist-info/RECORD +0 -82
- unrealon_bridge/__init__.py +0 -114
- unrealon_bridge/cli.py +0 -316
- unrealon_bridge/client/__init__.py +0 -93
- unrealon_bridge/client/base.py +0 -78
- unrealon_bridge/client/commands.py +0 -89
- unrealon_bridge/client/connection.py +0 -90
- unrealon_bridge/client/events.py +0 -65
- unrealon_bridge/client/health.py +0 -38
- unrealon_bridge/client/html_parser.py +0 -146
- unrealon_bridge/client/logging.py +0 -139
- unrealon_bridge/client/proxy.py +0 -70
- unrealon_bridge/client/scheduler.py +0 -450
- unrealon_bridge/client/session.py +0 -70
- unrealon_bridge/configs/__init__.py +0 -14
- unrealon_bridge/configs/bridge_config.py +0 -212
- unrealon_bridge/configs/bridge_config.yaml +0 -39
- unrealon_bridge/models/__init__.py +0 -138
- unrealon_bridge/models/base.py +0 -28
- unrealon_bridge/models/command.py +0 -41
- unrealon_bridge/models/events.py +0 -40
- unrealon_bridge/models/html_parser.py +0 -79
- unrealon_bridge/models/logging.py +0 -55
- unrealon_bridge/models/parser.py +0 -63
- unrealon_bridge/models/proxy.py +0 -41
- unrealon_bridge/models/requests.py +0 -95
- unrealon_bridge/models/responses.py +0 -88
- unrealon_bridge/models/scheduler.py +0 -592
- unrealon_bridge/models/session.py +0 -28
- unrealon_bridge/server/__init__.py +0 -91
- unrealon_bridge/server/base.py +0 -171
- unrealon_bridge/server/handlers/__init__.py +0 -23
- unrealon_bridge/server/handlers/command.py +0 -110
- unrealon_bridge/server/handlers/html_parser.py +0 -139
- unrealon_bridge/server/handlers/logging.py +0 -95
- unrealon_bridge/server/handlers/parser.py +0 -95
- unrealon_bridge/server/handlers/proxy.py +0 -75
- unrealon_bridge/server/handlers/scheduler.py +0 -545
- unrealon_bridge/server/handlers/session.py +0 -66
- unrealon_driver/browser/__init__.py +0 -8
- unrealon_driver/browser/config.py +0 -74
- unrealon_driver/browser/manager.py +0 -416
- unrealon_driver/parser/managers/browser.py +0 -51
- unrealon_driver/parser/managers/logging.py +0 -609
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/WHEEL +0 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/licenses/LICENSE +0 -0
unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py}

--- unrealon_driver/parser/managers/html.py
+++ unrealon_driver/html_analyzer/cleaner.py
@@ -1,123 +1,28 @@
 """
-HTML
+Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization.
 
-
-
-- Complete type annotations
-- Pydantic v2 models everywhere
-- Custom exception hierarchy
+Intelligent HTML cleaning that removes noise but preserves useful data.
+Optimizes HTML for LLM token efficiency while keeping valuable content.
 """
 
 import json
 import re
-from typing import Optional, List, Union
-from pathlib import Path
-from pydantic import BaseModel, Field, ConfigDict, field_validator
 import asyncio
 import concurrent.futures
+from typing import Optional, List, Dict, Any, Tuple
+from pydantic import BaseModel, Field, ConfigDict
 
 from bs4 import BeautifulSoup, Comment
+from unrealon_driver.smart_logging import create_smart_logger
 
-from
-
-
-class HTMLCleaningConfig(BaseModel):
-    """HTML cleaning configuration with strict typing"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
-    # Cleaning modes
-    aggressive_cleaning: bool = Field(
-        default=True,
-        description="Enable aggressive cleaning"
-    )
-    preserve_js_data: bool = Field(
-        default=True,
-        description="Preserve JavaScript data during cleaning"
-    )
-
-    # Content preservation
-    preserve_images: bool = Field(
-        default=False,
-        description="Preserve image tags"
-    )
-    preserve_links: bool = Field(
-        default=True,
-        description="Preserve link tags"
-    )
-    preserve_forms: bool = Field(
-        default=False,
-        description="Preserve form elements"
-    )
-
-    # Size limits
-    max_html_size: int = Field(
-        default=1000000,
-        ge=1000,
-        le=10000000,
-        description="Maximum HTML size in characters"
-    )
-    max_text_length: int = Field(
-        default=300,
-        ge=50,
-        le=1000,
-        description="Maximum text content length per element"
-    )
-    max_url_length: int = Field(
-        default=500,
-        ge=100,
-        le=2000,
-        description="Maximum URL length"
-    )
-
-    # Noise removal
-    remove_comments: bool = Field(
-        default=True,
-        description="Remove HTML comments"
-    )
-    remove_scripts: bool = Field(
-        default=True,
-        description="Remove script tags"
-    )
-    remove_styles: bool = Field(
-        default=True,
-        description="Remove style tags"
-    )
-    remove_tracking: bool = Field(
-        default=True,
-        description="Remove tracking URLs and attributes"
-    )
-
-    # Whitespace handling
-    normalize_whitespace: bool = Field(
-        default=True,
-        description="Normalize whitespace"
-    )
-    remove_empty_elements: bool = Field(
-        default=True,
-        description="Remove empty elements"
-    )
-
-    # Custom selectors
-    noise_selectors: List[str] = Field(
-        default_factory=lambda: [
-            '[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]',
-            '[class*="footer"]', '[class*="header"]', '[class*="ads"]',
-            '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'
-        ],
-        description="CSS selectors for noise elements to remove"
-    )
+from .config import HTMLCleaningConfig
 
 
 class HTMLCleaningStats(BaseModel):
     """HTML cleaning statistics"""
-
-
-
-    )
-
+
+    model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
     original_size_bytes: int = Field(ge=0)
     cleaned_size_bytes: int = Field(ge=0)
     size_reduction_bytes: int = Field(ge=0)
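
The hunk above moves `HTMLCleaningConfig` out of this module into the new `html_analyzer/config.py` (hence the `from .config import HTMLCleaningConfig` import). A minimal sketch of constructing the config under that layout; the field names are the ones visible in the removed 1.1.1 definition and are assumed to carry over to the new module:

```python
# Sketch only: the import path follows the new html_analyzer/config.py module;
# field names are taken from the removed 1.1.1 definition and assumed stable.
from unrealon_driver.html_analyzer.config import HTMLCleaningConfig

config = HTMLCleaningConfig(
    preserve_js_data=True,   # keep SSR/structured JSON found in <script> tags
    preserve_forms=False,    # form elements are treated as noise
    max_html_size=500_000,   # character cap applied before parsing (1,000..10,000,000)
    max_text_length=300,     # per-element text truncation limit (50..1,000)
    max_url_length=500,      # href/src values longer than this get truncated
)
```
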
@@ -130,297 +35,296 @@ class HTMLCleaningStats(BaseModel):
|
|
|
130
35
|
|
|
131
36
|
class ExtractedJSData(BaseModel):
|
|
132
37
|
"""Extracted JavaScript data structure"""
|
|
133
|
-
model_config = ConfigDict(
|
|
134
|
-
validate_assignment=True,
|
|
135
|
-
extra="forbid"
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
ssr_data: dict[str, str] = Field(default_factory=dict)
|
|
139
|
-
structured_data: List[dict[str, str]] = Field(default_factory=list)
|
|
140
|
-
raw_extracts: List[dict[str, str]] = Field(default_factory=list)
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
class HTMLManagerError(Exception):
|
|
144
|
-
"""Base exception for HTML manager"""
|
|
145
|
-
def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
|
|
146
|
-
self.message = message
|
|
147
|
-
self.operation = operation
|
|
148
|
-
self.details = details or {}
|
|
149
|
-
super().__init__(message)
|
|
150
38
|
|
|
39
|
+
model_config = ConfigDict(validate_assignment=True, extra="forbid")
|
|
151
40
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
41
|
+
ssr_data: Dict[str, Any] = Field(default_factory=dict)
|
|
42
|
+
structured_data: List[Dict[str, Any]] = Field(default_factory=list)
|
|
43
|
+
analytics_data: Dict[str, Any] = Field(default_factory=dict)
|
|
44
|
+
product_data: Dict[str, Any] = Field(default_factory=dict)
|
|
45
|
+
raw_extracts: List[Dict[str, Any]] = Field(default_factory=list)
|
|
155
46
|
|
|
156
47
|
|
|
157
|
-
class HTMLCleaningError(
|
|
48
|
+
class HTMLCleaningError(Exception):
|
|
158
49
|
"""Raised when HTML cleaning fails"""
|
|
159
|
-
|
|
50
|
+
|
|
51
|
+
def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
|
|
52
|
+
self.message = message
|
|
53
|
+
self.operation = operation
|
|
54
|
+
self.details = details or {}
|
|
55
|
+
super().__init__(message)
|
|
160
56
|
|
|
161
57
|
|
|
162
|
-
class
|
|
58
|
+
class HTMLCleaner:
|
|
163
59
|
"""
|
|
164
|
-
🧹 HTML
|
|
165
|
-
|
|
60
|
+
🧹 Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization
|
|
61
|
+
|
|
166
62
|
Features:
|
|
167
|
-
-
|
|
168
|
-
-
|
|
169
|
-
-
|
|
170
|
-
-
|
|
171
|
-
-
|
|
172
|
-
-
|
|
63
|
+
- Removes noise (scripts, styles, comments)
|
|
64
|
+
- Preserves useful JavaScript data (JSON objects, SSR data)
|
|
65
|
+
- Cleans whitespace and formatting
|
|
66
|
+
- Maintains semantic structure
|
|
67
|
+
- Extracts and preserves Next.js/Nuxt.js SSR data
|
|
68
|
+
- Optimizes for LLM token efficiency
|
|
173
69
|
"""
|
|
174
|
-
|
|
175
|
-
def __init__(self, config: Optional[HTMLCleaningConfig] = None):
|
|
70
|
+
|
|
71
|
+
def __init__(self, parser_id: str, config: Optional[HTMLCleaningConfig] = None):
|
|
176
72
|
self.config = config or HTMLCleaningConfig()
|
|
177
|
-
|
|
178
|
-
|
|
73
|
+
|
|
74
|
+
# Initialize smart logger
|
|
75
|
+
self.parser_id = parser_id
|
|
76
|
+
self.logger = create_smart_logger(parser_id=self.parser_id)
|
|
77
|
+
|
|
78
|
+
# Tags to completely remove
|
|
79
|
+
self.noise_tags = {"script", "style", "meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
|
|
80
|
+
|
|
81
|
+
# Add conditional tags based on config
|
|
82
|
+
if not self.config.preserve_forms:
|
|
83
|
+
self.noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
|
|
84
|
+
|
|
85
|
+
# Universal noise selectors to remove (for any site)
|
|
86
|
+
self.universal_noise_selectors = [
|
|
87
|
+
'[id*="nav"]',
|
|
88
|
+
'[class*="nav"]', # Navigation
|
|
89
|
+
'[id*="menu"]',
|
|
90
|
+
'[class*="menu"]', # Menus
|
|
91
|
+
'[id*="sidebar"]',
|
|
92
|
+
'[class*="sidebar"]', # Sidebars
|
|
93
|
+
'[id*="footer"]',
|
|
94
|
+
'[class*="footer"]', # Footers
|
|
95
|
+
'[id*="header"]',
|
|
96
|
+
'[class*="header"]', # Headers
|
|
97
|
+
'[class*="ads"]',
|
|
98
|
+
'[class*="advertisement"]', # Ads
|
|
99
|
+
'[class*="sponsored"]',
|
|
100
|
+
'[class*="promo"]', # Sponsored content
|
|
101
|
+
'[class*="popup"]',
|
|
102
|
+
'[class*="modal"]', # Popups/modals
|
|
103
|
+
'[class*="overlay"]',
|
|
104
|
+
'[class*="tooltip"]', # Overlays
|
|
105
|
+
'[class*="cookie"]',
|
|
106
|
+
'[class*="gdpr"]', # Cookie notices
|
|
107
|
+
'[class*="newsletter"]',
|
|
108
|
+
'[class*="subscription"]', # Email signup
|
|
109
|
+
'[class*="social"]',
|
|
110
|
+
'[class*="share"]', # Social media
|
|
111
|
+
'[class*="comment"]',
|
|
112
|
+
'[class*="discussion"]', # Comments
|
|
113
|
+
'[class*="tracking"]',
|
|
114
|
+
'[class*="analytics"]', # Tracking
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
# Attributes to keep (semantic ones)
|
|
118
|
+
self.keep_attributes = {"id", "class", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role", "alt", "title", "href", "src", "action", "name", "value", "placeholder", "type"}
|
|
119
|
+
|
|
179
120
|
# Compile regex patterns for performance
|
|
180
121
|
self._compile_patterns()
|
|
181
|
-
|
|
122
|
+
|
|
182
123
|
def _compile_patterns(self) -> None:
|
|
183
124
|
"""Compile regex patterns for performance"""
|
|
184
|
-
#
|
|
125
|
+
# URL patterns to remove or shorten (for tracking/analytics)
|
|
185
126
|
self.tracking_url_patterns = [
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
127
|
+
r'https://aax-[^\s"]{200,}', # Amazon tracking URLs over 200 chars
|
|
128
|
+
r'https://[^\s"]*tracking[^\s"]{100,}', # General tracking URLs
|
|
129
|
+
r'https://[^\s"]*analytics[^\s"]{100,}', # Analytics URLs
|
|
130
|
+
r'https://[^\s"]*gtm[^\s"]{100,}', # Google Tag Manager URLs
|
|
190
131
|
]
|
|
191
|
-
|
|
192
|
-
# Base64 patterns
|
|
132
|
+
|
|
133
|
+
# Base64 patterns to remove or replace
|
|
193
134
|
self.base64_patterns = [
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
135
|
+
r"data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}", # Base64 images over 50 chars
|
|
136
|
+
r"data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}", # Base64 applications
|
|
137
|
+
r"data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}", # Base64 text
|
|
197
138
|
]
|
|
198
|
-
|
|
199
|
-
# JavaScript data
|
|
200
|
-
self.
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
139
|
+
|
|
140
|
+
# Patterns to detect valuable JavaScript data
|
|
141
|
+
self.useful_js_patterns = [
|
|
142
|
+
# Next.js/Nuxt.js SSR data
|
|
143
|
+
r"__NEXT_DATA__\s*=\s*(\{.+?\});?",
|
|
144
|
+
r"__NUXT__\s*=\s*(\{.+?\});?",
|
|
145
|
+
r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?",
|
|
146
|
+
# React/Vue hydration data
|
|
147
|
+
r"window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?",
|
|
148
|
+
r"window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?",
|
|
149
|
+
# E-commerce data
|
|
150
|
+
r"window\.productData\s*=\s*(\{.+?\});?",
|
|
151
|
+
r"window\.cartData\s*=\s*(\{.+?\});?",
|
|
152
|
+
r"dataLayer\s*=\s*(\[.+?\]);?",
|
|
153
|
+
# Analytics and tracking (structured data)
|
|
154
|
+
r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
|
|
155
|
+
# JSON-LD structured data (often in script tags)
|
|
156
|
+
r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
|
|
157
|
+
# Generic JSON objects (be more selective)
|
|
158
|
+
r"(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?",
|
|
205
159
|
]
|
|
206
|
-
|
|
160
|
+
|
|
161
|
+
# Compiled regex patterns for efficiency
|
|
162
|
+
self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE) for pattern in self.useful_js_patterns]
|
|
163
|
+
|
|
207
164
|
# ==========================================
|
|
208
165
|
# MAIN CLEANING METHODS
|
|
209
166
|
# ==========================================
|
|
210
|
-
|
|
211
|
-
async def clean_html(
|
|
212
|
-
self,
|
|
213
|
-
html: str,
|
|
214
|
-
aggressive: Optional[bool] = None,
|
|
215
|
-
preserve_js_data: Optional[bool] = None
|
|
216
|
-
) -> str:
|
|
167
|
+
|
|
168
|
+
async def clean_html(self, html_content: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, Dict[str, Any]]:
|
|
217
169
|
"""
|
|
218
|
-
Clean HTML content
|
|
219
|
-
|
|
170
|
+
Clean HTML content while preserving valuable data
|
|
171
|
+
|
|
220
172
|
Args:
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
173
|
+
html_content: Raw HTML content
|
|
174
|
+
preserve_js_data: Whether to extract and preserve JS data
|
|
175
|
+
aggressive_cleaning: Whether to apply more aggressive cleaning
|
|
176
|
+
|
|
225
177
|
Returns:
|
|
226
|
-
|
|
178
|
+
Tuple of (cleaned_html, extracted_data)
|
|
227
179
|
"""
|
|
228
|
-
if not
|
|
229
|
-
return ""
|
|
230
|
-
|
|
231
|
-
# Use config defaults or overrides
|
|
232
|
-
aggressive_cleaning = aggressive if aggressive is not None else self.config.aggressive_cleaning
|
|
233
|
-
preserve_js = preserve_js_data if preserve_js_data is not None else self.config.preserve_js_data
|
|
234
|
-
|
|
180
|
+
if not html_content or not html_content.strip():
|
|
181
|
+
return "", {}
|
|
182
|
+
|
|
235
183
|
try:
|
|
236
|
-
self.logger.info(f"Cleaning HTML: {len(
|
|
237
|
-
|
|
184
|
+
self.logger.info(f"🧹 Cleaning HTML: {len(html_content)} characters")
|
|
185
|
+
|
|
238
186
|
# Check size limits
|
|
239
|
-
if len(
|
|
240
|
-
self.logger.warning(f"HTML size ({len(
|
|
241
|
-
|
|
242
|
-
|
|
187
|
+
if len(html_content) > self.config.max_html_size:
|
|
188
|
+
self.logger.warning(f"⚠️ HTML size ({len(html_content)}) exceeds limit ({self.config.max_html_size}), truncating")
|
|
189
|
+
html_content = html_content[: self.config.max_html_size]
|
|
190
|
+
|
|
243
191
|
# Parse HTML
|
|
244
|
-
soup = BeautifulSoup(
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
192
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
193
|
+
|
|
194
|
+
extracted_data = {}
|
|
195
|
+
|
|
196
|
+
# Extract valuable JavaScript data before removing scripts
|
|
197
|
+
if preserve_js_data:
|
|
249
198
|
extracted_data = self._extract_js_data(soup)
|
|
250
|
-
|
|
251
|
-
#
|
|
199
|
+
|
|
200
|
+
# Remove universal noise elements for aggressive cleaning
|
|
252
201
|
if aggressive_cleaning:
|
|
253
|
-
self.
|
|
254
|
-
|
|
255
|
-
self.
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
202
|
+
self._remove_universal_noise(soup)
|
|
203
|
+
self._truncate_long_urls(soup) # Do this before tracking URL cleaning
|
|
204
|
+
self._clean_tracking_urls(soup)
|
|
205
|
+
self._clean_base64_data(soup)
|
|
206
|
+
self._remove_long_attributes(soup)
|
|
207
|
+
self._remove_html_comments(soup)
|
|
208
|
+
self._clean_whitespace(soup)
|
|
209
|
+
|
|
210
|
+
# Remove noise elements
|
|
211
|
+
self._remove_noise_elements(soup)
|
|
212
|
+
|
|
213
|
+
# Clean attributes
|
|
214
|
+
self._clean_attributes(soup, aggressive_cleaning)
|
|
215
|
+
|
|
216
|
+
# Remove comments
|
|
217
|
+
self._remove_comments(soup)
|
|
218
|
+
|
|
219
|
+
# Clean text and whitespace
|
|
220
|
+
cleaned_html = self._clean_text_and_whitespace(soup)
|
|
221
|
+
|
|
260
222
|
# Final cleanup
|
|
261
223
|
cleaned_html = self._final_cleanup(cleaned_html)
|
|
262
|
-
|
|
224
|
+
|
|
263
225
|
# Log results
|
|
264
|
-
original_size = len(
|
|
226
|
+
original_size = len(html_content)
|
|
265
227
|
cleaned_size = len(cleaned_html)
|
|
266
228
|
reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
|
|
267
|
-
|
|
268
|
-
self.logger.info(
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
return cleaned_html
|
|
274
|
-
|
|
229
|
+
|
|
230
|
+
self.logger.info(f"✅ HTML cleaned: {original_size} → {cleaned_size} chars " f"({reduction:.1f}% reduction)")
|
|
231
|
+
|
|
232
|
+
return cleaned_html, extracted_data
|
|
233
|
+
|
|
275
234
|
except Exception as e:
|
|
276
|
-
self.logger.error(f"HTML cleaning failed: {e}")
|
|
277
|
-
raise HTMLCleaningError(
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
details={"html_size": str(len(html))}
|
|
281
|
-
) from e
|
|
282
|
-
|
|
283
|
-
def clean_html_sync(self, html: str, **kwargs) -> str:
|
|
235
|
+
self.logger.error(f"❌ HTML cleaning failed: {e}")
|
|
236
|
+
raise HTMLCleaningError(message=f"Failed to clean HTML: {e}", operation="clean_html", details={"html_size": str(len(html_content))}) from e
|
|
237
|
+
|
|
238
|
+
def clean_html_sync(self, html_content: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
|
|
284
239
|
"""
|
|
285
240
|
Synchronous HTML cleaning
|
|
286
|
-
|
|
241
|
+
|
|
287
242
|
Args:
|
|
288
|
-
|
|
243
|
+
html_content: Raw HTML content
|
|
289
244
|
**kwargs: Cleaning options
|
|
290
|
-
|
|
245
|
+
|
|
291
246
|
Returns:
|
|
292
|
-
|
|
247
|
+
Tuple of (cleaned_html, extracted_data)
|
|
293
248
|
"""
|
|
294
249
|
# Handle running event loop
|
|
295
250
|
try:
|
|
296
251
|
loop = asyncio.get_running_loop()
|
|
297
252
|
# If we're in an event loop, create a new thread
|
|
298
253
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
299
|
-
future = executor.submit(asyncio.run, self.clean_html(
|
|
254
|
+
future = executor.submit(asyncio.run, self.clean_html(html_content, **kwargs))
|
|
300
255
|
return future.result()
|
|
301
256
|
except RuntimeError:
|
|
302
257
|
# No event loop running, safe to use asyncio.run
|
|
303
|
-
return asyncio.run(self.clean_html(
|
|
304
|
-
|
|
305
|
-
async def parse_and_clean_html(
|
|
306
|
-
self,
|
|
307
|
-
html: str,
|
|
308
|
-
schema: Optional[dict[str, str]] = None,
|
|
309
|
-
instructions: Optional[str] = None,
|
|
310
|
-
**kwargs
|
|
311
|
-
) -> dict[str, str]:
|
|
312
|
-
"""
|
|
313
|
-
Parse and clean HTML with LLM analysis preparation
|
|
314
|
-
|
|
315
|
-
Args:
|
|
316
|
-
html: Raw HTML content
|
|
317
|
-
schema: Optional data schema for extraction
|
|
318
|
-
instructions: Optional parsing instructions
|
|
319
|
-
**kwargs: Additional options
|
|
320
|
-
|
|
321
|
-
Returns:
|
|
322
|
-
Dictionary with cleaned HTML and metadata
|
|
323
|
-
"""
|
|
324
|
-
try:
|
|
325
|
-
# Clean HTML
|
|
326
|
-
cleaned_html = await self.clean_html(html, **kwargs)
|
|
327
|
-
|
|
328
|
-
# Get cleaning stats
|
|
329
|
-
stats = self.get_cleaning_stats(html, cleaned_html)
|
|
330
|
-
|
|
331
|
-
result = {
|
|
332
|
-
"cleaned_html": cleaned_html,
|
|
333
|
-
"original_size": str(stats.original_size_bytes),
|
|
334
|
-
"cleaned_size": str(stats.cleaned_size_bytes),
|
|
335
|
-
"reduction_percent": f"{stats.size_reduction_percent:.1f}",
|
|
336
|
-
"estimated_token_savings": str(stats.estimated_token_savings)
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
if schema:
|
|
340
|
-
result["schema"] = str(schema)
|
|
341
|
-
if instructions:
|
|
342
|
-
result["instructions"] = instructions
|
|
343
|
-
|
|
344
|
-
return result
|
|
345
|
-
|
|
346
|
-
except Exception as e:
|
|
347
|
-
raise HTMLCleaningError(
|
|
348
|
-
message=f"Failed to parse and clean HTML: {e}",
|
|
349
|
-
operation="parse_and_clean_html"
|
|
350
|
-
) from e
|
|
351
|
-
|
|
258
|
+
return asyncio.run(self.clean_html(html_content, **kwargs))
|
|
259
|
+
|
|
352
260
|
# ==========================================
|
|
353
261
|
# CLEANING IMPLEMENTATION
|
|
354
262
|
# ==========================================
|
|
355
|
-
|
|
263
|
+
|
|
356
264
|
def _standard_cleaning(self, soup: BeautifulSoup) -> None:
|
|
357
265
|
"""Apply standard cleaning"""
|
|
358
266
|
# Remove noise elements
|
|
359
267
|
self._remove_noise_elements(soup)
|
|
360
|
-
|
|
268
|
+
|
|
361
269
|
# Clean attributes
|
|
362
270
|
self._clean_attributes(soup)
|
|
363
|
-
|
|
271
|
+
|
|
364
272
|
# Remove comments
|
|
365
273
|
if self.config.remove_comments:
|
|
366
274
|
self._remove_comments(soup)
|
|
367
|
-
|
|
275
|
+
|
|
368
276
|
# Normalize whitespace
|
|
369
277
|
if self.config.normalize_whitespace:
|
|
370
278
|
self._normalize_whitespace(soup)
|
|
371
|
-
|
|
279
|
+
|
|
372
280
|
def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
|
|
373
281
|
"""Apply aggressive cleaning"""
|
|
374
282
|
# Standard cleaning first
|
|
375
283
|
self._standard_cleaning(soup)
|
|
376
|
-
|
|
284
|
+
|
|
377
285
|
# Remove noise selectors
|
|
378
286
|
self._remove_noise_selectors(soup)
|
|
379
|
-
|
|
287
|
+
|
|
380
288
|
# Clean tracking URLs
|
|
381
289
|
if self.config.remove_tracking:
|
|
382
290
|
self._clean_tracking_urls(soup)
|
|
383
|
-
|
|
291
|
+
|
|
384
292
|
# Clean base64 data
|
|
385
293
|
self._clean_base64_data(soup)
|
|
386
|
-
|
|
294
|
+
|
|
387
295
|
# Truncate long URLs
|
|
388
296
|
self._truncate_long_urls(soup)
|
|
389
|
-
|
|
297
|
+
|
|
390
298
|
# Remove long attributes
|
|
391
299
|
self._remove_long_attributes(soup)
|
|
392
|
-
|
|
300
|
+
|
|
393
301
|
# Truncate long text
|
|
394
302
|
self._truncate_long_text(soup)
|
|
395
|
-
|
|
303
|
+
|
|
396
304
|
def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
|
|
397
305
|
"""Remove noise HTML elements"""
|
|
398
306
|
# Define noise tags
|
|
399
|
-
noise_tags = {
|
|
400
|
-
|
|
401
|
-
'iframe', 'embed', 'object', 'svg', 'canvas',
|
|
402
|
-
'audio', 'video', 'source', 'track', 'area', 'map', 'param'
|
|
403
|
-
}
|
|
404
|
-
|
|
307
|
+
noise_tags = {"meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
|
|
308
|
+
|
|
405
309
|
# Add conditional tags
|
|
406
310
|
if self.config.remove_scripts:
|
|
407
|
-
noise_tags.add(
|
|
311
|
+
noise_tags.add("script")
|
|
408
312
|
if self.config.remove_styles:
|
|
409
|
-
noise_tags.add(
|
|
313
|
+
noise_tags.add("style")
|
|
410
314
|
if not self.config.preserve_forms:
|
|
411
|
-
noise_tags.update({
|
|
412
|
-
|
|
315
|
+
noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
|
|
316
|
+
|
|
413
317
|
# Remove noise tags
|
|
414
318
|
for tag_name in noise_tags:
|
|
415
319
|
for tag in soup.find_all(tag_name):
|
|
416
320
|
tag.decompose()
|
|
417
|
-
|
|
321
|
+
|
|
418
322
|
# Remove empty elements
|
|
419
323
|
if self.config.remove_empty_elements:
|
|
420
|
-
for tag in soup.find_all([
|
|
324
|
+
for tag in soup.find_all(["div", "span", "p"]):
|
|
421
325
|
if not tag.get_text(strip=True) and not tag.find_all():
|
|
422
326
|
tag.decompose()
|
|
423
|
-
|
|
327
|
+
|
|
424
328
|
def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
|
|
425
329
|
"""Remove elements matching noise selectors"""
|
|
426
330
|
for selector in self.config.noise_selectors:
|
|
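
`clean_html_sync` in the hunk above has to work both from plain synchronous code and from code already inside a running event loop, where `asyncio.run()` raises `RuntimeError`. A self-contained sketch of that probe-and-fallback pattern; `_work` is a stand-in for the real cleaning coroutine:

```python
import asyncio
import concurrent.futures

async def _work() -> str:
    await asyncio.sleep(0)  # stand-in for the real async cleaning
    return "cleaned"

def work_sync() -> str:
    try:
        # Raises RuntimeError when no loop is running in this thread.
        asyncio.get_running_loop()
        # A loop is already running, so asyncio.run() is illegal here;
        # run the coroutine on a fresh loop in a worker thread instead.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(asyncio.run, _work()).result()
    except RuntimeError:
        # No running loop in this thread: asyncio.run() is safe.
        return asyncio.run(_work())

print(work_sync())  # -> "cleaned"
```
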
@@ -431,78 +335,96 @@ class HTMLManager:
         except Exception:
             # Skip invalid selectors
             continue
-
+
     def _clean_attributes(self, soup: BeautifulSoup) -> None:
         """Clean HTML attributes"""
         # Attributes to remove
         noise_attributes = {
-
-
-
-
-
-
+            "style",
+            "onclick",
+            "onload",
+            "onchange",
+            "onmouseover",
+            "onmouseout",
+            "onfocus",
+            "onblur",
+            "onsubmit",
+            "onreset",
+            "onerror",
+            "onabort",
+            "autocomplete",
+            "autofocus",
+            "checked",
+            "defer",
+            "disabled",
+            "hidden",
+            "loop",
+            "multiple",
+            "muted",
+            "open",
+            "readonly",
+            "required",
+            "tabindex",
+            "translate",
+            "draggable",
+            "contenteditable",
         }
-
+
         # Attributes to keep
-        keep_attributes = {
-
-            'data-testid', 'data-test', 'data-cy',
-            'aria-label', 'aria-labelledby', 'aria-describedby', 'role'
-        }
-
+        keep_attributes = {"id", "class", "href", "src", "alt", "title", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role"}
+
         for tag in soup.find_all(True):
-            if hasattr(tag,
+            if hasattr(tag, "attrs"):
                 # Remove unwanted attributes
                 attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
                 for attr in attrs_to_remove:
                     if attr in noise_attributes:
                         del tag.attrs[attr]
-
+
     def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
         """Remove or replace tracking URLs"""
         # Clean href attributes
-        for tag in soup.find_all([
-            href = tag.get(
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if href:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(href):
-                        tag[
+                        tag["href"] = "#tracking-url-removed"
                         break
-
+
         # Clean src attributes
-        for tag in soup.find_all([
-            src = tag.get(
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(src):
-                        tag[
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _clean_base64_data(self, soup: BeautifulSoup) -> None:
         """Remove large base64 encoded data"""
-        for tag in soup.find_all([
-            src = tag.get(
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.base64_patterns:
                     if pattern.search(src):
-                        tag[
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
         """Truncate URLs longer than max_url_length"""
         max_length = self.config.max_url_length
-
-        for tag in soup.find_all([
-            href = tag.get(
+
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if isinstance(href, str) and len(href) > max_length:
-                tag[
-
-        for tag in soup.find_all([
-            src = tag.get(
-            if isinstance(src, str) and len(src) > max_length and not src.startswith(
-                tag[
-
+                tag["href"] = href[:max_length] + "...truncated"
+
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
+            if isinstance(src, str) and len(src) > max_length and not src.startswith("data:"):
+                tag["src"] = src[:max_length] + "...truncated"
+
     def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
         """Remove attributes with extremely long values"""
         for tag in soup.find_all():
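
Note the double filter in `_clean_attributes` above: an attribute is deleted only when it is outside `keep_attributes` and also listed in `noise_attributes`, so attributes in neither set (for example `data-foo`) pass through untouched. A standalone illustration of that semantics, with invented attribute sets and markup:

```python
from bs4 import BeautifulSoup

keep_attributes = {"id", "class", "href"}
noise_attributes = {"style", "onclick"}

soup = BeautifulSoup(
    '<a id="x" style="color:red" onclick="t()" data-foo="1" href="/p">p</a>',
    "html.parser",
)
for tag in soup.find_all(True):
    for attr in set(tag.attrs) - keep_attributes:  # iterate over a copy
        if attr in noise_attributes:
            del tag.attrs[attr]  # style and onclick are dropped

print(soup)  # id, data-foo, and href survive; style and onclick are gone
```
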
@@ -510,96 +432,95 @@ class HTMLManager:
             for attr, value in tag.attrs.items():
                 if isinstance(value, str) and len(value) > 800:
                     attrs_to_remove.append(attr)
-                elif any(tracking in attr.lower() for tracking in
-                         ['tracking', 'analytics', 'gtm', 'pixel']):
+                elif any(tracking in attr.lower() for tracking in ["tracking", "analytics", "gtm", "pixel"]):
                     attrs_to_remove.append(attr)
-
+
             for attr in attrs_to_remove:
                 del tag.attrs[attr]
-
+
     def _truncate_long_text(self, soup: BeautifulSoup) -> None:
         """Truncate text content longer than max_text_length"""
         max_length = self.config.max_text_length
-
+
         for element in soup.find_all(text=True):
-            if element.parent.name not in [
+            if element.parent.name not in ["script", "style"]:
                 text_content = str(element).strip()
                 if text_content and len(text_content) > max_length:
-                    truncated_text = text_content[:max_length] +
+                    truncated_text = text_content[:max_length] + "..."
                     element.replace_with(truncated_text)
-
+
     def _remove_comments(self, soup: BeautifulSoup) -> None:
         """Remove HTML comments"""
         for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
             comment.extract()
-
+
     def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
         """Normalize whitespace in text content"""
         for element in soup.find_all(text=True):
-            if element.parent.name not in [
+            if element.parent.name not in ["script", "style"]:
                 # Replace multiple spaces with single space
-                cleaned_text = re.sub(r
+                cleaned_text = re.sub(r" {3,}", " ", str(element))
                 # Replace multiple newlines with maximum 2
-                cleaned_text = re.sub(r
+                cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)
                 # Replace multiple tabs with single space
-                cleaned_text = re.sub(r
+                cleaned_text = re.sub(r"\t+", " ", cleaned_text)
                 element.replace_with(cleaned_text)
-
+
     def _final_cleanup(self, html: str) -> str:
         """Final cleanup and optimization"""
         # Remove empty attributes
-        html = re.sub(r'\s+\w+=""',
-
+        html = re.sub(r'\s+\w+=""', "", html)
+
         # Remove extra spaces in attributes
         html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
-
+
         # Normalize quotes
         html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
-
+
         # Remove trailing spaces before closing tags
-        html = re.sub(r
-
+        html = re.sub(r"\s+(/?>)", r"\1", html)
+
         # Advanced whitespace cleanup
         html = self._advanced_whitespace_cleanup(html)
-
+
         return html.strip()
-
+
     def _advanced_whitespace_cleanup(self, html: str) -> str:
         """Advanced whitespace cleanup"""
         # Remove excessive spaces
-        html = re.sub(r
-
+        html = re.sub(r" {3,}", " ", html)
+
         # Remove excessive newlines
-        html = re.sub(r
-
+        html = re.sub(r"\n{3,}", "\n\n", html)
+
         # Clean space between tags
-        html = re.sub(r
-
+        html = re.sub(r">\s{2,}<", "> <", html)
+
         return html
-
+
     # ==========================================
     # JAVASCRIPT DATA EXTRACTION
     # ==========================================
-
+
     def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
         """Extract valuable JavaScript data"""
         extracted_data = ExtractedJSData()
-
+
         # Find all script tags
-        script_tags = soup.find_all(
-
+        script_tags = soup.find_all("script")
+
         for script in script_tags:
             if not script.string:
                 continue
-
+
             script_content = script.string.strip()
-
+
             # Skip empty scripts
             if len(script_content) < 10:
                 continue
-
+
             # Check for JSON-LD structured data
-            if script.get(
+            if script.get("type") == "application/ld+json":
                 try:
                     json_data = json.loads(script_content)
                     # Convert to string dict for Pydantic compliance
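
The `_final_cleanup` and `_advanced_whitespace_cleanup` passes above are plain string rewrites, so they are easy to check in isolation. A short trace over an invented fragment, using four of the hunk's substitutions:

```python
import re

html = '<div   class="a"   >text</div>\n\n\n\n<span p="">   ok</span>'
html = re.sub(r'\s+\w+=""', "", html)    # drop empty attributes (p="")
html = re.sub(r"\s+(/?>)", r"\1", html)  # no spaces before > or />
html = re.sub(r" {3,}", " ", html)       # collapse runs of 3+ spaces
html = re.sub(r"\n{3,}", "\n\n", html)   # cap consecutive newlines at two
print(html)  # <div class="a">text</div>\n\n<span> ok</span>
```
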
@@ -608,56 +529,56 @@ class HTMLManager:
                     continue
                 except json.JSONDecodeError:
                     pass
-
+
             # Extract data using patterns
             self._extract_with_patterns(script_content, extracted_data)
-
+
         return extracted_data
-
+
     def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
         """Extract data using compiled regex patterns"""
         for pattern in self.js_data_patterns:
             matches = pattern.finditer(script_content)
             for match in matches:
                 self._try_parse_json(match.group(1), extracted_data)
-
+
     def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
         """Try to parse JSON string and add to extracted data"""
         try:
             json_data = json.loads(json_str)
-
+
             if isinstance(json_data, dict):
                 # Convert to string dict for Pydantic compliance
                 str_data = {}
                 for k, v in json_data.items():
                     if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
                         str_data[str(k)] = str(v)
-
+
                 if str_data:
                     extracted_data.ssr_data.update(str_data)
-
+
         except json.JSONDecodeError:
             # Skip invalid JSON
             pass
-
+
     # ==========================================
     # UTILITY METHODS
     # ==========================================
-
+
     def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
         """Get statistics about the cleaning process"""
         original_size = len(original_html)
         cleaned_size = len(cleaned_html)
-
+
         # Estimate token reduction (rough approximation)
         original_tokens = original_size // 4  # Rough estimate: 4 chars per token
         cleaned_tokens = cleaned_size // 4
-
+
         size_reduction = original_size - cleaned_size
         size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
         token_savings = original_tokens - cleaned_tokens
         token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
-
+
         return HTMLCleaningStats(
             original_size_bytes=original_size,
             cleaned_size_bytes=cleaned_size,
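
The extraction path above (`_extract_with_patterns` → `_try_parse_json`) runs each compiled pattern over a script body and JSON-decodes capture group 1. One caveat visible in the patterns themselves: the lazy `\{.+?\}` stops at the first closing brace, so a nested payload captures invalid JSON and is skipped by `_try_parse_json`. A demonstration with fabricated script bodies:

```python
import json
import re

# Same pattern and flags as the __NEXT_DATA__ entry in _compile_patterns.
pattern = re.compile(r"__NEXT_DATA__\s*=\s*(\{.+?\});?", re.DOTALL | re.IGNORECASE)

flat = 'window.__NEXT_DATA__ = {"page": "/product", "buildId": "abc"};'
nested = 'window.__NEXT_DATA__ = {"props": {"id": 1}};'

for script_content in (flat, nested):
    for match in pattern.finditer(script_content):
        try:
            print(json.loads(match.group(1)))  # flat payload parses
        except json.JSONDecodeError:
            print("skipped")  # nested one is cut short at the first brace
```
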
@@ -666,15 +587,15 @@ class HTMLManager:
             estimated_original_tokens=original_tokens,
             estimated_cleaned_tokens=cleaned_tokens,
             estimated_token_savings=token_savings,
-            estimated_token_savings_percent=token_savings_percent
+            estimated_token_savings_percent=token_savings_percent,
         )
-
+
     def update_config(self, **kwargs) -> None:
         """Update configuration with new values"""
         current_data = self.config.model_dump()
         current_data.update(kwargs)
         self.config = HTMLCleaningConfig.model_validate(current_data)
-
+
         # Recompile patterns if needed
         self._compile_patterns()
 
@@ -683,50 +604,54 @@ class HTMLManager:
     # CONVENIENCE FUNCTIONS
     # ==========================================
 
-
+
+def create_html_cleaner(parser_id: str, config: Optional[HTMLCleaningConfig] = None) -> HTMLCleaner:
     """
-
-
+    Create an HTML cleaner instance
+
     Args:
         config: Optional HTML cleaning configuration
-
+        parser_id: Parser identifier for logging
+
     Returns:
-        Configured
+        Configured HTMLCleaner instance
     """
-    return
+    return HTMLCleaner(parser_id=parser_id, config=config)
 
 
-async def quick_clean_html(html: str, **kwargs) -> str:
+async def quick_clean_html(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    manager = get_html_manager(config)
-    return await manager.clean_html(html, **kwargs)
 
+    cleaner = create_html_cleaner(parser_id, config)
+    return await cleaner.clean_html(html, **kwargs)
 
-
+
+def quick_clean_html_sync(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick synchronous HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-
-    return
+
+    cleaner = create_html_cleaner(parser_id, config)
+    return cleaner.clean_html_sync(html, **kwargs)