unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.0.dist-info/METADATA +164 -0
- unrealon-1.1.0.dist-info/RECORD +82 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
- unrealon-1.1.0.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.9.dist-info/METADATA +0 -810
- unrealon-1.0.9.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
- unrealon_llm/src/utils/html_cleaner.py

@@ -1,767 +0,0 @@
-"""
-Smart HTML Cleaner
-
-Intelligent HTML cleaning that removes noise but preserves useful data.
-Optimizes HTML for LLM token efficiency while keeping valuable content.
-"""
-
-import json
-import re
-from typing import Dict, List, Optional, Tuple
-
-from bs4 import BeautifulSoup, Comment
-
-from unrealon_llm.src.exceptions import ValidationError
-
-
-class SmartHTMLCleaner:
-    """
-    Intelligent HTML cleaner that optimizes for LLM analysis
-
-    Features:
-    - Removes noise (scripts, styles, comments)
-    - Preserves useful JavaScript data (JSON objects, SSR data)
-    - Cleans whitespace and formatting
-    - Maintains semantic structure
-    - Extracts and preserves Next.js/Nuxt.js SSR data
-    """
-
-    def __init__(self):
-        """Initialize the HTML cleaner"""
-        # Tags to completely remove
-        self.noise_tags = {
-            'script', 'style', 'meta', 'link', 'base', 'title',
-            'head', 'noscript', 'iframe', 'embed', 'object',
-            'svg', 'canvas', 'audio', 'video', 'source',
-            'track', 'area', 'map', 'param', 'form', 'input',
-            'button', 'select', 'textarea', 'fieldset', 'legend'
-        }
-
-        # URL patterns to remove or shorten (for tracking/analytics)
-        self.tracking_url_patterns = [
-            r'https://aax-[^\s"]{200,}',  # Amazon tracking URLs over 200 chars
-            r'https://[^\s"]*tracking[^\s"]{100,}',  # General tracking URLs
-            r'https://[^\s"]*analytics[^\s"]{100,}',  # Analytics URLs
-            r'https://[^\s"]*gtm[^\s"]{100,}',  # Google Tag Manager URLs
-        ]
-
-        # Base64 patterns to remove or replace
-        self.base64_patterns = [
-            r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}',  # Base64 images over 50 chars
-            r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}',  # Base64 applications
-            r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}',  # Base64 text
-        ]
-
-        # Universal noise selectors to remove (for any site)
-        self.universal_noise_selectors = [
-            '[id*="nav"]', '[class*="nav"]',  # Navigation
-            '[id*="menu"]', '[class*="menu"]',  # Menus
-            '[id*="sidebar"]', '[class*="sidebar"]',  # Sidebars
-            '[id*="footer"]', '[class*="footer"]',  # Footers
-            '[id*="header"]', '[class*="header"]',  # Headers
-            '[class*="ads"]', '[class*="advertisement"]',  # Ads
-            '[class*="sponsored"]', '[class*="promo"]',  # Sponsored content
-            '[class*="popup"]', '[class*="modal"]',  # Popups/modals
-            '[class*="overlay"]', '[class*="tooltip"]',  # Overlays
-            '[class*="cookie"]', '[class*="gdpr"]',  # Cookie notices
-            '[class*="newsletter"]', '[class*="subscription"]',  # Email signup
-            '[class*="social"]', '[class*="share"]',  # Social media
-            '[class*="comment"]', '[class*="discussion"]',  # Comments (unless main content)
-            '[class*="tracking"]', '[class*="analytics"]',  # Tracking
-        ]
-
-        # Attributes to remove (keep only semantic ones)
-        self.noise_attributes = {
-            'style', 'onclick', 'onload', 'onchange', 'onmouseover',
-            'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
-            'onerror', 'onabort', 'oncanplay', 'oncanplaythrough',
-            'ondurationchange', 'onemptied', 'onended', 'onloadeddata',
-            'onloadedmetadata', 'onloadstart', 'onpause', 'onplay',
-            'onplaying', 'onprogress', 'onratechange', 'onseeked',
-            'onseeking', 'onstalled', 'onsuspend', 'ontimeupdate',
-            'onvolumechange', 'onwaiting', 'onkeydown', 'onkeypress',
-            'onkeyup', 'onmousedown', 'onmousemove', 'onmouseup',
-            'onwheel', 'ondrag', 'ondragend', 'ondragenter',
-            'ondragleave', 'ondragover', 'ondragstart', 'ondrop',
-            'onscroll', 'onresize', 'onstorage', 'onhashchange',
-            'onpopstate', 'onbeforeprint', 'onafterprint',
-            'onbeforeunload', 'onunload', 'onmessage', 'oninput',
-            'oninvalid', 'onsearch', 'autocomplete', 'autofocus',
-            'checked', 'defer', 'disabled', 'hidden', 'loop',
-            'multiple', 'muted', 'open', 'readonly', 'required',
-            'reversed', 'selected', 'autoplay', 'controls',
-            'crossorigin', 'download', 'hreflang', 'ismap',
-            'itemid', 'itemprop', 'itemref', 'itemscope',
-            'itemtype', 'kind', 'media', 'rel', 'sandbox',
-            'scope', 'sizes', 'span', 'spellcheck', 'srcdoc',
-            'srclang', 'srcset', 'step', 'tabindex', 'target',
-            'translate', 'usemap', 'wrap', 'accept', 'acceptcharset',
-            'accesskey', 'action', 'allowfullscreen', 'alt',
-            'async', 'autocapitalize', 'capture', 'charset',
-            'cols', 'colspan', 'content', 'contenteditable',
-            'contextmenu', 'coords', 'datetime', 'decoding',
-            'default', 'dir', 'dirname', 'download', 'draggable',
-            'enctype', 'enterkeyhint', 'for', 'form', 'formaction',
-            'formenctype', 'formmethod', 'formnovalidate',
-            'formtarget', 'headers', 'height', 'high', 'href',
-            'hreflang', 'httpequiv', 'icon', 'importance', 'inputmode',
-            'integrity', 'intrinsicsize', 'keytype', 'label',
-            'lang', 'list', 'loading', 'low', 'manifest',
-            'max', 'maxlength', 'method', 'min', 'minlength',
-            'name', 'novalidate', 'optimum', 'pattern',
-            'ping', 'placeholder', 'poster', 'preload',
-            'radiogroup', 'referrerpolicy', 'rows', 'rowspan',
-            'shape', 'size', 'slot', 'src', 'start',
-            'title', 'type', 'value', 'width'
-        }
-
-        # Keep these semantic attributes
-        self.keep_attributes = {
-            'id', 'class', 'data-testid', 'data-test', 'data-cy',
-            'aria-label', 'aria-labelledby', 'aria-describedby',
-            'role', 'alt', 'title', 'href', 'src', 'action',
-            'name', 'value', 'placeholder', 'type'
-        }
-
-        # Patterns to detect valuable JavaScript data
-        self.useful_js_patterns = [
-            # Next.js/Nuxt.js SSR data
-            r'__NEXT_DATA__\s*=\s*(\{.+?\});?',
-            r'__NUXT__\s*=\s*(\{.+?\});?',
-            r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?',
-
-            # React/Vue hydration data
-            r'window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?',
-            r'window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?',
-
-            # E-commerce data
-            r'window\.productData\s*=\s*(\{.+?\});?',
-            r'window\.cartData\s*=\s*(\{.+?\});?',
-            r'dataLayer\s*=\s*(\[.+?\]);?',
-
-            # Analytics and tracking (structured data)
-            r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
-
-            # JSON-LD structured data (often in script tags)
-            r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
-
-            # Generic JSON objects (be more selective)
-            r'(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?',
-        ]
-
-        # Compiled regex patterns for efficiency
-        self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE)
-                                  for pattern in self.useful_js_patterns]
-
-    def clean_html(
-        self,
-        html_content: str,
-        preserve_js_data: bool = True,
-        aggressive_cleaning: bool = False
-    ) -> Tuple[str, Dict[str, any]]:
-        """
-        Clean HTML content while preserving valuable data
-
-        Args:
-            html_content: Raw HTML content
-            preserve_js_data: Whether to extract and preserve JS data
-            aggressive_cleaning: Whether to apply more aggressive cleaning
-
-        Returns:
-            Tuple of (cleaned_html, extracted_data)
-        """
-        if not html_content or not html_content.strip():
-            return "", {}
-
-        # Parse HTML
-        try:
-            soup = BeautifulSoup(html_content, 'html.parser')
-        except Exception as e:
-            raise ValidationError(f"Failed to parse HTML: {e}")
-
-        extracted_data = {}
-
-        # Extract valuable JavaScript data before removing scripts
-        if preserve_js_data:
-            extracted_data = self._extract_js_data(soup)
-
-        # Remove universal noise elements for aggressive cleaning
-        if aggressive_cleaning:
-            self._remove_universal_noise(soup)
-            self._truncate_long_urls(soup)  # Do this before tracking URL cleaning
-            self._clean_tracking_urls(soup)
-            self._clean_base64_data(soup)
-            self._remove_long_attributes(soup)
-            self._remove_html_comments(soup)
-            self._clean_whitespace(soup)
-
-        # Remove noise elements
-        self._remove_noise_elements(soup)
-
-        # Clean attributes
-        self._clean_attributes(soup, aggressive_cleaning)
-
-        # Remove comments
-        self._remove_comments(soup)
-
-        # Clean text and whitespace
-        cleaned_html = self._clean_text_and_whitespace(soup)
-
-        # Final cleanup
-        cleaned_html = self._final_cleanup(cleaned_html)
-
-        return cleaned_html, extracted_data
-
-    def _extract_js_data(self, soup: BeautifulSoup) -> Dict[str, any]:
-        """Extract valuable data from JavaScript"""
-        extracted_data = {
-            'ssr_data': {},
-            'structured_data': [],
-            'analytics_data': {},
-            'product_data': {},
-            'raw_extracts': []
-        }
-
-        # Find all script tags
-        script_tags = soup.find_all('script')
-
-        for script in script_tags:
-            if not script.string:
-                continue
-
-            script_content = script.string.strip()
-
-            # Skip empty or very short scripts
-            if len(script_content) < 10:
-                continue
-
-            # Check for JSON-LD structured data
-            if script.get('type') == 'application/ld+json':
-                try:
-                    json_data = json.loads(script_content)
-                    extracted_data['structured_data'].append(json_data)
-                    continue
-                except json.JSONDecodeError:
-                    pass
-
-            # Extract data using patterns
-            self._extract_with_patterns(script_content, extracted_data)
-
-        # Remove empty categories
-        extracted_data = {k: v for k, v in extracted_data.items() if v}
-
-        return extracted_data
-
-    def _extract_with_patterns(self, script_content: str, extracted_data: Dict):
-        """Extract data using compiled regex patterns and heuristics"""
-
-        # First try specific named patterns
-        self._extract_named_patterns(script_content, extracted_data)
-
-        # Then try generic JSON extraction as fallback
-        self._extract_generic_json(script_content, extracted_data)
-
-    def _extract_named_patterns(self, script_content: str, extracted_data: Dict):
-        """Extract data using specific named patterns"""
-
-        # Next.js SSR data
-        nextjs_patterns = [
-            r'__NEXT_DATA__\s*=\s*({.+?});',
-            r'window\.__NEXT_DATA__\s*=\s*({.+?});'
-        ]
-
-        for pattern in nextjs_patterns:
-            matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
-            for match in matches:
-                self._try_parse_json(match.group(1), extracted_data, 'ssr_data')
-
-        # React Query state
-        react_patterns = [
-            r'window\.__REACT_QUERY_STATE__\s*=\s*({.+?});'
-        ]
-
-        for pattern in react_patterns:
-            matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
-            for match in matches:
-                self._try_parse_json(match.group(1), extracted_data, 'ssr_data')
-
-        # Product data
-        product_patterns = [
-            r'window\.productData\s*=\s*({.+?});',
-            r'dataLayer\s*=\s*(\[.+?\]);'
-        ]
-
-        for pattern in product_patterns:
-            matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
-            for match in matches:
-                self._try_parse_json(match.group(1), extracted_data, 'product_data')
-
-    def _extract_generic_json(self, script_content: str, extracted_data: Dict):
-        """Extract generic JSON objects as fallback"""
-
-        # Look for variable assignments with objects
-        generic_patterns = [
-            r'(?:window\.|var\s+|let\s+|const\s+)(\w+)\s*=\s*({[^;]+});',
-            r'(\w+)\s*=\s*({[^;]+});'
-        ]
-
-        for pattern in generic_patterns:
-            matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
-            for match in matches:
-                var_name = match.group(1)
-                json_content = match.group(2)
-
-                # Only process if it looks like substantial data
-                if len(json_content) > 20:
-                    self._try_parse_json(json_content, extracted_data, 'raw_extracts', var_name)
-
-    def _try_parse_json(self, json_str: str, extracted_data: Dict, category: str, var_name: str = None):
-        """Try to parse JSON string and categorize it"""
-        try:
-            json_data = json.loads(json_str)
-
-            if category == 'ssr_data':
-                if 'ssr_data' not in extracted_data:
-                    extracted_data['ssr_data'] = {}
-                if isinstance(json_data, dict):
-                    extracted_data['ssr_data'].update(json_data)
-                else:
-                    extracted_data['ssr_data'][var_name or 'data'] = json_data
-
-            elif category == 'product_data':
-                if 'product_data' not in extracted_data:
-                    extracted_data['product_data'] = {}
-                if isinstance(json_data, dict):
-                    extracted_data['product_data'].update(json_data)
-                else:
-                    extracted_data['product_data'][var_name or 'data'] = json_data
-
-            else:  # raw_extracts - filter useful ones only
-                # Only store raw extracts if they look like complete objects
-                if isinstance(json_data, dict) and len(json_data) > 3:
-                    if 'raw_extracts' not in extracted_data:
-                        extracted_data['raw_extracts'] = []
-                    extracted_data['raw_extracts'].append(json_data)
-
-        except json.JSONDecodeError:
-            # Skip invalid JSON - it's noise
-            pass
-
-    def _remove_noise_elements(self, soup: BeautifulSoup):
-        """Remove noise HTML elements"""
-        # Remove noise tags
-        for tag_name in self.noise_tags:
-            for tag in soup.find_all(tag_name):
-                tag.decompose()
-
-        # Remove empty divs and spans
-        for tag in soup.find_all(['div', 'span']):
-            if not tag.get_text(strip=True) and not tag.find_all():
-                tag.decompose()
-
-    def _clean_attributes(self, soup: BeautifulSoup, aggressive: bool = False):
-        """Clean HTML attributes"""
-        for tag in soup.find_all(True):  # Find all tags
-            if hasattr(tag, 'attrs'):
-                # Determine which attributes to keep
-                if aggressive:
-                    # Keep only essential semantic attributes
-                    keep_attrs = self.keep_attributes & {'id', 'class', 'href', 'src', 'alt'}
-                else:
-                    keep_attrs = self.keep_attributes
-
-                # Remove unwanted attributes
-                attrs_to_remove = set(tag.attrs.keys()) - keep_attrs
-                for attr in attrs_to_remove:
-                    del tag.attrs[attr]
-
-                # Clean class names (remove utility classes if aggressive)
-                if aggressive and 'class' in tag.attrs:
-                    classes = tag.attrs['class']
-                    if isinstance(classes, list):
-                        # Remove utility classes (Tailwind, Bootstrap, etc.)
-                        semantic_classes = [
-                            cls for cls in classes
-                            if not self._is_utility_class(cls)
-                        ]
-                        if semantic_classes:
-                            tag.attrs['class'] = semantic_classes
-                        else:
-                            del tag.attrs['class']
-
-    def _remove_universal_noise(self, soup: BeautifulSoup):
-        """Remove universal noise elements from any website"""
-        for selector in self.universal_noise_selectors:
-            try:
-                elements = soup.select(selector)
-                for element in elements:
-                    # Keep only main product content areas
-                    if not self._is_main_content_element(element):
-                        element.decompose()
-            except Exception:
-                # Skip invalid selectors
-                continue
-
-    def _clean_tracking_urls(self, soup: BeautifulSoup):
-        """Remove or shorten tracking URLs that bloat HTML size"""
-        import re
-
-        # Clean href attributes in links
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
-            if href and not href.endswith('...truncated'):  # Skip already truncated URLs
-                for pattern in self.tracking_url_patterns:
-                    if re.match(pattern, href):
-                        # Replace with placeholder for tracking URLs
-                        tag['href'] = '#tracking-url-removed'
-                        break
-
-        # Clean src attributes in images
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
-            if src:
-                for pattern in self.tracking_url_patterns:
-                    if re.match(pattern, src):
-                        # Replace with minimal SVG placeholder
-                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                        break
-
-    def _clean_base64_data(self, soup: BeautifulSoup):
-        """Remove or replace large base64 encoded data to reduce HTML size"""
-        import re
-
-        # Clean base64 data in img src attributes
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
-            if src:
-                for pattern in self.base64_patterns:
-                    if re.search(pattern, src):
-                        # Extract image type if possible
-                        if src.startswith('data:image/'):
-                            # Replace with minimal SVG placeholder
-                            tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                        else:
-                            # Remove the src entirely for non-images
-                            del tag['src']
-                        break
-
-        # Clean base64 data in style attributes and CSS
-        for tag in soup.find_all(style=True):
-            style = tag.get('style', '')
-            if style:
-                for pattern in self.base64_patterns:
-                    if re.search(pattern, style):
-                        # Remove the entire style attribute if it contains large base64
-                        del tag['style']
-                        break
-
-        # Clean base64 data in href attributes (for downloads, etc.)
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
-            if href:
-                for pattern in self.base64_patterns:
-                    if re.match(pattern, href):
-                        # Replace with placeholder
-                        tag['href'] = '#base64-data-removed'
-                        break
-
-        # Clean base64 data from any attribute (catch-all)
-        for tag in soup.find_all():
-            attrs_to_clean = []
-            for attr, value in tag.attrs.items():
-                if isinstance(value, str):
-                    for pattern in self.base64_patterns:
-                        if re.search(pattern, value):
-                            attrs_to_clean.append(attr)
-                            break
-
-            # Clean or remove attributes with base64 data
-            for attr in attrs_to_clean:
-                if attr in ['src', 'href']:
-                    # Replace with placeholder for important attributes
-                    if attr == 'src':
-                        tag[attr] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
-                    else:
-                        tag[attr] = '#base64-data-removed'
-                else:
-                    # Remove entirely for other attributes
-                    del tag.attrs[attr]
-
-    def _truncate_long_urls(self, soup: BeautifulSoup, max_url_length: int = 500):
-        """Truncate any URL longer than max_url_length characters"""
-
-        # Process all elements with href attributes (links)
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
-            if isinstance(href, str) and len(href) > max_url_length:
-                # Keep the beginning of the URL and add indicator
-                truncated_url = href[:max_url_length] + '...truncated'
-                tag['href'] = truncated_url
-
-        # Process all elements with src attributes (images, iframes, etc.)
-        for tag in soup.find_all(['img', 'iframe', 'embed', 'object'], src=True):
-            src = tag.get('src', '')
-            if isinstance(src, str) and len(src) > max_url_length:
-                # For images, if it's not base64, truncate it
-                if not src.startswith('data:'):
-                    truncated_url = src[:max_url_length] + '...truncated'
-                    tag['src'] = truncated_url
-                # Base64 data is handled by _clean_base64_data method
-
-        # Process any other URL-like attributes
-        url_attributes = ['action', 'formaction', 'poster', 'cite', 'data', 'manifest']
-        for tag in soup.find_all():
-            for attr in url_attributes:
-                if tag.has_attr(attr):
-                    value = tag.get(attr, '')
-                    if isinstance(value, str) and len(value) > max_url_length:
-                        # Check if it looks like a URL (contains :// or starts with / or http)
-                        if ('://' in value or
-                            value.startswith('/') or
-                            value.startswith('http') or
-                            value.startswith('//')):
-                            truncated_url = value[:max_url_length] + '...truncated'
-                            tag[attr] = truncated_url
-
-    def _remove_long_attributes(self, soup: BeautifulSoup):
-        """Remove attributes with extremely long values that are likely tracking data"""
-        for tag in soup.find_all():
-            # Check all attributes for excessive length
-            attrs_to_remove = []
-            for attr, value in tag.attrs.items():
-                if isinstance(value, str):
-                    # Remove attributes longer than 800 chars (likely tracking data)
-                    # Increased from 500 since URLs are now handled separately
-                    if len(value) > 800:
-                        attrs_to_remove.append(attr)
-                    # Remove specific tracking attributes regardless of length
-                    elif any(tracking in attr.lower() for tracking in
-                             ['tracking', 'analytics', 'gtm', 'pixel', 'impression', 'asin']):
-                        attrs_to_remove.append(attr)
-                elif isinstance(value, list):
-                    # Check if list contains very long strings
-                    if any(isinstance(v, str) and len(v) > 500 for v in value):
-                        attrs_to_remove.append(attr)
-
-            # Remove the problematic attributes
-            for attr in attrs_to_remove:
-                del tag.attrs[attr]
-
-    def get_cleaning_stats(self, original_size: int, cleaned_size: int) -> Dict[str, any]:
-        """Get statistics about the cleaning process"""
-        reduction_bytes = original_size - cleaned_size
-        reduction_percent = (reduction_bytes / original_size * 100) if original_size > 0 else 0
-
-        return {
-            'original_size': original_size,
-            'cleaned_size': cleaned_size,
-            'reduction_bytes': reduction_bytes,
-            'reduction_percent': round(reduction_percent, 2),
-            'compression_ratio': round(original_size / cleaned_size, 2) if cleaned_size > 0 else 0
-        }
-
-    def _remove_html_comments(self, soup: BeautifulSoup):
-        """Remove all HTML comments to reduce size"""
-        # Remove all HTML comments
-        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-            comment.extract()
-
-    def _clean_whitespace(self, soup: BeautifulSoup):
-        """Clean excessive whitespace in text content"""
-        import re
-
-        # Process all text nodes
-        for element in soup.find_all(text=True):
-            if element.parent.name not in ['script', 'style']:  # Skip scripts and styles
-                # Replace multiple spaces with single space
-                cleaned_text = re.sub(r' {3,}', ' ', str(element))
-                # Replace multiple newlines with maximum 2
-                cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-                # Replace multiple tabs with single space
-                cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
-                # Clean mixed whitespace
-                cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)
-                element.replace_with(cleaned_text)
-
-    def _advanced_whitespace_cleanup(self, html_content: str) -> str:
-        """Advanced whitespace cleanup for aggressive cleaning"""
-        import re
-
-        # Remove excessive spaces (more than 2)
-        html_content = re.sub(r' {3,}', ' ', html_content)
-
-        # Remove excessive newlines (more than 2)
-        html_content = re.sub(r'\n{3,}', '\n\n', html_content)
-
-        # Remove excessive tabs
-        html_content = re.sub(r'\t{2,}', '\t', html_content)
-
-        # Clean mixed whitespace patterns
-        html_content = re.sub(r'[ \t]{3,}', ' ', html_content)
-
-        # Remove whitespace at line endings
-        html_content = re.sub(r'[ \t]+\n', '\n', html_content)
-
-        # Remove whitespace at line beginnings (except single indent)
-        html_content = re.sub(r'\n[ \t]{2,}', '\n ', html_content)
-
-        # Clean space between tags
-        html_content = re.sub(r'>\s{2,}<', '> <', html_content)
-
-        # Final cleanup
-        html_content = html_content.strip()
-
-        return html_content
-
-    def _is_main_content_element(self, element) -> bool:
-        """Check if element contains main product content"""
-        # Keep elements that likely contain product info
-        product_indicators = [
-            'product', 'detail', 'title', 'price', 'description',
-            'spec', 'review', 'rating', 'availability', 'image'
-        ]
-
-        element_text = str(element).lower()
-        for indicator in product_indicators:
-            if indicator in element_text:
-                return True
-        return False
-
-    def _is_utility_class(self, class_name: str) -> bool:
-        """Check if a class name is a utility class"""
-        utility_patterns = [
-            r'^(m|p)[trblxy]?-\d+$',  # Margin/padding utilities
-            r'^(m|p)[xy]-auto$',  # Margin auto utilities
-            r'^(w|h)-\d+$',  # Width/height utilities
-            r'^text-(xs|sm|lg|xl|\d+xl)$',  # Text size utilities
-            r'^bg-\w+(-\d+)?$',  # Background utilities
-            r'^text-\w+(-\d+)?$',  # Text color utilities
-            r'^border(-\w+)?(-\d+)?$',  # Border utilities
-            r'^flex(-\w+)?$',  # Flex utilities
-            r'^grid(-\w+)?$',  # Grid utilities
-            r'^hidden$',  # Visibility utilities
-            r'^sr-only$',  # Screen reader utilities
-            r'^(sm|md|lg|xl|2xl):.*$',  # Responsive prefixes
-            r'^\w+-\d+$',  # Generic number-based utilities
-            r'^mx-auto$',  # Margin x auto
-            r'^my-auto$',  # Margin y auto
-        ]
-
-        return any(re.match(pattern, class_name) for pattern in utility_patterns)
-
-    def _truncate_long_text_content(self, soup: BeautifulSoup, max_text_length: int = 300):
-        """Truncate text content longer than max_text_length characters"""
-        # Process all text nodes in the soup
-        for element in soup.find_all(text=True):
-            # Skip script and style tags
-            if element.parent.name in ['script', 'style']:
-                continue
-
-            text_content = str(element).strip()
-
-            # Only process non-empty text that's longer than the limit
-            if text_content and len(text_content) > max_text_length:
-                # Truncate and add ellipsis
-                truncated_text = text_content[:max_text_length] + '...'
-                element.replace_with(truncated_text)
-
-    def _remove_comments(self, soup: BeautifulSoup):
-        """Remove HTML comments"""
-        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-            comment.extract()
-
-    def _clean_text_and_whitespace(self, soup: BeautifulSoup) -> str:
-        """Clean text content and normalize whitespace"""
-        # Truncate long text content before converting to string
-        self._truncate_long_text_content(soup)
-
-        # Get the HTML string
-        html_str = str(soup)
-
-        # Normalize whitespace
-        html_str = re.sub(r'\s+', ' ', html_str)  # Multiple spaces to single
-        html_str = re.sub(r'\n\s*\n', '\n', html_str)  # Multiple newlines to single
-        html_str = re.sub(r'>\s+<', '><', html_str)  # Remove spaces between tags
-
-        return html_str
-
-    def _final_cleanup(self, html_content: str) -> str:
-        """Final cleanup and optimization"""
-        # Remove empty attributes
-        html_content = re.sub(r'\s+\w+=""', '', html_content)
-
-        # Remove extra spaces in attributes
-        html_content = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html_content)
-
-        # Normalize quotes
-        html_content = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html_content)
-
-        # Remove trailing spaces before closing tags
-        html_content = re.sub(r'\s+(/?>)', r'\1', html_content)
-
-        # Enhanced whitespace cleanup
-        html_content = self._advanced_whitespace_cleanup(html_content)
-
-        return html_content.strip()
-
-    def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> Dict[str, any]:
-        """Get statistics about the cleaning process"""
-        original_size = len(original_html)
-        cleaned_size = len(cleaned_html)
-
-        # Estimate token reduction (rough approximation)
-        original_tokens = original_size // 4  # Rough estimate: 4 chars per token
-        cleaned_tokens = cleaned_size // 4
-
-        return {
-            "original_size_bytes": original_size,
-            "cleaned_size_bytes": cleaned_size,
-            "size_reduction_bytes": original_size - cleaned_size,
-            "size_reduction_percent": ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0,
-            "estimated_original_tokens": original_tokens,
-            "estimated_cleaned_tokens": cleaned_tokens,
-            "estimated_token_savings": original_tokens - cleaned_tokens,
-            "estimated_token_savings_percent": ((original_tokens - cleaned_tokens) / original_tokens * 100) if original_tokens > 0 else 0
-        }
-
-
-# Convenience functions
-def clean_html_for_llm(
-    html_content: str,
-    preserve_js_data: bool = True,
-    aggressive_cleaning: bool = False
-) -> Tuple[str, Dict[str, any]]:
-    """
-    Quick function to clean HTML for LLM analysis
-
-    Args:
-        html_content: Raw HTML content
-        preserve_js_data: Whether to extract and preserve JS data
-        aggressive_cleaning: Whether to apply aggressive cleaning
-
-    Returns:
-        Tuple of (cleaned_html, extracted_data)
-    """
-    cleaner = SmartHTMLCleaner()
-    return cleaner.clean_html(html_content, preserve_js_data, aggressive_cleaning)
-
-
-def extract_js_data_only(html_content: str) -> Dict[str, any]:
-    """
-    Extract only JavaScript data without cleaning HTML
-
-    Args:
-        html_content: Raw HTML content
-
-    Returns:
-        Extracted JavaScript data
-    """
-    try:
-        soup = BeautifulSoup(html_content, 'html.parser')
-        cleaner = SmartHTMLCleaner()
-        return cleaner._extract_js_data(soup)
-    except Exception:
-        return {}
-
-
-