unrealon-1.0.9-py3-none-any.whl → unrealon-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.0.dist-info/METADATA +164 -0
- unrealon-1.1.0.dist-info/RECORD +82 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
- unrealon-1.1.0.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.9.dist-info/METADATA +0 -810
- unrealon-1.0.9.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
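The `unrealon_browser` moves above (and the replacement of `unrealon_driver/src/...` with top-level `unrealon_driver/parser/...` and `unrealon_driver/browser/...` modules) flatten away the intermediate `src/` package level, so dotted import paths shorten accordingly. A hedged illustration, using only module paths taken from the file moves listed above (which symbols each module actually exports is not visible in this diff):

```python
# 1.0.9 layout — the module lived under an extra src/ package:
# import unrealon_browser.src.managers.stealth

# 1.1.0 layout — same module, one level shallower:
import unrealon_browser.managers.stealth
```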
unrealon_driver/parser/managers/html.py (new file)
@@ -0,0 +1,732 @@
+"""
+HTML Manager - Smart HTML processing and cleaning with Pydantic v2
+
+Strict compliance with CRITICAL_REQUIREMENTS.md:
+- No Dict[str, Any] usage
+- Complete type annotations
+- Pydantic v2 models everywhere
+- Custom exception hierarchy
+"""
+
+import json
+import re
+from typing import Optional, List, Union
+from pathlib import Path
+from pydantic import BaseModel, Field, ConfigDict, field_validator
+import asyncio
+import concurrent.futures
+
+from bs4 import BeautifulSoup, Comment
+
+from unrealon_rpc.logging import get_logger
+
+
+class HTMLCleaningConfig(BaseModel):
+    """HTML cleaning configuration with strict typing"""
+    model_config = ConfigDict(
+        validate_assignment=True,
+        extra="forbid"
+    )
+
+    # Cleaning modes
+    aggressive_cleaning: bool = Field(
+        default=True,
+        description="Enable aggressive cleaning"
+    )
+    preserve_js_data: bool = Field(
+        default=True,
+        description="Preserve JavaScript data during cleaning"
+    )
+
+    # Content preservation
+    preserve_images: bool = Field(
+        default=False,
+        description="Preserve image tags"
+    )
+    preserve_links: bool = Field(
+        default=True,
+        description="Preserve link tags"
+    )
+    preserve_forms: bool = Field(
+        default=False,
+        description="Preserve form elements"
+    )
+
+    # Size limits
+    max_html_size: int = Field(
+        default=1000000,
+        ge=1000,
+        le=10000000,
+        description="Maximum HTML size in characters"
+    )
+    max_text_length: int = Field(
+        default=300,
+        ge=50,
+        le=1000,
+        description="Maximum text content length per element"
+    )
+    max_url_length: int = Field(
+        default=500,
+        ge=100,
+        le=2000,
+        description="Maximum URL length"
+    )
+
+    # Noise removal
+    remove_comments: bool = Field(
+        default=True,
+        description="Remove HTML comments"
+    )
+    remove_scripts: bool = Field(
+        default=True,
+        description="Remove script tags"
+    )
+    remove_styles: bool = Field(
+        default=True,
+        description="Remove style tags"
+    )
+    remove_tracking: bool = Field(
+        default=True,
+        description="Remove tracking URLs and attributes"
+    )
+
+    # Whitespace handling
+    normalize_whitespace: bool = Field(
+        default=True,
+        description="Normalize whitespace"
+    )
+    remove_empty_elements: bool = Field(
+        default=True,
+        description="Remove empty elements"
+    )
+
+    # Custom selectors
+    noise_selectors: List[str] = Field(
+        default_factory=lambda: [
+            '[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]',
+            '[class*="footer"]', '[class*="header"]', '[class*="ads"]',
+            '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'
+        ],
+        description="CSS selectors for noise elements to remove"
+    )
+
+
+class HTMLCleaningStats(BaseModel):
+    """HTML cleaning statistics"""
+    model_config = ConfigDict(
+        validate_assignment=True,
+        extra="forbid"
+    )
+
+    original_size_bytes: int = Field(ge=0)
+    cleaned_size_bytes: int = Field(ge=0)
+    size_reduction_bytes: int = Field(ge=0)
+    size_reduction_percent: float = Field(ge=0.0, le=100.0)
+    estimated_original_tokens: int = Field(ge=0)
+    estimated_cleaned_tokens: int = Field(ge=0)
+    estimated_token_savings: int = Field(ge=0)
+    estimated_token_savings_percent: float = Field(ge=0.0, le=100.0)
+
+
+class ExtractedJSData(BaseModel):
+    """Extracted JavaScript data structure"""
+    model_config = ConfigDict(
+        validate_assignment=True,
+        extra="forbid"
+    )
+
+    ssr_data: dict[str, str] = Field(default_factory=dict)
+    structured_data: List[dict[str, str]] = Field(default_factory=list)
+    raw_extracts: List[dict[str, str]] = Field(default_factory=list)
+
+
+class HTMLManagerError(Exception):
+    """Base exception for HTML manager"""
+    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
+        self.message = message
+        self.operation = operation
+        self.details = details or {}
+        super().__init__(message)
+
+
+class HTMLParsingError(HTMLManagerError):
+    """Raised when HTML parsing fails"""
+    pass
+
+
+class HTMLCleaningError(HTMLManagerError):
+    """Raised when HTML cleaning fails"""
+    pass
+
+
+class HTMLManager:
+    """
+    🧹 HTML Manager - Smart HTML processing and cleaning
+
+    Features:
+    - LLM Optimized: Removes noise, preserves valuable content
+    - Token Efficient: Reduces HTML size for cost-effective LLM analysis
+    - Smart Extraction: Preserves JavaScript data and structured content
+    - Performance: Fast cleaning with configurable aggressiveness
+    - Safe: Handles malformed HTML gracefully
+    - Type Safety: Full Pydantic v2 compliance
+    """
+
+    def __init__(self, config: Optional[HTMLCleaningConfig] = None):
+        self.config = config or HTMLCleaningConfig()
+        self.logger = get_logger()
+
+        # Compile regex patterns for performance
+        self._compile_patterns()
+
+    def _compile_patterns(self) -> None:
+        """Compile regex patterns for performance"""
+        # Tracking URL patterns
+        self.tracking_url_patterns = [
+            re.compile(r'https://aax-[^\s"]{200,}', re.IGNORECASE),
+            re.compile(r'https://[^\s"]*tracking[^\s"]{100,}', re.IGNORECASE),
+            re.compile(r'https://[^\s"]*analytics[^\s"]{100,}', re.IGNORECASE),
+            re.compile(r'https://[^\s"]*gtm[^\s"]{100,}', re.IGNORECASE),
+        ]
+
+        # Base64 patterns
+        self.base64_patterns = [
+            re.compile(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}'),
+            re.compile(r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
+            re.compile(r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
+        ]
+
+        # JavaScript data patterns
+        self.js_data_patterns = [
+            re.compile(r'__NEXT_DATA__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
+            re.compile(r'__NUXT__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
+            re.compile(r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
+            re.compile(r'dataLayer\s*=\s*(\[.+?\]);?', re.DOTALL | re.IGNORECASE),
+        ]
+
+    # ==========================================
+    # MAIN CLEANING METHODS
+    # ==========================================
+
+    async def clean_html(
+        self,
+        html: str,
+        aggressive: Optional[bool] = None,
+        preserve_js_data: Optional[bool] = None
+    ) -> str:
+        """
+        Clean HTML content for LLM analysis
+
+        Args:
+            html: Raw HTML content
+            aggressive: Override aggressive cleaning setting
+            preserve_js_data: Override JS data preservation setting
+
+        Returns:
+            Cleaned HTML optimized for LLM
+        """
+        if not html or not html.strip():
+            return ""
+
+        # Use config defaults or overrides
+        aggressive_cleaning = aggressive if aggressive is not None else self.config.aggressive_cleaning
+        preserve_js = preserve_js_data if preserve_js_data is not None else self.config.preserve_js_data
+
+        try:
+            self.logger.info(f"Cleaning HTML: {len(html)} characters")
+
+            # Check size limits
+            if len(html) > self.config.max_html_size:
+                self.logger.warning(f"HTML size ({len(html)}) exceeds limit ({self.config.max_html_size})")
+                html = html[:self.config.max_html_size]
+
+            # Parse HTML
+            soup = BeautifulSoup(html, 'html.parser')
+
+            # Extract JavaScript data before cleaning
+            extracted_data = ExtractedJSData()
+            if preserve_js:
+                extracted_data = self._extract_js_data(soup)
+
+            # Apply cleaning steps
+            if aggressive_cleaning:
+                self._aggressive_cleaning(soup)
+            else:
+                self._standard_cleaning(soup)
+
+            # Get cleaned HTML
+            cleaned_html = str(soup)
+
+            # Final cleanup
+            cleaned_html = self._final_cleanup(cleaned_html)
+
+            # Log results
+            original_size = len(html)
+            cleaned_size = len(cleaned_html)
+            reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
+
+            self.logger.info(
+                f"HTML cleaned: {original_size} → {cleaned_size} chars "
+                f"({reduction:.1f}% reduction)"
+            )
+
+            return cleaned_html
+
+        except Exception as e:
+            self.logger.error(f"HTML cleaning failed: {e}")
+            raise HTMLCleaningError(
+                message=f"Failed to clean HTML: {e}",
+                operation="clean_html",
+                details={"html_size": str(len(html))}
+            ) from e
+
+    def clean_html_sync(self, html: str, **kwargs) -> str:
+        """
+        Synchronous HTML cleaning
+
+        Args:
+            html: Raw HTML content
+            **kwargs: Cleaning options
+
+        Returns:
+            Cleaned HTML
+        """
+        # Handle running event loop
+        try:
+            loop = asyncio.get_running_loop()
+            # If we're in an event loop, create a new thread
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, self.clean_html(html, **kwargs))
+                return future.result()
+        except RuntimeError:
+            # No event loop running, safe to use asyncio.run
+            return asyncio.run(self.clean_html(html, **kwargs))
+
+    async def parse_and_clean_html(
+        self,
+        html: str,
+        schema: Optional[dict[str, str]] = None,
+        instructions: Optional[str] = None,
+        **kwargs
+    ) -> dict[str, str]:
+        """
+        Parse and clean HTML with LLM analysis preparation
+
+        Args:
+            html: Raw HTML content
+            schema: Optional data schema for extraction
+            instructions: Optional parsing instructions
+            **kwargs: Additional options
+
+        Returns:
+            Dictionary with cleaned HTML and metadata
+        """
+        try:
+            # Clean HTML
+            cleaned_html = await self.clean_html(html, **kwargs)
+
+            # Get cleaning stats
+            stats = self.get_cleaning_stats(html, cleaned_html)
+
+            result = {
+                "cleaned_html": cleaned_html,
+                "original_size": str(stats.original_size_bytes),
+                "cleaned_size": str(stats.cleaned_size_bytes),
+                "reduction_percent": f"{stats.size_reduction_percent:.1f}",
+                "estimated_token_savings": str(stats.estimated_token_savings)
+            }
+
+            if schema:
+                result["schema"] = str(schema)
+            if instructions:
+                result["instructions"] = instructions
+
+            return result
+
+        except Exception as e:
+            raise HTMLCleaningError(
+                message=f"Failed to parse and clean HTML: {e}",
+                operation="parse_and_clean_html"
+            ) from e
+
+    # ==========================================
+    # CLEANING IMPLEMENTATION
+    # ==========================================
+
+    def _standard_cleaning(self, soup: BeautifulSoup) -> None:
+        """Apply standard cleaning"""
+        # Remove noise elements
+        self._remove_noise_elements(soup)
+
+        # Clean attributes
+        self._clean_attributes(soup)
+
+        # Remove comments
+        if self.config.remove_comments:
+            self._remove_comments(soup)
+
+        # Normalize whitespace
+        if self.config.normalize_whitespace:
+            self._normalize_whitespace(soup)
+
+    def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
+        """Apply aggressive cleaning"""
+        # Standard cleaning first
+        self._standard_cleaning(soup)
+
+        # Remove noise selectors
+        self._remove_noise_selectors(soup)
+
+        # Clean tracking URLs
+        if self.config.remove_tracking:
+            self._clean_tracking_urls(soup)
+
+        # Clean base64 data
+        self._clean_base64_data(soup)
+
+        # Truncate long URLs
+        self._truncate_long_urls(soup)
+
+        # Remove long attributes
+        self._remove_long_attributes(soup)
+
+        # Truncate long text
+        self._truncate_long_text(soup)
+
+    def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
+        """Remove noise HTML elements"""
+        # Define noise tags
+        noise_tags = {
+            'meta', 'link', 'base', 'title', 'head', 'noscript',
+            'iframe', 'embed', 'object', 'svg', 'canvas',
+            'audio', 'video', 'source', 'track', 'area', 'map', 'param'
+        }
+
+        # Add conditional tags
+        if self.config.remove_scripts:
+            noise_tags.add('script')
+        if self.config.remove_styles:
+            noise_tags.add('style')
+        if not self.config.preserve_forms:
+            noise_tags.update({'form', 'input', 'button', 'select', 'textarea', 'fieldset', 'legend'})
+
+        # Remove noise tags
+        for tag_name in noise_tags:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Remove empty elements
+        if self.config.remove_empty_elements:
+            for tag in soup.find_all(['div', 'span', 'p']):
+                if not tag.get_text(strip=True) and not tag.find_all():
+                    tag.decompose()
+
+    def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
+        """Remove elements matching noise selectors"""
+        for selector in self.config.noise_selectors:
+            try:
+                elements = soup.select(selector)
+                for element in elements:
+                    element.decompose()
+            except Exception:
+                # Skip invalid selectors
+                continue
+
+    def _clean_attributes(self, soup: BeautifulSoup) -> None:
+        """Clean HTML attributes"""
+        # Attributes to remove
+        noise_attributes = {
+            'style', 'onclick', 'onload', 'onchange', 'onmouseover',
+            'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
+            'onerror', 'onabort', 'autocomplete', 'autofocus',
+            'checked', 'defer', 'disabled', 'hidden', 'loop',
+            'multiple', 'muted', 'open', 'readonly', 'required',
+            'tabindex', 'translate', 'draggable', 'contenteditable'
+        }
+
+        # Attributes to keep
+        keep_attributes = {
+            'id', 'class', 'href', 'src', 'alt', 'title',
+            'data-testid', 'data-test', 'data-cy',
+            'aria-label', 'aria-labelledby', 'aria-describedby', 'role'
+        }
+
+        for tag in soup.find_all(True):
+            if hasattr(tag, 'attrs'):
+                # Remove unwanted attributes
+                attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
+                for attr in attrs_to_remove:
+                    if attr in noise_attributes:
+                        del tag.attrs[attr]
+
+    def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
+        """Remove or replace tracking URLs"""
+        # Clean href attributes
+        for tag in soup.find_all(['a'], href=True):
+            href = tag.get('href', '')
+            if href:
+                for pattern in self.tracking_url_patterns:
+                    if pattern.match(href):
+                        tag['href'] = '#tracking-url-removed'
+                        break
+
+        # Clean src attributes
+        for tag in soup.find_all(['img'], src=True):
+            src = tag.get('src', '')
+            if src:
+                for pattern in self.tracking_url_patterns:
+                    if pattern.match(src):
+                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        break
+
+    def _clean_base64_data(self, soup: BeautifulSoup) -> None:
+        """Remove large base64 encoded data"""
+        for tag in soup.find_all(['img'], src=True):
+            src = tag.get('src', '')
+            if src:
+                for pattern in self.base64_patterns:
+                    if pattern.search(src):
+                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        break
+
+    def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
+        """Truncate URLs longer than max_url_length"""
+        max_length = self.config.max_url_length
+
+        for tag in soup.find_all(['a'], href=True):
+            href = tag.get('href', '')
+            if isinstance(href, str) and len(href) > max_length:
+                tag['href'] = href[:max_length] + '...truncated'
+
+        for tag in soup.find_all(['img'], src=True):
+            src = tag.get('src', '')
+            if isinstance(src, str) and len(src) > max_length and not src.startswith('data:'):
+                tag['src'] = src[:max_length] + '...truncated'
+
+    def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
+        """Remove attributes with extremely long values"""
+        for tag in soup.find_all():
+            attrs_to_remove = []
+            for attr, value in tag.attrs.items():
+                if isinstance(value, str) and len(value) > 800:
+                    attrs_to_remove.append(attr)
+                elif any(tracking in attr.lower() for tracking in
+                         ['tracking', 'analytics', 'gtm', 'pixel']):
+                    attrs_to_remove.append(attr)
+
+            for attr in attrs_to_remove:
+                del tag.attrs[attr]
+
+    def _truncate_long_text(self, soup: BeautifulSoup) -> None:
+        """Truncate text content longer than max_text_length"""
+        max_length = self.config.max_text_length
+
+        for element in soup.find_all(text=True):
+            if element.parent.name not in ['script', 'style']:
+                text_content = str(element).strip()
+                if text_content and len(text_content) > max_length:
+                    truncated_text = text_content[:max_length] + '...'
+                    element.replace_with(truncated_text)
+
+    def _remove_comments(self, soup: BeautifulSoup) -> None:
+        """Remove HTML comments"""
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+    def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
+        """Normalize whitespace in text content"""
+        for element in soup.find_all(text=True):
+            if element.parent.name not in ['script', 'style']:
+                # Replace multiple spaces with single space
+                cleaned_text = re.sub(r' {3,}', ' ', str(element))
+                # Replace multiple newlines with maximum 2
+                cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+                # Replace multiple tabs with single space
+                cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
+                element.replace_with(cleaned_text)
+
+    def _final_cleanup(self, html: str) -> str:
+        """Final cleanup and optimization"""
+        # Remove empty attributes
+        html = re.sub(r'\s+\w+=""', '', html)
+
+        # Remove extra spaces in attributes
+        html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
+
+        # Normalize quotes
+        html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
+
+        # Remove trailing spaces before closing tags
+        html = re.sub(r'\s+(/?>)', r'\1', html)
+
+        # Advanced whitespace cleanup
+        html = self._advanced_whitespace_cleanup(html)
+
+        return html.strip()
+
+    def _advanced_whitespace_cleanup(self, html: str) -> str:
+        """Advanced whitespace cleanup"""
+        # Remove excessive spaces
+        html = re.sub(r' {3,}', ' ', html)
+
+        # Remove excessive newlines
+        html = re.sub(r'\n{3,}', '\n\n', html)
+
+        # Clean space between tags
+        html = re.sub(r'>\s{2,}<', '> <', html)
+
+        return html
+
+    # ==========================================
+    # JAVASCRIPT DATA EXTRACTION
+    # ==========================================
+
+    def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
+        """Extract valuable JavaScript data"""
+        extracted_data = ExtractedJSData()
+
+        # Find all script tags
+        script_tags = soup.find_all('script')
+
+        for script in script_tags:
+            if not script.string:
+                continue
+
+            script_content = script.string.strip()
+
+            # Skip empty scripts
+            if len(script_content) < 10:
+                continue
+
+            # Check for JSON-LD structured data
+            if script.get('type') == 'application/ld+json':
+                try:
+                    json_data = json.loads(script_content)
+                    # Convert to string dict for Pydantic compliance
+                    str_data = {str(k): str(v) for k, v in json_data.items() if isinstance(k, (str, int, float))}
+                    extracted_data.structured_data.append(str_data)
+                    continue
+                except json.JSONDecodeError:
+                    pass
+
+            # Extract data using patterns
+            self._extract_with_patterns(script_content, extracted_data)
+
+        return extracted_data
+
+    def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
+        """Extract data using compiled regex patterns"""
+        for pattern in self.js_data_patterns:
+            matches = pattern.finditer(script_content)
+            for match in matches:
+                self._try_parse_json(match.group(1), extracted_data)
+
+    def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
+        """Try to parse JSON string and add to extracted data"""
+        try:
+            json_data = json.loads(json_str)
+
+            if isinstance(json_data, dict):
+                # Convert to string dict for Pydantic compliance
+                str_data = {}
+                for k, v in json_data.items():
+                    if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
+                        str_data[str(k)] = str(v)
+
+                if str_data:
+                    extracted_data.ssr_data.update(str_data)
+
+        except json.JSONDecodeError:
+            # Skip invalid JSON
+            pass
+
+    # ==========================================
+    # UTILITY METHODS
+    # ==========================================
+
+    def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
+        """Get statistics about the cleaning process"""
+        original_size = len(original_html)
+        cleaned_size = len(cleaned_html)
+
+        # Estimate token reduction (rough approximation)
+        original_tokens = original_size // 4  # Rough estimate: 4 chars per token
+        cleaned_tokens = cleaned_size // 4
+
+        size_reduction = original_size - cleaned_size
+        size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
+        token_savings = original_tokens - cleaned_tokens
+        token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
+
+        return HTMLCleaningStats(
+            original_size_bytes=original_size,
+            cleaned_size_bytes=cleaned_size,
+            size_reduction_bytes=size_reduction,
+            size_reduction_percent=size_reduction_percent,
+            estimated_original_tokens=original_tokens,
+            estimated_cleaned_tokens=cleaned_tokens,
+            estimated_token_savings=token_savings,
+            estimated_token_savings_percent=token_savings_percent
+        )
+
+    def update_config(self, **kwargs) -> None:
+        """Update configuration with new values"""
+        current_data = self.config.model_dump()
+        current_data.update(kwargs)
+        self.config = HTMLCleaningConfig.model_validate(current_data)
+
+        # Recompile patterns if needed
+        self._compile_patterns()
+
+
+# ==========================================
+# CONVENIENCE FUNCTIONS
+# ==========================================
+
+def get_html_manager(config: Optional[HTMLCleaningConfig] = None) -> HTMLManager:
+    """
+    Get an HTML manager instance
+
+    Args:
+        config: Optional HTML cleaning configuration
+
+    Returns:
+        Configured HTMLManager instance
+    """
+    return HTMLManager(config=config)
+
+
+async def quick_clean_html(html: str, **kwargs) -> str:
+    """
+    Quick HTML cleaning convenience function
+
+    Args:
+        html: Raw HTML content
+        **kwargs: Cleaning options
+
+    Returns:
+        Cleaned HTML
+    """
+    config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
+    config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
+
+    manager = get_html_manager(config)
+    return await manager.clean_html(html, **kwargs)
+
+
+def quick_clean_html_sync(html: str, **kwargs) -> str:
+    """
+    Quick synchronous HTML cleaning convenience function
+
+    Args:
+        html: Raw HTML content
+        **kwargs: Cleaning options
+
+    Returns:
+        Cleaned HTML
+    """
+    config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
+    config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
+
+    manager = get_html_manager(config)
+    return manager.clean_html_sync(html, **kwargs)