unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. unrealon/__init__.py +23 -21
  2. unrealon-1.1.0.dist-info/METADATA +164 -0
  3. unrealon-1.1.0.dist-info/RECORD +82 -0
  4. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
  5. unrealon-1.1.0.dist-info/entry_points.txt +9 -0
  6. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
  7. unrealon_bridge/__init__.py +114 -0
  8. unrealon_bridge/cli.py +316 -0
  9. unrealon_bridge/client/__init__.py +93 -0
  10. unrealon_bridge/client/base.py +78 -0
  11. unrealon_bridge/client/commands.py +89 -0
  12. unrealon_bridge/client/connection.py +90 -0
  13. unrealon_bridge/client/events.py +65 -0
  14. unrealon_bridge/client/health.py +38 -0
  15. unrealon_bridge/client/html_parser.py +146 -0
  16. unrealon_bridge/client/logging.py +139 -0
  17. unrealon_bridge/client/proxy.py +70 -0
  18. unrealon_bridge/client/scheduler.py +450 -0
  19. unrealon_bridge/client/session.py +70 -0
  20. unrealon_bridge/configs/__init__.py +14 -0
  21. unrealon_bridge/configs/bridge_config.py +212 -0
  22. unrealon_bridge/configs/bridge_config.yaml +39 -0
  23. unrealon_bridge/models/__init__.py +138 -0
  24. unrealon_bridge/models/base.py +28 -0
  25. unrealon_bridge/models/command.py +41 -0
  26. unrealon_bridge/models/events.py +40 -0
  27. unrealon_bridge/models/html_parser.py +79 -0
  28. unrealon_bridge/models/logging.py +55 -0
  29. unrealon_bridge/models/parser.py +63 -0
  30. unrealon_bridge/models/proxy.py +41 -0
  31. unrealon_bridge/models/requests.py +95 -0
  32. unrealon_bridge/models/responses.py +88 -0
  33. unrealon_bridge/models/scheduler.py +592 -0
  34. unrealon_bridge/models/session.py +28 -0
  35. unrealon_bridge/server/__init__.py +91 -0
  36. unrealon_bridge/server/base.py +171 -0
  37. unrealon_bridge/server/handlers/__init__.py +23 -0
  38. unrealon_bridge/server/handlers/command.py +110 -0
  39. unrealon_bridge/server/handlers/html_parser.py +139 -0
  40. unrealon_bridge/server/handlers/logging.py +95 -0
  41. unrealon_bridge/server/handlers/parser.py +95 -0
  42. unrealon_bridge/server/handlers/proxy.py +75 -0
  43. unrealon_bridge/server/handlers/scheduler.py +545 -0
  44. unrealon_bridge/server/handlers/session.py +66 -0
  45. unrealon_browser/__init__.py +61 -18
  46. unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
  47. unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
  48. unrealon_browser/{src/core → core}/browser_manager.py +2 -2
  49. unrealon_browser/{src/managers → managers}/captcha.py +1 -1
  50. unrealon_browser/{src/managers → managers}/cookies.py +1 -1
  51. unrealon_browser/managers/logger_bridge.py +231 -0
  52. unrealon_browser/{src/managers → managers}/profile.py +1 -1
  53. unrealon_driver/__init__.py +73 -19
  54. unrealon_driver/browser/__init__.py +8 -0
  55. unrealon_driver/browser/config.py +74 -0
  56. unrealon_driver/browser/manager.py +416 -0
  57. unrealon_driver/exceptions.py +28 -0
  58. unrealon_driver/parser/__init__.py +55 -0
  59. unrealon_driver/parser/cli_manager.py +141 -0
  60. unrealon_driver/parser/daemon_manager.py +227 -0
  61. unrealon_driver/parser/managers/__init__.py +46 -0
  62. unrealon_driver/parser/managers/browser.py +51 -0
  63. unrealon_driver/parser/managers/config.py +281 -0
  64. unrealon_driver/parser/managers/error.py +412 -0
  65. unrealon_driver/parser/managers/html.py +732 -0
  66. unrealon_driver/parser/managers/logging.py +609 -0
  67. unrealon_driver/parser/managers/result.py +321 -0
  68. unrealon_driver/parser/parser_manager.py +628 -0
  69. unrealon/sdk_config.py +0 -88
  70. unrealon-1.0.9.dist-info/METADATA +0 -810
  71. unrealon-1.0.9.dist-info/RECORD +0 -246
  72. unrealon_browser/pyproject.toml +0 -182
  73. unrealon_browser/src/__init__.py +0 -62
  74. unrealon_browser/src/managers/logger_bridge.py +0 -395
  75. unrealon_driver/README.md +0 -204
  76. unrealon_driver/pyproject.toml +0 -187
  77. unrealon_driver/src/__init__.py +0 -90
  78. unrealon_driver/src/cli/__init__.py +0 -10
  79. unrealon_driver/src/cli/main.py +0 -66
  80. unrealon_driver/src/cli/simple.py +0 -510
  81. unrealon_driver/src/config/__init__.py +0 -11
  82. unrealon_driver/src/config/auto_config.py +0 -478
  83. unrealon_driver/src/core/__init__.py +0 -18
  84. unrealon_driver/src/core/exceptions.py +0 -289
  85. unrealon_driver/src/core/parser.py +0 -638
  86. unrealon_driver/src/dto/__init__.py +0 -66
  87. unrealon_driver/src/dto/cli.py +0 -119
  88. unrealon_driver/src/dto/config.py +0 -18
  89. unrealon_driver/src/dto/events.py +0 -237
  90. unrealon_driver/src/dto/execution.py +0 -313
  91. unrealon_driver/src/dto/services.py +0 -311
  92. unrealon_driver/src/execution/__init__.py +0 -23
  93. unrealon_driver/src/execution/daemon_mode.py +0 -317
  94. unrealon_driver/src/execution/interactive_mode.py +0 -88
  95. unrealon_driver/src/execution/modes.py +0 -45
  96. unrealon_driver/src/execution/scheduled_mode.py +0 -209
  97. unrealon_driver/src/execution/test_mode.py +0 -250
  98. unrealon_driver/src/logging/__init__.py +0 -24
  99. unrealon_driver/src/logging/driver_logger.py +0 -512
  100. unrealon_driver/src/services/__init__.py +0 -24
  101. unrealon_driver/src/services/browser_service.py +0 -726
  102. unrealon_driver/src/services/llm/__init__.py +0 -15
  103. unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
  104. unrealon_driver/src/services/llm/llm.py +0 -195
  105. unrealon_driver/src/services/logger_service.py +0 -232
  106. unrealon_driver/src/services/metrics_service.py +0 -185
  107. unrealon_driver/src/services/scheduler_service.py +0 -489
  108. unrealon_driver/src/services/websocket_service.py +0 -362
  109. unrealon_driver/src/utils/__init__.py +0 -16
  110. unrealon_driver/src/utils/service_factory.py +0 -317
  111. unrealon_driver/src/utils/time_formatter.py +0 -338
  112. unrealon_llm/README.md +0 -44
  113. unrealon_llm/__init__.py +0 -26
  114. unrealon_llm/pyproject.toml +0 -154
  115. unrealon_llm/src/__init__.py +0 -228
  116. unrealon_llm/src/cli/__init__.py +0 -0
  117. unrealon_llm/src/core/__init__.py +0 -11
  118. unrealon_llm/src/core/smart_client.py +0 -438
  119. unrealon_llm/src/dto/__init__.py +0 -155
  120. unrealon_llm/src/dto/models/__init__.py +0 -0
  121. unrealon_llm/src/dto/models/config.py +0 -343
  122. unrealon_llm/src/dto/models/core.py +0 -328
  123. unrealon_llm/src/dto/models/enums.py +0 -123
  124. unrealon_llm/src/dto/models/html_analysis.py +0 -345
  125. unrealon_llm/src/dto/models/statistics.py +0 -473
  126. unrealon_llm/src/dto/models/translation.py +0 -383
  127. unrealon_llm/src/dto/models/type_conversion.py +0 -462
  128. unrealon_llm/src/dto/schemas/__init__.py +0 -0
  129. unrealon_llm/src/exceptions.py +0 -392
  130. unrealon_llm/src/llm_config/__init__.py +0 -20
  131. unrealon_llm/src/llm_config/logging_config.py +0 -178
  132. unrealon_llm/src/llm_logging/__init__.py +0 -42
  133. unrealon_llm/src/llm_logging/llm_events.py +0 -107
  134. unrealon_llm/src/llm_logging/llm_logger.py +0 -466
  135. unrealon_llm/src/managers/__init__.py +0 -15
  136. unrealon_llm/src/managers/cache_manager.py +0 -67
  137. unrealon_llm/src/managers/cost_manager.py +0 -107
  138. unrealon_llm/src/managers/request_manager.py +0 -298
  139. unrealon_llm/src/modules/__init__.py +0 -0
  140. unrealon_llm/src/modules/html_processor/__init__.py +0 -25
  141. unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
  142. unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
  143. unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
  144. unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
  145. unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
  146. unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
  147. unrealon_llm/src/modules/html_processor/processor.py +0 -102
  148. unrealon_llm/src/modules/llm/__init__.py +0 -0
  149. unrealon_llm/src/modules/translator/__init__.py +0 -0
  150. unrealon_llm/src/provider.py +0 -116
  151. unrealon_llm/src/utils/__init__.py +0 -95
  152. unrealon_llm/src/utils/common.py +0 -64
  153. unrealon_llm/src/utils/data_extractor.py +0 -188
  154. unrealon_llm/src/utils/html_cleaner.py +0 -767
  155. unrealon_llm/src/utils/language_detector.py +0 -308
  156. unrealon_llm/src/utils/models_cache.py +0 -592
  157. unrealon_llm/src/utils/smart_counter.py +0 -229
  158. unrealon_llm/src/utils/token_counter.py +0 -189
  159. unrealon_sdk/README.md +0 -25
  160. unrealon_sdk/__init__.py +0 -30
  161. unrealon_sdk/pyproject.toml +0 -231
  162. unrealon_sdk/src/__init__.py +0 -150
  163. unrealon_sdk/src/cli/__init__.py +0 -12
  164. unrealon_sdk/src/cli/commands/__init__.py +0 -22
  165. unrealon_sdk/src/cli/commands/benchmark.py +0 -42
  166. unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
  167. unrealon_sdk/src/cli/commands/health.py +0 -46
  168. unrealon_sdk/src/cli/commands/integration.py +0 -498
  169. unrealon_sdk/src/cli/commands/reports.py +0 -43
  170. unrealon_sdk/src/cli/commands/security.py +0 -36
  171. unrealon_sdk/src/cli/commands/server.py +0 -483
  172. unrealon_sdk/src/cli/commands/servers.py +0 -56
  173. unrealon_sdk/src/cli/commands/tests.py +0 -55
  174. unrealon_sdk/src/cli/main.py +0 -126
  175. unrealon_sdk/src/cli/utils/reporter.py +0 -519
  176. unrealon_sdk/src/clients/openapi.yaml +0 -3347
  177. unrealon_sdk/src/clients/python_http/__init__.py +0 -3
  178. unrealon_sdk/src/clients/python_http/api_config.py +0 -228
  179. unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
  180. unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
  181. unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
  182. unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
  183. unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
  184. unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
  185. unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
  186. unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
  187. unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
  188. unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
  189. unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
  190. unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
  191. unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
  192. unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
  193. unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
  194. unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
  195. unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
  196. unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
  197. unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
  198. unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
  199. unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
  200. unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
  201. unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
  202. unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
  203. unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
  204. unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
  205. unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
  206. unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
  207. unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
  208. unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
  209. unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
  210. unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
  211. unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
  212. unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
  213. unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
  214. unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
  215. unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
  216. unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
  217. unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
  218. unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
  219. unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
  220. unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
  221. unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
  222. unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
  223. unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
  224. unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
  225. unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
  226. unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
  227. unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
  228. unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
  229. unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
  230. unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
  231. unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
  232. unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
  233. unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
  234. unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
  235. unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
  236. unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
  237. unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
  238. unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
  239. unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
  240. unrealon_sdk/src/clients/python_websocket/client.py +0 -490
  241. unrealon_sdk/src/clients/python_websocket/events.py +0 -732
  242. unrealon_sdk/src/clients/python_websocket/example.py +0 -136
  243. unrealon_sdk/src/clients/python_websocket/types.py +0 -871
  244. unrealon_sdk/src/core/__init__.py +0 -64
  245. unrealon_sdk/src/core/client.py +0 -556
  246. unrealon_sdk/src/core/config.py +0 -465
  247. unrealon_sdk/src/core/exceptions.py +0 -239
  248. unrealon_sdk/src/core/metadata.py +0 -191
  249. unrealon_sdk/src/core/models.py +0 -142
  250. unrealon_sdk/src/core/types.py +0 -68
  251. unrealon_sdk/src/dto/__init__.py +0 -268
  252. unrealon_sdk/src/dto/authentication.py +0 -108
  253. unrealon_sdk/src/dto/cache.py +0 -208
  254. unrealon_sdk/src/dto/common.py +0 -19
  255. unrealon_sdk/src/dto/concurrency.py +0 -393
  256. unrealon_sdk/src/dto/events.py +0 -108
  257. unrealon_sdk/src/dto/health.py +0 -339
  258. unrealon_sdk/src/dto/load_balancing.py +0 -336
  259. unrealon_sdk/src/dto/logging.py +0 -230
  260. unrealon_sdk/src/dto/performance.py +0 -165
  261. unrealon_sdk/src/dto/rate_limiting.py +0 -295
  262. unrealon_sdk/src/dto/resource_pooling.py +0 -128
  263. unrealon_sdk/src/dto/structured_logging.py +0 -112
  264. unrealon_sdk/src/dto/task_scheduling.py +0 -121
  265. unrealon_sdk/src/dto/websocket.py +0 -55
  266. unrealon_sdk/src/enterprise/__init__.py +0 -59
  267. unrealon_sdk/src/enterprise/authentication.py +0 -401
  268. unrealon_sdk/src/enterprise/cache_manager.py +0 -578
  269. unrealon_sdk/src/enterprise/error_recovery.py +0 -494
  270. unrealon_sdk/src/enterprise/event_system.py +0 -549
  271. unrealon_sdk/src/enterprise/health_monitor.py +0 -747
  272. unrealon_sdk/src/enterprise/load_balancer.py +0 -964
  273. unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
  274. unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
  275. unrealon_sdk/src/enterprise/logging/development.py +0 -744
  276. unrealon_sdk/src/enterprise/logging/service.py +0 -410
  277. unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
  278. unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
  279. unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
  280. unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
  281. unrealon_sdk/src/enterprise/resource_pool.py +0 -763
  282. unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
  283. unrealon_sdk/src/internal/__init__.py +0 -10
  284. unrealon_sdk/src/internal/command_router.py +0 -497
  285. unrealon_sdk/src/internal/connection_manager.py +0 -397
  286. unrealon_sdk/src/internal/http_client.py +0 -446
  287. unrealon_sdk/src/internal/websocket_client.py +0 -420
  288. unrealon_sdk/src/provider.py +0 -471
  289. unrealon_sdk/src/utils.py +0 -234
  290. /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
  291. /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
  292. /unrealon_browser/{src/cli → cli}/main.py +0 -0
  293. /unrealon_browser/{src/core → core}/__init__.py +0 -0
  294. /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
  295. /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
  296. /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
  297. /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
  298. /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
  299. /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
  300. /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
  301. /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
  302. /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
@@ -0,0 +1,732 @@
1
+ """
2
+ HTML Manager - Smart HTML processing and cleaning with Pydantic v2
3
+
4
+ Strict compliance with CRITICAL_REQUIREMENTS.md:
5
+ - No Dict[str, Any] usage
6
+ - Complete type annotations
7
+ - Pydantic v2 models everywhere
8
+ - Custom exception hierarchy
9
+ """
10
+
11
+ import json
12
+ import re
13
+ from typing import Optional, List, Union
14
+ from pathlib import Path
15
+ from pydantic import BaseModel, Field, ConfigDict, field_validator
16
+ import asyncio
17
+ import concurrent.futures
18
+
19
+ from bs4 import BeautifulSoup, Comment
20
+
21
+ from unrealon_rpc.logging import get_logger
22
+
23
+
24
class HTMLCleaningConfig(BaseModel):
    """Options that steer how HTML is cleaned.

    The model re-validates values on assignment and rejects unknown
    fields (see ``model_config``). Numeric limits carry lower/upper
    bounds enforced by pydantic.
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # --- Cleaning modes -------------------------------------------------
    aggressive_cleaning: bool = Field(default=True, description="Enable aggressive cleaning")
    preserve_js_data: bool = Field(default=True, description="Preserve JavaScript data during cleaning")

    # --- Content preservation -------------------------------------------
    preserve_images: bool = Field(default=False, description="Preserve image tags")
    preserve_links: bool = Field(default=True, description="Preserve link tags")
    preserve_forms: bool = Field(default=False, description="Preserve form elements")

    # --- Size limits (all bounded) --------------------------------------
    max_html_size: int = Field(
        default=1000000, ge=1000, le=10000000,
        description="Maximum HTML size in characters",
    )
    max_text_length: int = Field(
        default=300, ge=50, le=1000,
        description="Maximum text content length per element",
    )
    max_url_length: int = Field(
        default=500, ge=100, le=2000,
        description="Maximum URL length",
    )

    # --- Noise removal --------------------------------------------------
    remove_comments: bool = Field(default=True, description="Remove HTML comments")
    remove_scripts: bool = Field(default=True, description="Remove script tags")
    remove_styles: bool = Field(default=True, description="Remove style tags")
    remove_tracking: bool = Field(default=True, description="Remove tracking URLs and attributes")

    # --- Whitespace handling --------------------------------------------
    normalize_whitespace: bool = Field(default=True, description="Normalize whitespace")
    remove_empty_elements: bool = Field(default=True, description="Remove empty elements")

    # --- Custom selectors -----------------------------------------------
    # A factory is used so every instance gets its own list object.
    noise_selectors: List[str] = Field(
        default_factory=lambda: [
            '[class*="nav"]',
            '[class*="menu"]',
            '[class*="sidebar"]',
            '[class*="footer"]',
            '[class*="header"]',
            '[class*="ads"]',
            '[class*="popup"]',
            '[class*="modal"]',
            '[class*="cookie"]',
        ],
        description="CSS selectors for noise elements to remove",
    )
112
+
113
+
114
class HTMLCleaningStats(BaseModel):
    """Before/after size figures for one cleaning run.

    Every field is constrained to be non-negative; the two percentage
    fields are additionally capped at 100.
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # Byte sizes
    original_size_bytes: int = Field(ge=0)
    cleaned_size_bytes: int = Field(ge=0)
    size_reduction_bytes: int = Field(ge=0)
    size_reduction_percent: float = Field(ge=0.0, le=100.0)

    # Token estimates
    estimated_original_tokens: int = Field(ge=0)
    estimated_cleaned_tokens: int = Field(ge=0)
    estimated_token_savings: int = Field(ge=0)
    estimated_token_savings_percent: float = Field(ge=0.0, le=100.0)
129
+
130
+
131
class ExtractedJSData(BaseModel):
    """Container for JavaScript data pulled out of a page.

    All three fields default to empty containers; each instance gets
    its own container via ``default_factory``.
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # String-to-string mapping (presumably server-side-rendered state,
    # judging by the name -- confirm against the extractor).
    ssr_data: dict[str, str] = Field(default_factory=dict)
    # List of string-to-string mappings.
    structured_data: List[dict[str, str]] = Field(default_factory=list)
    # List of string-to-string mappings.
    raw_extracts: List[dict[str, str]] = Field(default_factory=list)
141
+
142
+
143
class HTMLManagerError(Exception):
    """Base exception for HTML manager.

    Attributes:
        message: Human-readable description; also the only value passed
            to ``Exception.__init__``, so ``str(err)`` is the message.
        operation: Name of the operation that failed.
        details: Extra string key/value context (empty dict when omitted).
    """

    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
        self.message = message
        self.operation = operation
        self.details = details or {}
        super().__init__(message)

    def __reduce__(self):
        """Describe how to rebuild this exception for ``copy`` / ``pickle``.

        The inherited implementation rebuilds from ``self.args``, which
        holds only ``message``; that call fails because ``operation`` is a
        required constructor argument. Supplying the full argument tuple
        keeps the exception copyable and picklable (e.g. when it crosses
        an executor boundary).
        """
        return (
            type(self),
            (self.message, self.operation, self.details),
            dict(self.__dict__),
        )
150
+
151
+
152
class HTMLParsingError(HTMLManagerError):
    """Signals that HTML parsing failed; adds nothing beyond the base class."""
155
+
156
+
157
class HTMLCleaningError(HTMLManagerError):
    """Signals that HTML cleaning failed; adds nothing beyond the base class."""
160
+
161
+
162
+ class HTMLManager:
163
+ """
164
+ 🧹 HTML Manager - Smart HTML processing and cleaning
165
+
166
+ Features:
167
+ - LLM Optimized: Removes noise, preserves valuable content
168
+ - Token Efficient: Reduces HTML size for cost-effective LLM analysis
169
+ - Smart Extraction: Preserves JavaScript data and structured content
170
+ - Performance: Fast cleaning with configurable aggressiveness
171
+ - Safe: Handles malformed HTML gracefully
172
+ - Type Safety: Full Pydantic v2 compliance
173
+ """
174
+
175
def __init__(self, config: Optional[HTMLCleaningConfig] = None):
    """Set up the manager.

    Args:
        config: Cleaning options; a default ``HTMLCleaningConfig`` is
            created when omitted.
    """
    self.logger = get_logger()
    self.config = config or HTMLCleaningConfig()

    # Regexes are built once here so the cleaning passes can reuse them.
    self._compile_patterns()
181
+
182
+ def _compile_patterns(self) -> None:
183
+ """Compile regex patterns for performance"""
184
+ # Tracking URL patterns
185
+ self.tracking_url_patterns = [
186
+ re.compile(r'https://aax-[^\s"]{200,}', re.IGNORECASE),
187
+ re.compile(r'https://[^\s"]*tracking[^\s"]{100,}', re.IGNORECASE),
188
+ re.compile(r'https://[^\s"]*analytics[^\s"]{100,}', re.IGNORECASE),
189
+ re.compile(r'https://[^\s"]*gtm[^\s"]{100,}', re.IGNORECASE),
190
+ ]
191
+
192
+ # Base64 patterns
193
+ self.base64_patterns = [
194
+ re.compile(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}'),
195
+ re.compile(r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
196
+ re.compile(r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
197
+ ]
198
+
199
+ # JavaScript data patterns
200
+ self.js_data_patterns = [
201
+ re.compile(r'__NEXT_DATA__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
202
+ re.compile(r'__NUXT__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
203
+ re.compile(r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
204
+ re.compile(r'dataLayer\s*=\s*(\[.+?\]);?', re.DOTALL | re.IGNORECASE),
205
+ ]
206
+
207
+ # ==========================================
208
+ # MAIN CLEANING METHODS
209
+ # ==========================================
210
+
211
async def clean_html(
    self,
    html: str,
    aggressive: Optional[bool] = None,
    preserve_js_data: Optional[bool] = None
) -> str:
    """
    Clean HTML content for LLM analysis

    Args:
        html: Raw HTML content
        aggressive: Override aggressive cleaning setting
        preserve_js_data: Override JS data preservation setting

    Returns:
        Cleaned HTML optimized for LLM; an empty string when the input
        is empty or whitespace-only.

    Raises:
        HTMLCleaningError: If any step of parsing or cleaning fails.
    """
    if not html or not html.strip():
        return ""

    # Use config defaults or overrides
    aggressive_cleaning = aggressive if aggressive is not None else self.config.aggressive_cleaning
    preserve_js = preserve_js_data if preserve_js_data is not None else self.config.preserve_js_data

    # Capture the caller's input size before any truncation so that the
    # reduction log and the error details describe the real input, in
    # line with the "Cleaning HTML" message below.
    original_size = len(html)

    try:
        self.logger.info(f"Cleaning HTML: {original_size} characters")

        # Check size limits; oversized input is cut, not rejected
        if original_size > self.config.max_html_size:
            self.logger.warning(f"HTML size ({original_size}) exceeds limit ({self.config.max_html_size})")
            html = html[:self.config.max_html_size]

        # Parse HTML
        soup = BeautifulSoup(html, 'html.parser')

        # Extract JavaScript data before cleaning removes script tags.
        # NOTE(review): the extracted data is not used by this method,
        # so preserve_js_data currently has no effect on the returned
        # HTML -- confirm whether it should be re-attached to the output.
        if preserve_js:
            self._extract_js_data(soup)

        # Apply cleaning steps
        if aggressive_cleaning:
            self._aggressive_cleaning(soup)
        else:
            self._standard_cleaning(soup)

        # Serialize and run the final string-level cleanup
        cleaned_html = self._final_cleanup(str(soup))

        # Log results (original_size is > 0: empty input returned early)
        cleaned_size = len(cleaned_html)
        reduction = (original_size - cleaned_size) / original_size * 100

        self.logger.info(
            f"HTML cleaned: {original_size} → {cleaned_size} chars "
            f"({reduction:.1f}% reduction)"
        )

        return cleaned_html

    except Exception as e:
        self.logger.error(f"HTML cleaning failed: {e}")
        raise HTMLCleaningError(
            message=f"Failed to clean HTML: {e}",
            operation="clean_html",
            details={"html_size": str(original_size)}
        ) from e
282
+
283
def clean_html_sync(self, html: str, **kwargs) -> str:
    """
    Synchronous HTML cleaning

    Runs ``clean_html`` to completion and returns its result. When the
    calling thread already has a running event loop, the coroutine is
    executed on a worker thread with its own loop (the caller blocks
    until it finishes).

    Args:
        html: Raw HTML content
        **kwargs: Cleaning options forwarded to ``clean_html``

    Returns:
        Cleaned HTML
    """
    # Only the loop probe is guarded: a RuntimeError raised by the
    # cleaning itself must propagate instead of being mistaken for
    # "no loop running" and triggering a second run.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    if not loop_running:
        # No event loop in this thread, safe to use asyncio.run directly
        return asyncio.run(self.clean_html(html, **kwargs))

    # asyncio.run() refuses to start inside a running loop, so give the
    # coroutine a fresh loop on a separate thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(asyncio.run, self.clean_html(html, **kwargs))
        return future.result()
304
+
305
async def parse_and_clean_html(
    self,
    html: str,
    schema: Optional[dict[str, str]] = None,
    instructions: Optional[str] = None,
    **kwargs
) -> dict[str, str]:
    """
    Clean HTML and bundle the result with size statistics

    Args:
        html: Raw HTML content
        schema: Optional data schema; included as ``str(schema)`` when truthy
        instructions: Optional parsing instructions; included when truthy
        **kwargs: Forwarded to ``clean_html``

    Returns:
        String-valued dictionary with the cleaned HTML, original and
        cleaned sizes, reduction percent (one decimal) and estimated
        token savings, plus ``schema`` / ``instructions`` when given

    Raises:
        HTMLCleaningError: If cleaning or statistics gathering fails.
    """
    try:
        cleaned = await self.clean_html(html, **kwargs)
        summary = self.get_cleaning_stats(html, cleaned)

        payload = dict(
            cleaned_html=cleaned,
            original_size=str(summary.original_size_bytes),
            cleaned_size=str(summary.cleaned_size_bytes),
            reduction_percent=f"{summary.size_reduction_percent:.1f}",
            estimated_token_savings=str(summary.estimated_token_savings),
        )

        # Optional extras are only present when the caller supplied them.
        if schema:
            payload["schema"] = str(schema)
        if instructions:
            payload["instructions"] = instructions

        return payload

    except Exception as exc:
        # Any failure, including one already raised by clean_html, is
        # reported under this operation name.
        raise HTMLCleaningError(
            message=f"Failed to parse and clean HTML: {exc}",
            operation="parse_and_clean_html"
        ) from exc
351
+
352
+ # ==========================================
353
+ # CLEANING IMPLEMENTATION
354
+ # ==========================================
355
+
356
+ def _standard_cleaning(self, soup: BeautifulSoup) -> None:
357
+ """Apply standard cleaning"""
358
+ # Remove noise elements
359
+ self._remove_noise_elements(soup)
360
+
361
+ # Clean attributes
362
+ self._clean_attributes(soup)
363
+
364
+ # Remove comments
365
+ if self.config.remove_comments:
366
+ self._remove_comments(soup)
367
+
368
+ # Normalize whitespace
369
+ if self.config.normalize_whitespace:
370
+ self._normalize_whitespace(soup)
371
+
372
+ def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
373
+ """Apply aggressive cleaning"""
374
+ # Standard cleaning first
375
+ self._standard_cleaning(soup)
376
+
377
+ # Remove noise selectors
378
+ self._remove_noise_selectors(soup)
379
+
380
+ # Clean tracking URLs
381
+ if self.config.remove_tracking:
382
+ self._clean_tracking_urls(soup)
383
+
384
+ # Clean base64 data
385
+ self._clean_base64_data(soup)
386
+
387
+ # Truncate long URLs
388
+ self._truncate_long_urls(soup)
389
+
390
+ # Remove long attributes
391
+ self._remove_long_attributes(soup)
392
+
393
+ # Truncate long text
394
+ self._truncate_long_text(soup)
395
+
396
+ def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
397
+ """Remove noise HTML elements"""
398
+ # Define noise tags
399
+ noise_tags = {
400
+ 'meta', 'link', 'base', 'title', 'head', 'noscript',
401
+ 'iframe', 'embed', 'object', 'svg', 'canvas',
402
+ 'audio', 'video', 'source', 'track', 'area', 'map', 'param'
403
+ }
404
+
405
+ # Add conditional tags
406
+ if self.config.remove_scripts:
407
+ noise_tags.add('script')
408
+ if self.config.remove_styles:
409
+ noise_tags.add('style')
410
+ if not self.config.preserve_forms:
411
+ noise_tags.update({'form', 'input', 'button', 'select', 'textarea', 'fieldset', 'legend'})
412
+
413
+ # Remove noise tags
414
+ for tag_name in noise_tags:
415
+ for tag in soup.find_all(tag_name):
416
+ tag.decompose()
417
+
418
+ # Remove empty elements
419
+ if self.config.remove_empty_elements:
420
+ for tag in soup.find_all(['div', 'span', 'p']):
421
+ if not tag.get_text(strip=True) and not tag.find_all():
422
+ tag.decompose()
423
+
424
+ def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
425
+ """Remove elements matching noise selectors"""
426
+ for selector in self.config.noise_selectors:
427
+ try:
428
+ elements = soup.select(selector)
429
+ for element in elements:
430
+ element.decompose()
431
+ except Exception:
432
+ # Skip invalid selectors
433
+ continue
434
+
435
def _clean_attributes(self, soup: BeautifulSoup) -> None:
    """Delete presentational, event-handler and state attributes from all tags.

    An attribute is deleted when its name is in the noise set and not in the
    keep set; every other attribute is left untouched.

    Args:
        soup: Parsed document; modified in place.
    """
    # Names that must never be deleted
    protected = {
        'id', 'class', 'href', 'src', 'alt', 'title',
        'data-testid', 'data-test', 'data-cy',
        'aria-label', 'aria-labelledby', 'aria-describedby', 'role'
    }

    # Names that are deleted when present
    unwanted = {
        'style', 'onclick', 'onload', 'onchange', 'onmouseover',
        'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
        'onerror', 'onabort', 'autocomplete', 'autofocus',
        'checked', 'defer', 'disabled', 'hidden', 'loop',
        'multiple', 'muted', 'open', 'readonly', 'required',
        'tabindex', 'translate', 'draggable', 'contenteditable'
    }

    for node in soup.find_all(True):
        if not hasattr(node, 'attrs'):
            continue
        # Snapshot the names first so the dict is not mutated while iterated
        doomed = [
            name for name in list(node.attrs.keys())
            if name not in protected and name in unwanted
        ]
        for name in doomed:
            del node.attrs[name]
461
+
462
def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
    """Replace link and image URLs that match a tracking pattern.

    ``href`` values of ``<a>`` tags are replaced by a marker fragment and
    ``src`` values of ``<img>`` tags by a 1x1 SVG data URI when any entry of
    ``self.tracking_url_patterns`` matches at the start of the value.

    Args:
        soup: Parsed document; modified in place.
    """
    trackers = self.tracking_url_patterns

    # Anchors
    for anchor in soup.find_all(['a'], href=True):
        target = anchor.get('href', '')
        if target and any(regex.match(target) for regex in trackers):
            anchor['href'] = '#tracking-url-removed'

    # Images
    for image in soup.find_all(['img'], src=True):
        source = image.get('src', '')
        if source and any(regex.match(source) for regex in trackers):
            image['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
481
+
482
def _clean_base64_data(self, soup: BeautifulSoup) -> None:
    """Swap image sources that match a base64 pattern for a 1x1 SVG data URI.

    An ``<img>`` ``src`` is replaced when any entry of ``self.base64_patterns``
    is found anywhere in the value.

    Args:
        soup: Parsed document; modified in place.
    """
    for image in soup.find_all(['img'], src=True):
        source = image.get('src', '')
        if not source:
            continue
        if any(regex.search(source) for regex in self.base64_patterns):
            image['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
491
+
492
def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
    """Cut ``href``/``src`` values that exceed ``config.max_url_length``.

    Over-long string values are shortened to the limit and suffixed with
    ``...truncated``. Image sources starting with ``data:`` are left alone.

    Args:
        soup: Parsed document; modified in place.
    """
    limit = self.config.max_url_length

    for anchor in soup.find_all(['a'], href=True):
        target = anchor.get('href', '')
        if not isinstance(target, str) or len(target) <= limit:
            continue
        anchor['href'] = target[:limit] + '...truncated'

    for image in soup.find_all(['img'], src=True):
        source = image.get('src', '')
        if not isinstance(source, str) or len(source) <= limit:
            continue
        if source.startswith('data:'):
            continue
        image['src'] = source[:limit] + '...truncated'
505
+
506
def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
    """Delete attributes that are very long or look tracking-related.

    An attribute is deleted when its value is a string longer than 800
    characters, or when its lower-cased name contains ``tracking``,
    ``analytics``, ``gtm`` or ``pixel``.

    Args:
        soup: Parsed document; modified in place.
    """
    markers = ['tracking', 'analytics', 'gtm', 'pixel']

    for node in soup.find_all():
        # Collect first, delete afterwards: the dict must not change size
        # while it is being iterated.
        doomed = []
        for name, content in node.attrs.items():
            oversized = isinstance(content, str) and len(content) > 800
            if oversized or any(marker in name.lower() for marker in markers):
                doomed.append(name)

        for name in doomed:
            del node.attrs[name]
519
+
520
def _truncate_long_text(self, soup: BeautifulSoup) -> None:
    """Truncate string nodes longer than ``config.max_text_length``.

    Strings whose parent is ``script`` or ``style`` are skipped. Each other
    string is stripped; if the stripped text exceeds the limit the node is
    replaced by the first ``max_text_length`` characters followed by ``...``.

    Args:
        soup: Parsed document; modified in place.
    """
    max_length = self.config.max_text_length

    # ``string=`` is the current spelling of the legacy ``text=`` keyword and
    # matches what ``_remove_comments`` already uses.
    for element in soup.find_all(string=True):
        if element.parent.name in ['script', 'style']:
            continue
        text_content = str(element).strip()
        if text_content and len(text_content) > max_length:
            truncated_text = text_content[:max_length] + '...'
            # Rebuild with the node's own class so a truncated comment or
            # CDATA section does not turn into visible page text.
            element.replace_with(type(element)(truncated_text))
530
+
531
def _remove_comments(self, soup: BeautifulSoup) -> None:
    """Detach every HTML comment node from ``soup``.

    Args:
        soup: Parsed document; modified in place.
    """
    comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
    for node in comment_nodes:
        node.extract()
535
+
536
def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
    """Collapse runs of whitespace inside string nodes.

    Strings whose parent is ``script`` or ``style`` are skipped. In every
    other string, runs of three or more spaces become one space, runs of
    three or more newlines become two newlines, and runs of tabs become one
    space.

    Args:
        soup: Parsed document; modified in place.
    """
    # ``string=`` is the current spelling of the legacy ``text=`` keyword and
    # matches what ``_remove_comments`` already uses.
    for element in soup.find_all(string=True):
        if element.parent.name in ['script', 'style']:
            continue

        original_text = str(element)
        # Collapse long runs of spaces
        cleaned_text = re.sub(r' {3,}', ' ', original_text)
        # Allow at most two consecutive newlines
        cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
        # Tabs become a single space
        cleaned_text = re.sub(r'\t+', ' ', cleaned_text)

        if cleaned_text == original_text:
            # Nothing to do; leaving the node alone also keeps special
            # string nodes (doctype, comments, CDATA) intact.
            continue

        # Rebuild with the node's own class so a comment or doctype is not
        # silently converted into plain page text.
        element.replace_with(type(element)(cleaned_text))
547
+
548
+ def _final_cleanup(self, html: str) -> str:
549
+ """Final cleanup and optimization"""
550
+ # Remove empty attributes
551
+ html = re.sub(r'\s+\w+=""', '', html)
552
+
553
+ # Remove extra spaces in attributes
554
+ html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
555
+
556
+ # Normalize quotes
557
+ html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
558
+
559
+ # Remove trailing spaces before closing tags
560
+ html = re.sub(r'\s+(/?>)', r'\1', html)
561
+
562
+ # Advanced whitespace cleanup
563
+ html = self._advanced_whitespace_cleanup(html)
564
+
565
+ return html.strip()
566
+
567
+ def _advanced_whitespace_cleanup(self, html: str) -> str:
568
+ """Advanced whitespace cleanup"""
569
+ # Remove excessive spaces
570
+ html = re.sub(r' {3,}', ' ', html)
571
+
572
+ # Remove excessive newlines
573
+ html = re.sub(r'\n{3,}', '\n\n', html)
574
+
575
+ # Clean space between tags
576
+ html = re.sub(r'>\s{2,}<', '> <', html)
577
+
578
+ return html
579
+
580
+ # ==========================================
581
+ # JAVASCRIPT DATA EXTRACTION
582
+ # ==========================================
583
+
584
def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
    """Collect data embedded in the document's ``<script>`` tags.

    Scripts without a single string child, or whose stripped content is
    shorter than 10 characters, are skipped. ``application/ld+json``
    scripts that parse as JSON are stored in ``structured_data`` with keys
    and values converted to strings; a top-level object yields one entry and
    a top-level array yields one entry per object it contains. All other
    scripts (and JSON-LD scripts that fail to parse) are handed to
    ``_extract_with_patterns``.

    Args:
        soup: Parsed document to read scripts from.

    Returns:
        The populated ``ExtractedJSData`` instance.
    """
    extracted_data = ExtractedJSData()

    for script in soup.find_all('script'):
        if not script.string:
            continue

        script_content = script.string.strip()

        # Skip empty / trivial scripts
        if len(script_content) < 10:
            continue

        # JSON-LD structured data
        if script.get('type') == 'application/ld+json':
            try:
                json_data = json.loads(script_content)
            except json.JSONDecodeError:
                # Not valid JSON: fall through to pattern extraction
                pass
            else:
                # The payload may be one object or an array of objects;
                # anything else has no key/value pairs to record.
                if isinstance(json_data, dict):
                    documents = [json_data]
                elif isinstance(json_data, list):
                    documents = [item for item in json_data if isinstance(item, dict)]
                else:
                    documents = []

                for document in documents:
                    # Convert to string dict for Pydantic compliance
                    str_data = {str(k): str(v) for k, v in document.items()}
                    extracted_data.structured_data.append(str_data)
                continue

        # Extract data using patterns
        self._extract_with_patterns(script_content, extracted_data)

    return extracted_data
616
+
617
+ def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
618
+ """Extract data using compiled regex patterns"""
619
+ for pattern in self.js_data_patterns:
620
+ matches = pattern.finditer(script_content)
621
+ for match in matches:
622
+ self._try_parse_json(match.group(1), extracted_data)
623
+
624
+ def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
625
+ """Try to parse JSON string and add to extracted data"""
626
+ try:
627
+ json_data = json.loads(json_str)
628
+
629
+ if isinstance(json_data, dict):
630
+ # Convert to string dict for Pydantic compliance
631
+ str_data = {}
632
+ for k, v in json_data.items():
633
+ if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
634
+ str_data[str(k)] = str(v)
635
+
636
+ if str_data:
637
+ extracted_data.ssr_data.update(str_data)
638
+
639
+ except json.JSONDecodeError:
640
+ # Skip invalid JSON
641
+ pass
642
+
643
+ # ==========================================
644
+ # UTILITY METHODS
645
+ # ==========================================
646
+
647
def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
    """Summarise how much smaller the cleaned HTML is than the original.

    Sizes are taken as ``len()`` of the two strings, and token counts are
    estimated as size divided by four (integer division). Percentages are
    ``0.0`` when the corresponding original quantity is zero.

    Args:
        original_html: HTML before cleaning.
        cleaned_html: HTML after cleaning.

    Returns:
        An ``HTMLCleaningStats`` with sizes, estimated tokens and reductions.
    """
    def _percent(part, whole):
        # Guard against dividing by an empty original
        return (part / whole * 100) if whole > 0 else 0.0

    size_before, size_after = len(original_html), len(cleaned_html)

    # Rough estimate: 4 chars per token
    tokens_before, tokens_after = size_before // 4, size_after // 4

    size_saved = size_before - size_after
    tokens_saved = tokens_before - tokens_after

    return HTMLCleaningStats(
        original_size_bytes=size_before,
        cleaned_size_bytes=size_after,
        size_reduction_bytes=size_saved,
        size_reduction_percent=_percent(size_saved, size_before),
        estimated_original_tokens=tokens_before,
        estimated_cleaned_tokens=tokens_after,
        estimated_token_savings=tokens_saved,
        estimated_token_savings_percent=_percent(tokens_saved, tokens_before)
    )
671
+
672
def update_config(self, **kwargs) -> None:
    """Replace ``self.config`` with a copy that has ``kwargs`` applied.

    The current configuration is dumped to a dict, overlaid with ``kwargs``
    and re-validated through ``HTMLCleaningConfig.model_validate``; the
    patterns are then recompiled.

    Args:
        **kwargs: Field values overriding the current configuration.
    """
    merged = {**self.config.model_dump(), **kwargs}
    self.config = HTMLCleaningConfig.model_validate(merged)

    # Compiled patterns may depend on the configuration
    self._compile_patterns()
680
+
681
+
682
+ # ==========================================
683
+ # CONVENIENCE FUNCTIONS
684
+ # ==========================================
685
+
686
def get_html_manager(config: Optional[HTMLCleaningConfig] = None) -> HTMLManager:
    """
    Build a new HTMLManager.

    Args:
        config: Optional HTML cleaning configuration, forwarded as-is

    Returns:
        A freshly constructed HTMLManager
    """
    manager = HTMLManager(config=config)
    return manager
697
+
698
+
699
async def quick_clean_html(html: str, **kwargs) -> str:
    """
    Clean HTML in one call using a throwaway manager.

    Keyword arguments that name an ``HTMLCleaningConfig`` field are used to
    build the configuration; all keyword arguments are also forwarded to
    ``clean_html``.

    Args:
        html: Raw HTML content
        **kwargs: Cleaning options

    Returns:
        Cleaned HTML
    """
    recognised = {}
    for option, value in kwargs.items():
        if option in HTMLCleaningConfig.model_fields:
            recognised[option] = value

    # No recognised options: pass None for the config
    config = None
    if recognised:
        config = HTMLCleaningConfig.model_validate(recognised)

    return await get_html_manager(config).clean_html(html, **kwargs)
715
+
716
+
717
def quick_clean_html_sync(html: str, **kwargs) -> str:
    """
    Clean HTML synchronously in one call using a throwaway manager.

    Keyword arguments that name an ``HTMLCleaningConfig`` field are used to
    build the configuration; all keyword arguments are also forwarded to
    ``clean_html_sync``.

    Args:
        html: Raw HTML content
        **kwargs: Cleaning options

    Returns:
        Cleaned HTML
    """
    known_fields = HTMLCleaningConfig.model_fields
    overrides = {name: kwargs[name] for name in kwargs if name in known_fields}

    if overrides:
        config = HTMLCleaningConfig.model_validate(overrides)
    else:
        # No recognised options: pass None for the config
        config = None

    cleaner = get_html_manager(config)
    return cleaner.clean_html_sync(html, **kwargs)