unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. unrealon/__init__.py +23 -21
  2. unrealon-1.1.0.dist-info/METADATA +164 -0
  3. unrealon-1.1.0.dist-info/RECORD +82 -0
  4. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
  5. unrealon-1.1.0.dist-info/entry_points.txt +9 -0
  6. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
  7. unrealon_bridge/__init__.py +114 -0
  8. unrealon_bridge/cli.py +316 -0
  9. unrealon_bridge/client/__init__.py +93 -0
  10. unrealon_bridge/client/base.py +78 -0
  11. unrealon_bridge/client/commands.py +89 -0
  12. unrealon_bridge/client/connection.py +90 -0
  13. unrealon_bridge/client/events.py +65 -0
  14. unrealon_bridge/client/health.py +38 -0
  15. unrealon_bridge/client/html_parser.py +146 -0
  16. unrealon_bridge/client/logging.py +139 -0
  17. unrealon_bridge/client/proxy.py +70 -0
  18. unrealon_bridge/client/scheduler.py +450 -0
  19. unrealon_bridge/client/session.py +70 -0
  20. unrealon_bridge/configs/__init__.py +14 -0
  21. unrealon_bridge/configs/bridge_config.py +212 -0
  22. unrealon_bridge/configs/bridge_config.yaml +39 -0
  23. unrealon_bridge/models/__init__.py +138 -0
  24. unrealon_bridge/models/base.py +28 -0
  25. unrealon_bridge/models/command.py +41 -0
  26. unrealon_bridge/models/events.py +40 -0
  27. unrealon_bridge/models/html_parser.py +79 -0
  28. unrealon_bridge/models/logging.py +55 -0
  29. unrealon_bridge/models/parser.py +63 -0
  30. unrealon_bridge/models/proxy.py +41 -0
  31. unrealon_bridge/models/requests.py +95 -0
  32. unrealon_bridge/models/responses.py +88 -0
  33. unrealon_bridge/models/scheduler.py +592 -0
  34. unrealon_bridge/models/session.py +28 -0
  35. unrealon_bridge/server/__init__.py +91 -0
  36. unrealon_bridge/server/base.py +171 -0
  37. unrealon_bridge/server/handlers/__init__.py +23 -0
  38. unrealon_bridge/server/handlers/command.py +110 -0
  39. unrealon_bridge/server/handlers/html_parser.py +139 -0
  40. unrealon_bridge/server/handlers/logging.py +95 -0
  41. unrealon_bridge/server/handlers/parser.py +95 -0
  42. unrealon_bridge/server/handlers/proxy.py +75 -0
  43. unrealon_bridge/server/handlers/scheduler.py +545 -0
  44. unrealon_bridge/server/handlers/session.py +66 -0
  45. unrealon_browser/__init__.py +61 -18
  46. unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
  47. unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
  48. unrealon_browser/{src/core → core}/browser_manager.py +2 -2
  49. unrealon_browser/{src/managers → managers}/captcha.py +1 -1
  50. unrealon_browser/{src/managers → managers}/cookies.py +1 -1
  51. unrealon_browser/managers/logger_bridge.py +231 -0
  52. unrealon_browser/{src/managers → managers}/profile.py +1 -1
  53. unrealon_driver/__init__.py +73 -19
  54. unrealon_driver/browser/__init__.py +8 -0
  55. unrealon_driver/browser/config.py +74 -0
  56. unrealon_driver/browser/manager.py +416 -0
  57. unrealon_driver/exceptions.py +28 -0
  58. unrealon_driver/parser/__init__.py +55 -0
  59. unrealon_driver/parser/cli_manager.py +141 -0
  60. unrealon_driver/parser/daemon_manager.py +227 -0
  61. unrealon_driver/parser/managers/__init__.py +46 -0
  62. unrealon_driver/parser/managers/browser.py +51 -0
  63. unrealon_driver/parser/managers/config.py +281 -0
  64. unrealon_driver/parser/managers/error.py +412 -0
  65. unrealon_driver/parser/managers/html.py +732 -0
  66. unrealon_driver/parser/managers/logging.py +609 -0
  67. unrealon_driver/parser/managers/result.py +321 -0
  68. unrealon_driver/parser/parser_manager.py +628 -0
  69. unrealon/sdk_config.py +0 -88
  70. unrealon-1.0.9.dist-info/METADATA +0 -810
  71. unrealon-1.0.9.dist-info/RECORD +0 -246
  72. unrealon_browser/pyproject.toml +0 -182
  73. unrealon_browser/src/__init__.py +0 -62
  74. unrealon_browser/src/managers/logger_bridge.py +0 -395
  75. unrealon_driver/README.md +0 -204
  76. unrealon_driver/pyproject.toml +0 -187
  77. unrealon_driver/src/__init__.py +0 -90
  78. unrealon_driver/src/cli/__init__.py +0 -10
  79. unrealon_driver/src/cli/main.py +0 -66
  80. unrealon_driver/src/cli/simple.py +0 -510
  81. unrealon_driver/src/config/__init__.py +0 -11
  82. unrealon_driver/src/config/auto_config.py +0 -478
  83. unrealon_driver/src/core/__init__.py +0 -18
  84. unrealon_driver/src/core/exceptions.py +0 -289
  85. unrealon_driver/src/core/parser.py +0 -638
  86. unrealon_driver/src/dto/__init__.py +0 -66
  87. unrealon_driver/src/dto/cli.py +0 -119
  88. unrealon_driver/src/dto/config.py +0 -18
  89. unrealon_driver/src/dto/events.py +0 -237
  90. unrealon_driver/src/dto/execution.py +0 -313
  91. unrealon_driver/src/dto/services.py +0 -311
  92. unrealon_driver/src/execution/__init__.py +0 -23
  93. unrealon_driver/src/execution/daemon_mode.py +0 -317
  94. unrealon_driver/src/execution/interactive_mode.py +0 -88
  95. unrealon_driver/src/execution/modes.py +0 -45
  96. unrealon_driver/src/execution/scheduled_mode.py +0 -209
  97. unrealon_driver/src/execution/test_mode.py +0 -250
  98. unrealon_driver/src/logging/__init__.py +0 -24
  99. unrealon_driver/src/logging/driver_logger.py +0 -512
  100. unrealon_driver/src/services/__init__.py +0 -24
  101. unrealon_driver/src/services/browser_service.py +0 -726
  102. unrealon_driver/src/services/llm/__init__.py +0 -15
  103. unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
  104. unrealon_driver/src/services/llm/llm.py +0 -195
  105. unrealon_driver/src/services/logger_service.py +0 -232
  106. unrealon_driver/src/services/metrics_service.py +0 -185
  107. unrealon_driver/src/services/scheduler_service.py +0 -489
  108. unrealon_driver/src/services/websocket_service.py +0 -362
  109. unrealon_driver/src/utils/__init__.py +0 -16
  110. unrealon_driver/src/utils/service_factory.py +0 -317
  111. unrealon_driver/src/utils/time_formatter.py +0 -338
  112. unrealon_llm/README.md +0 -44
  113. unrealon_llm/__init__.py +0 -26
  114. unrealon_llm/pyproject.toml +0 -154
  115. unrealon_llm/src/__init__.py +0 -228
  116. unrealon_llm/src/cli/__init__.py +0 -0
  117. unrealon_llm/src/core/__init__.py +0 -11
  118. unrealon_llm/src/core/smart_client.py +0 -438
  119. unrealon_llm/src/dto/__init__.py +0 -155
  120. unrealon_llm/src/dto/models/__init__.py +0 -0
  121. unrealon_llm/src/dto/models/config.py +0 -343
  122. unrealon_llm/src/dto/models/core.py +0 -328
  123. unrealon_llm/src/dto/models/enums.py +0 -123
  124. unrealon_llm/src/dto/models/html_analysis.py +0 -345
  125. unrealon_llm/src/dto/models/statistics.py +0 -473
  126. unrealon_llm/src/dto/models/translation.py +0 -383
  127. unrealon_llm/src/dto/models/type_conversion.py +0 -462
  128. unrealon_llm/src/dto/schemas/__init__.py +0 -0
  129. unrealon_llm/src/exceptions.py +0 -392
  130. unrealon_llm/src/llm_config/__init__.py +0 -20
  131. unrealon_llm/src/llm_config/logging_config.py +0 -178
  132. unrealon_llm/src/llm_logging/__init__.py +0 -42
  133. unrealon_llm/src/llm_logging/llm_events.py +0 -107
  134. unrealon_llm/src/llm_logging/llm_logger.py +0 -466
  135. unrealon_llm/src/managers/__init__.py +0 -15
  136. unrealon_llm/src/managers/cache_manager.py +0 -67
  137. unrealon_llm/src/managers/cost_manager.py +0 -107
  138. unrealon_llm/src/managers/request_manager.py +0 -298
  139. unrealon_llm/src/modules/__init__.py +0 -0
  140. unrealon_llm/src/modules/html_processor/__init__.py +0 -25
  141. unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
  142. unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
  143. unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
  144. unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
  145. unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
  146. unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
  147. unrealon_llm/src/modules/html_processor/processor.py +0 -102
  148. unrealon_llm/src/modules/llm/__init__.py +0 -0
  149. unrealon_llm/src/modules/translator/__init__.py +0 -0
  150. unrealon_llm/src/provider.py +0 -116
  151. unrealon_llm/src/utils/__init__.py +0 -95
  152. unrealon_llm/src/utils/common.py +0 -64
  153. unrealon_llm/src/utils/data_extractor.py +0 -188
  154. unrealon_llm/src/utils/html_cleaner.py +0 -767
  155. unrealon_llm/src/utils/language_detector.py +0 -308
  156. unrealon_llm/src/utils/models_cache.py +0 -592
  157. unrealon_llm/src/utils/smart_counter.py +0 -229
  158. unrealon_llm/src/utils/token_counter.py +0 -189
  159. unrealon_sdk/README.md +0 -25
  160. unrealon_sdk/__init__.py +0 -30
  161. unrealon_sdk/pyproject.toml +0 -231
  162. unrealon_sdk/src/__init__.py +0 -150
  163. unrealon_sdk/src/cli/__init__.py +0 -12
  164. unrealon_sdk/src/cli/commands/__init__.py +0 -22
  165. unrealon_sdk/src/cli/commands/benchmark.py +0 -42
  166. unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
  167. unrealon_sdk/src/cli/commands/health.py +0 -46
  168. unrealon_sdk/src/cli/commands/integration.py +0 -498
  169. unrealon_sdk/src/cli/commands/reports.py +0 -43
  170. unrealon_sdk/src/cli/commands/security.py +0 -36
  171. unrealon_sdk/src/cli/commands/server.py +0 -483
  172. unrealon_sdk/src/cli/commands/servers.py +0 -56
  173. unrealon_sdk/src/cli/commands/tests.py +0 -55
  174. unrealon_sdk/src/cli/main.py +0 -126
  175. unrealon_sdk/src/cli/utils/reporter.py +0 -519
  176. unrealon_sdk/src/clients/openapi.yaml +0 -3347
  177. unrealon_sdk/src/clients/python_http/__init__.py +0 -3
  178. unrealon_sdk/src/clients/python_http/api_config.py +0 -228
  179. unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
  180. unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
  181. unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
  182. unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
  183. unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
  184. unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
  185. unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
  186. unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
  187. unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
  188. unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
  189. unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
  190. unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
  191. unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
  192. unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
  193. unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
  194. unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
  195. unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
  196. unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
  197. unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
  198. unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
  199. unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
  200. unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
  201. unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
  202. unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
  203. unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
  204. unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
  205. unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
  206. unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
  207. unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
  208. unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
  209. unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
  210. unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
  211. unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
  212. unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
  213. unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
  214. unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
  215. unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
  216. unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
  217. unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
  218. unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
  219. unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
  220. unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
  221. unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
  222. unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
  223. unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
  224. unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
  225. unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
  226. unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
  227. unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
  228. unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
  229. unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
  230. unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
  231. unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
  232. unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
  233. unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
  234. unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
  235. unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
  236. unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
  237. unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
  238. unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
  239. unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
  240. unrealon_sdk/src/clients/python_websocket/client.py +0 -490
  241. unrealon_sdk/src/clients/python_websocket/events.py +0 -732
  242. unrealon_sdk/src/clients/python_websocket/example.py +0 -136
  243. unrealon_sdk/src/clients/python_websocket/types.py +0 -871
  244. unrealon_sdk/src/core/__init__.py +0 -64
  245. unrealon_sdk/src/core/client.py +0 -556
  246. unrealon_sdk/src/core/config.py +0 -465
  247. unrealon_sdk/src/core/exceptions.py +0 -239
  248. unrealon_sdk/src/core/metadata.py +0 -191
  249. unrealon_sdk/src/core/models.py +0 -142
  250. unrealon_sdk/src/core/types.py +0 -68
  251. unrealon_sdk/src/dto/__init__.py +0 -268
  252. unrealon_sdk/src/dto/authentication.py +0 -108
  253. unrealon_sdk/src/dto/cache.py +0 -208
  254. unrealon_sdk/src/dto/common.py +0 -19
  255. unrealon_sdk/src/dto/concurrency.py +0 -393
  256. unrealon_sdk/src/dto/events.py +0 -108
  257. unrealon_sdk/src/dto/health.py +0 -339
  258. unrealon_sdk/src/dto/load_balancing.py +0 -336
  259. unrealon_sdk/src/dto/logging.py +0 -230
  260. unrealon_sdk/src/dto/performance.py +0 -165
  261. unrealon_sdk/src/dto/rate_limiting.py +0 -295
  262. unrealon_sdk/src/dto/resource_pooling.py +0 -128
  263. unrealon_sdk/src/dto/structured_logging.py +0 -112
  264. unrealon_sdk/src/dto/task_scheduling.py +0 -121
  265. unrealon_sdk/src/dto/websocket.py +0 -55
  266. unrealon_sdk/src/enterprise/__init__.py +0 -59
  267. unrealon_sdk/src/enterprise/authentication.py +0 -401
  268. unrealon_sdk/src/enterprise/cache_manager.py +0 -578
  269. unrealon_sdk/src/enterprise/error_recovery.py +0 -494
  270. unrealon_sdk/src/enterprise/event_system.py +0 -549
  271. unrealon_sdk/src/enterprise/health_monitor.py +0 -747
  272. unrealon_sdk/src/enterprise/load_balancer.py +0 -964
  273. unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
  274. unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
  275. unrealon_sdk/src/enterprise/logging/development.py +0 -744
  276. unrealon_sdk/src/enterprise/logging/service.py +0 -410
  277. unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
  278. unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
  279. unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
  280. unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
  281. unrealon_sdk/src/enterprise/resource_pool.py +0 -763
  282. unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
  283. unrealon_sdk/src/internal/__init__.py +0 -10
  284. unrealon_sdk/src/internal/command_router.py +0 -497
  285. unrealon_sdk/src/internal/connection_manager.py +0 -397
  286. unrealon_sdk/src/internal/http_client.py +0 -446
  287. unrealon_sdk/src/internal/websocket_client.py +0 -420
  288. unrealon_sdk/src/provider.py +0 -471
  289. unrealon_sdk/src/utils.py +0 -234
  290. /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
  291. /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
  292. /unrealon_browser/{src/cli → cli}/main.py +0 -0
  293. /unrealon_browser/{src/core → core}/__init__.py +0 -0
  294. /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
  295. /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
  296. /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
  297. /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
  298. /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
  299. /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
  300. /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
  301. /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
  302. /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
@@ -1,767 +0,0 @@
1
- """
2
- Smart HTML Cleaner
3
-
4
- Intelligent HTML cleaning that removes noise but preserves useful data.
5
- Optimizes HTML for LLM token efficiency while keeping valuable content.
6
- """
7
-
8
import json
import re
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup, Comment

from unrealon_llm.src.exceptions import ValidationError
15
-
16
-
17
- class SmartHTMLCleaner:
18
- """
19
- Intelligent HTML cleaner that optimizes for LLM analysis
20
-
21
- Features:
22
- - Removes noise (scripts, styles, comments)
23
- - Preserves useful JavaScript data (JSON objects, SSR data)
24
- - Cleans whitespace and formatting
25
- - Maintains semantic structure
26
- - Extracts and preserves Next.js/Nuxt.js SSR data
27
- """
28
-
29
def __init__(self) -> None:
    """Initialize the HTML cleaner.

    Builds the static lookup tables (tag names, CSS selectors, attribute
    names, regex sources) that the cleaning methods read, and pre-compiles
    ``useful_js_patterns`` into ``compiled_patterns``. Takes no arguments
    and performs no I/O.
    """
    # Tags to completely remove (iterated by _remove_noise_elements, which
    # decomposes every element with one of these names).
    self.noise_tags = {
        'script', 'style', 'meta', 'link', 'base', 'title',
        'head', 'noscript', 'iframe', 'embed', 'object',
        'svg', 'canvas', 'audio', 'video', 'source',
        'track', 'area', 'map', 'param', 'form', 'input',
        'button', 'select', 'textarea', 'fieldset', 'legend'
    }

    # URL patterns to remove or shorten (for tracking/analytics).
    # The {N,} quantifiers mean only long URLs match.
    self.tracking_url_patterns = [
        r'https://aax-[^\s"]{200,}',  # Amazon tracking URLs over 200 chars
        r'https://[^\s"]*tracking[^\s"]{100,}',  # General tracking URLs
        r'https://[^\s"]*analytics[^\s"]{100,}',  # Analytics URLs
        r'https://[^\s"]*gtm[^\s"]{100,}',  # Google Tag Manager URLs
    ]

    # Base64 patterns to remove or replace (inline data: URIs with a
    # minimum payload length).
    self.base64_patterns = [
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}',  # Base64 images over 50 chars
        r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}',  # Base64 applications
        r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}',  # Base64 text
    ]

    # Universal noise selectors to remove (for any site).
    # These are CSS substring selectors ([attr*="x"]), so they match any
    # id/class that merely contains the fragment. _remove_universal_noise
    # passes each one to soup.select().
    self.universal_noise_selectors = [
        '[id*="nav"]', '[class*="nav"]',  # Navigation
        '[id*="menu"]', '[class*="menu"]',  # Menus
        '[id*="sidebar"]', '[class*="sidebar"]',  # Sidebars
        '[id*="footer"]', '[class*="footer"]',  # Footers
        '[id*="header"]', '[class*="header"]',  # Headers
        '[class*="ads"]', '[class*="advertisement"]',  # Ads
        '[class*="sponsored"]', '[class*="promo"]',  # Sponsored content
        '[class*="popup"]', '[class*="modal"]',  # Popups/modals
        '[class*="overlay"]', '[class*="tooltip"]',  # Overlays
        '[class*="cookie"]', '[class*="gdpr"]',  # Cookie notices
        '[class*="newsletter"]', '[class*="subscription"]',  # Email signup
        '[class*="social"]', '[class*="share"]',  # Social media
        '[class*="comment"]', '[class*="discussion"]',  # Comments (unless main content)
        '[class*="tracking"]', '[class*="analytics"]',  # Tracking
    ]

    # Attributes to remove (keep only semantic ones).
    # NOTE(review): 'download' and 'hreflang' are listed twice (harmless in
    # a set literal), and several names here ('alt', 'title', 'href', 'src',
    # 'action', 'name', 'value', 'placeholder', 'type') also appear in
    # keep_attributes below. _clean_attributes consults only
    # keep_attributes; whether anything else reads this set is not visible
    # here -- confirm before relying on it.
    self.noise_attributes = {
        'style', 'onclick', 'onload', 'onchange', 'onmouseover',
        'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
        'onerror', 'onabort', 'oncanplay', 'oncanplaythrough',
        'ondurationchange', 'onemptied', 'onended', 'onloadeddata',
        'onloadedmetadata', 'onloadstart', 'onpause', 'onplay',
        'onplaying', 'onprogress', 'onratechange', 'onseeked',
        'onseeking', 'onstalled', 'onsuspend', 'ontimeupdate',
        'onvolumechange', 'onwaiting', 'onkeydown', 'onkeypress',
        'onkeyup', 'onmousedown', 'onmousemove', 'onmouseup',
        'onwheel', 'ondrag', 'ondragend', 'ondragenter',
        'ondragleave', 'ondragover', 'ondragstart', 'ondrop',
        'onscroll', 'onresize', 'onstorage', 'onhashchange',
        'onpopstate', 'onbeforeprint', 'onafterprint',
        'onbeforeunload', 'onunload', 'onmessage', 'oninput',
        'oninvalid', 'onsearch', 'autocomplete', 'autofocus',
        'checked', 'defer', 'disabled', 'hidden', 'loop',
        'multiple', 'muted', 'open', 'readonly', 'required',
        'reversed', 'selected', 'autoplay', 'controls',
        'crossorigin', 'download', 'hreflang', 'ismap',
        'itemid', 'itemprop', 'itemref', 'itemscope',
        'itemtype', 'kind', 'media', 'rel', 'sandbox',
        'scope', 'sizes', 'span', 'spellcheck', 'srcdoc',
        'srclang', 'srcset', 'step', 'tabindex', 'target',
        'translate', 'usemap', 'wrap', 'accept', 'acceptcharset',
        'accesskey', 'action', 'allowfullscreen', 'alt',
        'async', 'autocapitalize', 'capture', 'charset',
        'cols', 'colspan', 'content', 'contenteditable',
        'contextmenu', 'coords', 'datetime', 'decoding',
        'default', 'dir', 'dirname', 'download', 'draggable',
        'enctype', 'enterkeyhint', 'for', 'form', 'formaction',
        'formenctype', 'formmethod', 'formnovalidate',
        'formtarget', 'headers', 'height', 'high', 'href',
        'hreflang', 'httpequiv', 'icon', 'importance', 'inputmode',
        'integrity', 'intrinsicsize', 'keytype', 'label',
        'lang', 'list', 'loading', 'low', 'manifest',
        'max', 'maxlength', 'method', 'min', 'minlength',
        'name', 'novalidate', 'optimum', 'pattern',
        'ping', 'placeholder', 'poster', 'preload',
        'radiogroup', 'referrerpolicy', 'rows', 'rowspan',
        'shape', 'size', 'slot', 'src', 'start',
        'title', 'type', 'value', 'width'
    }

    # Keep these semantic attributes. This is the allow-list that
    # _clean_attributes applies: any attribute not named here is deleted.
    self.keep_attributes = {
        'id', 'class', 'data-testid', 'data-test', 'data-cy',
        'aria-label', 'aria-labelledby', 'aria-describedby',
        'role', 'alt', 'title', 'href', 'src', 'action',
        'name', 'value', 'placeholder', 'type'
    }

    # Patterns to detect valuable JavaScript data.
    # NOTE(review): most of these capture with a lazy `.+?` followed by an
    # optional `;`, so the capture can end at the first closing brace or
    # bracket -- nested objects look like they would be cut short. Confirm
    # against whatever consumes compiled_patterns (not visible here).
    self.useful_js_patterns = [
        # Next.js/Nuxt.js SSR data
        r'__NEXT_DATA__\s*=\s*(\{.+?\});?',
        r'__NUXT__\s*=\s*(\{.+?\});?',
        r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?',

        # React/Vue hydration data
        r'window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?',
        r'window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?',

        # E-commerce data
        r'window\.productData\s*=\s*(\{.+?\});?',
        r'window\.cartData\s*=\s*(\{.+?\});?',
        r'dataLayer\s*=\s*(\[.+?\]);?',

        # Analytics and tracking (structured data)
        r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',

        # JSON-LD structured data (often in script tags)
        r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',

        # Generic JSON objects (be more selective)
        r'(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?',
    ]

    # Compiled regex patterns for efficiency. DOTALL lets `.` span
    # newlines; IGNORECASE makes the variable names case-insensitive.
    self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE)
                              for pattern in self.useful_js_patterns]
155
-
156
def clean_html(
    self,
    html_content: str,
    preserve_js_data: bool = True,
    aggressive_cleaning: bool = False
) -> Tuple[str, Dict[str, Any]]:
    """
    Clean HTML content while preserving valuable data.

    Args:
        html_content: Raw HTML content.
        preserve_js_data: Whether to extract data from ``<script>`` tags
            before they are removed.
        aggressive_cleaning: Whether to run the extra noise-removal passes
            and restrict the attribute allow-list.

    Returns:
        Tuple of (cleaned_html, extracted_data). Empty or whitespace-only
        input yields ``("", {})``.

    Raises:
        ValidationError: If BeautifulSoup fails to parse the input. The
            original parser error is attached as ``__cause__``.
    """
    if not html_content or not html_content.strip():
        return "", {}

    # Parse HTML
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
    except Exception as e:
        # Chain explicitly so the underlying parser error stays attached.
        raise ValidationError(f"Failed to parse HTML: {e}") from e

    extracted_data: Dict[str, Any] = {}

    # Script data must be harvested first: the passes below delete every
    # <script> element.
    if preserve_js_data:
        extracted_data = self._extract_js_data(soup)

    # Extra passes that only run in aggressive mode
    if aggressive_cleaning:
        self._remove_universal_noise(soup)
        self._truncate_long_urls(soup)  # Do this before tracking URL cleaning
        self._clean_tracking_urls(soup)
        self._clean_base64_data(soup)
        self._remove_long_attributes(soup)
        self._remove_html_comments(soup)
        self._clean_whitespace(soup)

    # Remove noise elements
    self._remove_noise_elements(soup)

    # Clean attributes
    self._clean_attributes(soup, aggressive_cleaning)

    # Remove comments
    self._remove_comments(soup)

    # Serialize back to text, then apply the final string-level cleanup
    cleaned_html = self._clean_text_and_whitespace(soup)
    cleaned_html = self._final_cleanup(cleaned_html)

    return cleaned_html, extracted_data
214
-
215
- def _extract_js_data(self, soup: BeautifulSoup) -> Dict[str, any]:
216
- """Extract valuable data from JavaScript"""
217
- extracted_data = {
218
- 'ssr_data': {},
219
- 'structured_data': [],
220
- 'analytics_data': {},
221
- 'product_data': {},
222
- 'raw_extracts': []
223
- }
224
-
225
- # Find all script tags
226
- script_tags = soup.find_all('script')
227
-
228
- for script in script_tags:
229
- if not script.string:
230
- continue
231
-
232
- script_content = script.string.strip()
233
-
234
- # Skip empty or very short scripts
235
- if len(script_content) < 10:
236
- continue
237
-
238
- # Check for JSON-LD structured data
239
- if script.get('type') == 'application/ld+json':
240
- try:
241
- json_data = json.loads(script_content)
242
- extracted_data['structured_data'].append(json_data)
243
- continue
244
- except json.JSONDecodeError:
245
- pass
246
-
247
- # Extract data using patterns
248
- self._extract_with_patterns(script_content, extracted_data)
249
-
250
- # Remove empty categories
251
- extracted_data = {k: v for k, v in extracted_data.items() if v}
252
-
253
- return extracted_data
254
-
255
- def _extract_with_patterns(self, script_content: str, extracted_data: Dict):
256
- """Extract data using compiled regex patterns and heuristics"""
257
-
258
- # First try specific named patterns
259
- self._extract_named_patterns(script_content, extracted_data)
260
-
261
- # Then try generic JSON extraction as fallback
262
- self._extract_generic_json(script_content, extracted_data)
263
-
264
- def _extract_named_patterns(self, script_content: str, extracted_data: Dict):
265
- """Extract data using specific named patterns"""
266
-
267
- # Next.js SSR data
268
- nextjs_patterns = [
269
- r'__NEXT_DATA__\s*=\s*({.+?});',
270
- r'window\.__NEXT_DATA__\s*=\s*({.+?});'
271
- ]
272
-
273
- for pattern in nextjs_patterns:
274
- matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
275
- for match in matches:
276
- self._try_parse_json(match.group(1), extracted_data, 'ssr_data')
277
-
278
- # React Query state
279
- react_patterns = [
280
- r'window\.__REACT_QUERY_STATE__\s*=\s*({.+?});'
281
- ]
282
-
283
- for pattern in react_patterns:
284
- matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
285
- for match in matches:
286
- self._try_parse_json(match.group(1), extracted_data, 'ssr_data')
287
-
288
- # Product data
289
- product_patterns = [
290
- r'window\.productData\s*=\s*({.+?});',
291
- r'dataLayer\s*=\s*(\[.+?\]);'
292
- ]
293
-
294
- for pattern in product_patterns:
295
- matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
296
- for match in matches:
297
- self._try_parse_json(match.group(1), extracted_data, 'product_data')
298
-
299
- def _extract_generic_json(self, script_content: str, extracted_data: Dict):
300
- """Extract generic JSON objects as fallback"""
301
-
302
- # Look for variable assignments with objects
303
- generic_patterns = [
304
- r'(?:window\.|var\s+|let\s+|const\s+)(\w+)\s*=\s*({[^;]+});',
305
- r'(\w+)\s*=\s*({[^;]+});'
306
- ]
307
-
308
- for pattern in generic_patterns:
309
- matches = re.finditer(pattern, script_content, re.DOTALL | re.IGNORECASE)
310
- for match in matches:
311
- var_name = match.group(1)
312
- json_content = match.group(2)
313
-
314
- # Only process if it looks like substantial data
315
- if len(json_content) > 20:
316
- self._try_parse_json(json_content, extracted_data, 'raw_extracts', var_name)
317
-
318
- def _try_parse_json(self, json_str: str, extracted_data: Dict, category: str, var_name: str = None):
319
- """Try to parse JSON string and categorize it"""
320
- try:
321
- json_data = json.loads(json_str)
322
-
323
- if category == 'ssr_data':
324
- if 'ssr_data' not in extracted_data:
325
- extracted_data['ssr_data'] = {}
326
- if isinstance(json_data, dict):
327
- extracted_data['ssr_data'].update(json_data)
328
- else:
329
- extracted_data['ssr_data'][var_name or 'data'] = json_data
330
-
331
- elif category == 'product_data':
332
- if 'product_data' not in extracted_data:
333
- extracted_data['product_data'] = {}
334
- if isinstance(json_data, dict):
335
- extracted_data['product_data'].update(json_data)
336
- else:
337
- extracted_data['product_data'][var_name or 'data'] = json_data
338
-
339
- else: # raw_extracts - filter useful ones only
340
- # Only store raw extracts if they look like complete objects
341
- if isinstance(json_data, dict) and len(json_data) > 3:
342
- if 'raw_extracts' not in extracted_data:
343
- extracted_data['raw_extracts'] = []
344
- extracted_data['raw_extracts'].append(json_data)
345
-
346
- except json.JSONDecodeError:
347
- # Skip invalid JSON - it's noise
348
- pass
349
-
350
def _remove_noise_elements(self, soup: BeautifulSoup):
    """Strip noise tags and empty ``div``/``span`` elements from ``soup`` in place."""
    # Pass 1: every tag whose name is listed in self.noise_tags goes away.
    for noise_name in self.noise_tags:
        for node in soup.find_all(noise_name):
            node.decompose()

    # Pass 2: <div>/<span> elements holding neither text nor child tags.
    for node in soup.find_all(['div', 'span']):
        if node.get_text(strip=True):
            continue
        if node.find_all():
            continue
        node.decompose()
361
-
362
def _clean_attributes(self, soup: BeautifulSoup, aggressive: bool = False):
    """Drop every attribute that is not on the keep-list, for all tags in ``soup``.

    With ``aggressive=True`` the keep-list is narrowed to the intersection of
    ``self.keep_attributes`` with id/class/href/src/alt, and list-valued
    ``class`` attributes are filtered through ``self._is_utility_class``;
    a ``class`` attribute left without entries is removed altogether.
    """
    for tag in soup.find_all(True):
        if not hasattr(tag, 'attrs'):
            continue

        if aggressive:
            # Only the essential semantic attributes survive.
            allowed = self.keep_attributes & {'id', 'class', 'href', 'src', 'alt'}
        else:
            allowed = self.keep_attributes

        for attr_name in set(tag.attrs.keys()) - allowed:
            del tag.attrs[attr_name]

        # Utility-class filtering only happens in aggressive mode.
        if not aggressive or 'class' not in tag.attrs:
            continue
        class_list = tag.attrs['class']
        if not isinstance(class_list, list):
            continue

        kept = [cls for cls in class_list if not self._is_utility_class(cls)]
        if kept:
            tag.attrs['class'] = kept
        else:
            del tag.attrs['class']
391
-
392
def _remove_universal_noise(self, soup: BeautifulSoup):
    """Delete elements matched by ``self.universal_noise_selectors``.

    An element is spared when ``self._is_main_content_element`` returns True
    for it. Any exception raised while processing a selector (for example
    an invalid selector) abandons that selector and moves on to the next.
    """
    for css_selector in self.universal_noise_selectors:
        try:
            for matched in soup.select(css_selector):
                if self._is_main_content_element(matched):
                    continue
                matched.decompose()
        except Exception:
            continue
404
-
405
def _clean_tracking_urls(self, soup: BeautifulSoup):
    """Replace link and image URLs that match ``self.tracking_url_patterns``.

    ``<a href>`` values become ``'#tracking-url-removed'`` and ``<img src>``
    values become a 1x1 SVG data URI. Patterns are applied with ``re.match``,
    i.e. anchored at the start of the URL. Links whose href already ends in
    ``'...truncated'`` are left untouched.
    """
    import re

    def looks_like_tracking(url):
        return any(re.match(pattern, url) for pattern in self.tracking_url_patterns)

    for link in soup.find_all(['a'], href=True):
        href = link.get('href', '')
        if not href or href.endswith('...truncated'):
            continue
        if looks_like_tracking(href):
            link['href'] = '#tracking-url-removed'

    for image in soup.find_all(['img'], src=True):
        src = image.get('src', '')
        if src and looks_like_tracking(src):
            # Minimal SVG placeholder keeps the <img> valid.
            image['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
428
-
429
def _clean_base64_data(self, soup: BeautifulSoup):
    """Remove or replace attribute values that match ``self.base64_patterns``.

    Four passes run in order over ``soup``:

    1. ``<img src>``: ``data:image/`` sources become a 1x1 SVG placeholder,
       any other matching source is deleted.
    2. ``style`` attributes containing a match are deleted.
    3. ``<a href>`` values matching at their start become
       ``'#base64-data-removed'``.
    4. Catch-all over every string-valued attribute of every tag: ``src`` gets
       the SVG placeholder, ``href`` the removal marker, anything else is deleted.
    """
    import re

    svg_stub = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
    removed_marker = '#base64-data-removed'

    def contains_base64(text):
        return any(re.search(pattern, text) for pattern in self.base64_patterns)

    def starts_with_base64(text):
        return any(re.match(pattern, text) for pattern in self.base64_patterns)

    # Pass 1: image sources.
    for image in soup.find_all(['img'], src=True):
        src = image.get('src', '')
        if not src or not contains_base64(src):
            continue
        if src.startswith('data:image/'):
            image['src'] = svg_stub
        else:
            del image['src']

    # Pass 2: inline styles are dropped wholesale when they embed base64.
    for styled in soup.find_all(style=True):
        style = styled.get('style', '')
        if style and contains_base64(style):
            del styled['style']

    # Pass 3: link targets (this pass uses re.match, not re.search).
    for link in soup.find_all(['a'], href=True):
        href = link.get('href', '')
        if href and starts_with_base64(href):
            link['href'] = removed_marker

    # Pass 4: any remaining string attribute on any tag.
    for tag in soup.find_all():
        flagged = [
            attr_name
            for attr_name, attr_value in tag.attrs.items()
            if isinstance(attr_value, str) and contains_base64(attr_value)
        ]
        for attr_name in flagged:
            if attr_name == 'src':
                tag[attr_name] = svg_stub
            elif attr_name == 'href':
                tag[attr_name] = removed_marker
            else:
                del tag.attrs[attr_name]
489
-
490
def _truncate_long_urls(self, soup: BeautifulSoup, max_url_length: int = 500):
    """Cut over-long URL attributes down to ``max_url_length`` characters.

    A truncated value keeps its first ``max_url_length`` characters and gets
    the suffix ``'...truncated'``. Covered attributes:

    * ``href`` on ``<a>``;
    * ``src`` on img/iframe/embed/object, except ``data:`` URIs;
    * action/formaction/poster/cite/data/manifest on any tag, but only when
      the value contains ``://`` or starts with ``/`` or ``http``.
    """
    suffix = '...truncated'

    def too_long(value):
        return isinstance(value, str) and len(value) > max_url_length

    for link in soup.find_all(['a'], href=True):
        href = link.get('href', '')
        if too_long(href):
            link['href'] = href[:max_url_length] + suffix

    for media in soup.find_all(['img', 'iframe', 'embed', 'object'], src=True):
        src = media.get('src', '')
        # data: URIs are not truncated here.
        if too_long(src) and not src.startswith('data:'):
            media['src'] = src[:max_url_length] + suffix

    other_url_attrs = ('action', 'formaction', 'poster', 'cite', 'data', 'manifest')
    for tag in soup.find_all():
        for attr_name in other_url_attrs:
            if not tag.has_attr(attr_name):
                continue
            value = tag.get(attr_name, '')
            if not too_long(value):
                continue
            # A leading '/' also covers protocol-relative '//' values.
            if '://' in value or value.startswith(('/', 'http')):
                tag[attr_name] = value[:max_url_length] + suffix
525
-
526
def _remove_long_attributes(self, soup: BeautifulSoup):
    """Delete attributes that are very long or have tracking-style names.

    For string values: removed when longer than 800 characters, or when the
    attribute name (case-insensitively) contains one of tracking / analytics /
    gtm / pixel / impression / asin. For list values: removed when any string
    item is longer than 500 characters. The name check is only applied to
    string-valued attributes.
    """
    name_markers = ['tracking', 'analytics', 'gtm', 'pixel', 'impression', 'asin']

    for tag in soup.find_all():
        doomed = []
        for attr_name, attr_value in tag.attrs.items():
            if isinstance(attr_value, str):
                if len(attr_value) > 800:
                    doomed.append(attr_name)
                elif any(marker in attr_name.lower() for marker in name_markers):
                    doomed.append(attr_name)
            elif isinstance(attr_value, list):
                if any(isinstance(item, str) and len(item) > 500 for item in attr_value):
                    doomed.append(attr_name)

        # Deletion happens after the scan so the dict is not mutated mid-iteration.
        for attr_name in doomed:
            del tag.attrs[attr_name]
549
-
550
def get_cleaning_stats(self, original_size: int, cleaned_size: int) -> Dict[str, any]:
    """Summarise a cleaning run from the sizes before and after.

    Returns a dict with ``original_size``, ``cleaned_size``,
    ``reduction_bytes``, ``reduction_percent`` (rounded to 2 decimals; 0 when
    ``original_size`` is not positive) and ``compression_ratio``
    (``original_size / cleaned_size`` rounded to 2 decimals; 0 when
    ``cleaned_size`` is not positive).

    NOTE(review): a second ``get_cleaning_stats(self, original_html,
    cleaned_html)`` is defined further down in this file. If both sit in the
    same class body, that later definition is the one that ends up bound.
    """
    saved = original_size - cleaned_size

    if original_size > 0:
        saved_percent = saved / original_size * 100
    else:
        saved_percent = 0

    if cleaned_size > 0:
        ratio = round(original_size / cleaned_size, 2)
    else:
        ratio = 0

    stats = {}
    stats['original_size'] = original_size
    stats['cleaned_size'] = cleaned_size
    stats['reduction_bytes'] = saved
    stats['reduction_percent'] = round(saved_percent, 2)
    stats['compression_ratio'] = ratio
    return stats
562
-
563
def _remove_html_comments(self, soup: BeautifulSoup):
    """Detach every HTML comment node from ``soup`` to reduce its size."""
    def is_comment(node):
        return isinstance(node, Comment)

    for comment_node in soup.find_all(string=is_comment):
        comment_node.extract()
568
-
569
def _clean_whitespace(self, soup: BeautifulSoup):
    """Collapse excessive whitespace inside the text nodes of ``soup`` (in place).

    Every run of spaces and/or tabs becomes a single space, and every run of
    three or more newlines becomes two. Text inside ``<script>`` and
    ``<style>`` is left alone.

    Comments, doctypes, CDATA sections and other ``PreformattedString`` nodes
    are skipped. ``find_all(string=True)`` returns them too, and replacing
    one with a plain string would drop its ``<!-- -->`` / ``<!DOCTYPE >``
    wrapper and turn it into visible page text.
    """
    import re
    from bs4.element import PreformattedString

    # ``string=`` is the current spelling of the deprecated ``text=`` argument.
    for node in soup.find_all(string=True):
        if isinstance(node, PreformattedString):
            continue
        parent = node.parent
        if parent is None or parent.name in ('script', 'style'):
            continue

        original = str(node)
        # Cap consecutive newlines at two.
        cleaned = re.sub(r'\n{3,}', '\n\n', original)
        # One pass over [ \t]+ covers the separate "3+ spaces" and "tabs"
        # substitutions that used to precede it; the result is the same.
        cleaned = re.sub(r'[ \t]+', ' ', cleaned)

        # Leave untouched nodes as they are (keeps their node type intact).
        if cleaned != original:
            node.replace_with(cleaned)
585
-
586
- def _advanced_whitespace_cleanup(self, html_content: str) -> str:
587
- """Advanced whitespace cleanup for aggressive cleaning"""
588
- import re
589
-
590
- # Remove excessive spaces (more than 2)
591
- html_content = re.sub(r' {3,}', ' ', html_content)
592
-
593
- # Remove excessive newlines (more than 2)
594
- html_content = re.sub(r'\n{3,}', '\n\n', html_content)
595
-
596
- # Remove excessive tabs
597
- html_content = re.sub(r'\t{2,}', '\t', html_content)
598
-
599
- # Clean mixed whitespace patterns
600
- html_content = re.sub(r'[ \t]{3,}', ' ', html_content)
601
-
602
- # Remove whitespace at line endings
603
- html_content = re.sub(r'[ \t]+\n', '\n', html_content)
604
-
605
- # Remove whitespace at line beginnings (except single indent)
606
- html_content = re.sub(r'\n[ \t]{2,}', '\n ', html_content)
607
-
608
- # Clean space between tags
609
- html_content = re.sub(r'>\s{2,}<', '> <', html_content)
610
-
611
- # Final cleanup
612
- html_content = html_content.strip()
613
-
614
- return html_content
615
-
616
- def _is_main_content_element(self, element) -> bool:
617
- """Check if element contains main product content"""
618
- # Keep elements that likely contain product info
619
- product_indicators = [
620
- 'product', 'detail', 'title', 'price', 'description',
621
- 'spec', 'review', 'rating', 'availability', 'image'
622
- ]
623
-
624
- element_text = str(element).lower()
625
- for indicator in product_indicators:
626
- if indicator in element_text:
627
- return True
628
- return False
629
-
630
- def _is_utility_class(self, class_name: str) -> bool:
631
- """Check if a class name is a utility class"""
632
- utility_patterns = [
633
- r'^(m|p)[trblxy]?-\d+$', # Margin/padding utilities
634
- r'^(m|p)[xy]-auto$', # Margin auto utilities
635
- r'^(w|h)-\d+$', # Width/height utilities
636
- r'^text-(xs|sm|lg|xl|\d+xl)$', # Text size utilities
637
- r'^bg-\w+(-\d+)?$', # Background utilities
638
- r'^text-\w+(-\d+)?$', # Text color utilities
639
- r'^border(-\w+)?(-\d+)?$', # Border utilities
640
- r'^flex(-\w+)?$', # Flex utilities
641
- r'^grid(-\w+)?$', # Grid utilities
642
- r'^hidden$', # Visibility utilities
643
- r'^sr-only$', # Screen reader utilities
644
- r'^(sm|md|lg|xl|2xl):.*$', # Responsive prefixes
645
- r'^\w+-\d+$', # Generic number-based utilities
646
- r'^mx-auto$', # Margin x auto
647
- r'^my-auto$', # Margin y auto
648
- ]
649
-
650
- return any(re.match(pattern, class_name) for pattern in utility_patterns)
651
-
652
def _truncate_long_text_content(self, soup: BeautifulSoup, max_text_length: int = 300):
    """Shorten text nodes whose stripped text exceeds ``max_text_length`` characters.

    An over-long node is replaced by the first ``max_text_length`` characters
    of its stripped text followed by ``'...'``. Text inside ``<script>`` and
    ``<style>`` is not touched.

    Comments, doctypes, CDATA sections and other ``PreformattedString`` nodes
    are skipped. ``find_all(string=True)`` returns them as well, and
    replacing one with a plain string would strip its markup wrapper and
    leak its content into the page as visible text.
    """
    from bs4.element import PreformattedString

    # ``string=`` is the current spelling of the deprecated ``text=`` argument.
    for node in soup.find_all(string=True):
        if isinstance(node, PreformattedString):
            continue
        parent = node.parent
        if parent is None or parent.name in ('script', 'style'):
            continue

        stripped = str(node).strip()
        if stripped and len(stripped) > max_text_length:
            node.replace_with(stripped[:max_text_length] + '...')
667
-
668
def _remove_comments(self, soup: BeautifulSoup):
    """Remove HTML comment nodes from ``soup`` in place.

    NOTE(review): performs the same operation as ``_remove_html_comments``;
    the two could be merged.
    """
    comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
    for comment_node in comment_nodes:
        comment_node.extract()
672
-
673
def _clean_text_and_whitespace(self, soup: BeautifulSoup) -> str:
    """Truncate long text nodes, serialise ``soup`` and normalise whitespace.

    ``self._truncate_long_text_content`` runs first (mutating ``soup``); the
    tree is then turned into a string and passed through three regex
    substitutions. Returns the resulting HTML string.
    """
    self._truncate_long_text_content(soup)
    markup = str(soup)

    substitutions = (
        (r'\s+', ' '),        # every whitespace run, newlines included -> one space
        (r'\n\s*\n', '\n'),   # cannot match any more: the previous pass left no newline
        (r'>\s+<', '><'),     # drop whitespace between adjacent tags
    )
    for pattern, replacement in substitutions:
        markup = re.sub(pattern, replacement, markup)

    return markup
687
-
688
- def _final_cleanup(self, html_content: str) -> str:
689
- """Final cleanup and optimization"""
690
- # Remove empty attributes
691
- html_content = re.sub(r'\s+\w+=""', '', html_content)
692
-
693
- # Remove extra spaces in attributes
694
- html_content = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html_content)
695
-
696
- # Normalize quotes
697
- html_content = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html_content)
698
-
699
- # Remove trailing spaces before closing tags
700
- html_content = re.sub(r'\s+(/?>)', r'\1', html_content)
701
-
702
- # Enhanced whitespace cleanup
703
- html_content = self._advanced_whitespace_cleanup(html_content)
704
-
705
- return html_content.strip()
706
-
707
def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> Dict[str, any]:
    """Report size and estimated token savings between two HTML strings.

    Sizes are ``len()`` of each string (character counts, reported under the
    ``*_bytes`` keys). Token counts are estimated as size // 4. Percentages
    are 0 when the corresponding original figure is not positive.
    """
    def percent_saved(before, after):
        return ((before - after) / before * 100) if before > 0 else 0

    size_before = len(original_html)
    size_after = len(cleaned_html)

    # Rough approximation: four characters per token.
    tokens_before = size_before // 4
    tokens_after = size_after // 4

    stats = {}
    stats["original_size_bytes"] = size_before
    stats["cleaned_size_bytes"] = size_after
    stats["size_reduction_bytes"] = size_before - size_after
    stats["size_reduction_percent"] = percent_saved(size_before, size_after)
    stats["estimated_original_tokens"] = tokens_before
    stats["estimated_cleaned_tokens"] = tokens_after
    stats["estimated_token_savings"] = tokens_before - tokens_after
    stats["estimated_token_savings_percent"] = percent_saved(tokens_before, tokens_after)
    return stats
726
-
727
-
728
- # Convenience functions
729
def clean_html_for_llm(
    html_content: str,
    preserve_js_data: bool = True,
    aggressive_cleaning: bool = False
) -> Tuple[str, Dict[str, any]]:
    """
    Clean HTML for LLM analysis with a throwaway ``SmartHTMLCleaner``.

    Args:
        html_content: Raw HTML content
        preserve_js_data: Whether to extract and preserve JS data
        aggressive_cleaning: Whether to apply aggressive cleaning

    Returns:
        Whatever ``SmartHTMLCleaner.clean_html`` returns; annotated as a
        tuple of (cleaned_html, extracted_data)
    """
    return SmartHTMLCleaner().clean_html(
        html_content,
        preserve_js_data,
        aggressive_cleaning,
    )
747
-
748
-
749
def extract_js_data_only(html_content: str) -> Dict[str, any]:
    """
    Extract JavaScript data from HTML without cleaning the markup.

    The HTML is parsed with the ``'html.parser'`` backend and handed to
    ``SmartHTMLCleaner._extract_js_data``.

    Args:
        html_content: Raw HTML content

    Returns:
        The extracted JavaScript data, or an empty dict if parsing or
        extraction raised any exception
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')
        return SmartHTMLCleaner()._extract_js_data(parsed)
    except Exception:
        # Best effort: any failure yields "no data" instead of an error.
        return {}
765
-
766
-
767
-