unrealon 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. unrealon/__init__.py +23 -21
  2. unrealon-1.1.1.dist-info/METADATA +722 -0
  3. unrealon-1.1.1.dist-info/RECORD +82 -0
  4. {unrealon-1.0.9.dist-info → unrealon-1.1.1.dist-info}/WHEEL +1 -1
  5. unrealon-1.1.1.dist-info/entry_points.txt +9 -0
  6. {unrealon-1.0.9.dist-info → unrealon-1.1.1.dist-info/licenses}/LICENSE +1 -1
  7. unrealon_bridge/__init__.py +114 -0
  8. unrealon_bridge/cli.py +316 -0
  9. unrealon_bridge/client/__init__.py +93 -0
  10. unrealon_bridge/client/base.py +78 -0
  11. unrealon_bridge/client/commands.py +89 -0
  12. unrealon_bridge/client/connection.py +90 -0
  13. unrealon_bridge/client/events.py +65 -0
  14. unrealon_bridge/client/health.py +38 -0
  15. unrealon_bridge/client/html_parser.py +146 -0
  16. unrealon_bridge/client/logging.py +139 -0
  17. unrealon_bridge/client/proxy.py +70 -0
  18. unrealon_bridge/client/scheduler.py +450 -0
  19. unrealon_bridge/client/session.py +70 -0
  20. unrealon_bridge/configs/__init__.py +14 -0
  21. unrealon_bridge/configs/bridge_config.py +212 -0
  22. unrealon_bridge/configs/bridge_config.yaml +39 -0
  23. unrealon_bridge/models/__init__.py +138 -0
  24. unrealon_bridge/models/base.py +28 -0
  25. unrealon_bridge/models/command.py +41 -0
  26. unrealon_bridge/models/events.py +40 -0
  27. unrealon_bridge/models/html_parser.py +79 -0
  28. unrealon_bridge/models/logging.py +55 -0
  29. unrealon_bridge/models/parser.py +63 -0
  30. unrealon_bridge/models/proxy.py +41 -0
  31. unrealon_bridge/models/requests.py +95 -0
  32. unrealon_bridge/models/responses.py +88 -0
  33. unrealon_bridge/models/scheduler.py +592 -0
  34. unrealon_bridge/models/session.py +28 -0
  35. unrealon_bridge/server/__init__.py +91 -0
  36. unrealon_bridge/server/base.py +171 -0
  37. unrealon_bridge/server/handlers/__init__.py +23 -0
  38. unrealon_bridge/server/handlers/command.py +110 -0
  39. unrealon_bridge/server/handlers/html_parser.py +139 -0
  40. unrealon_bridge/server/handlers/logging.py +95 -0
  41. unrealon_bridge/server/handlers/parser.py +95 -0
  42. unrealon_bridge/server/handlers/proxy.py +75 -0
  43. unrealon_bridge/server/handlers/scheduler.py +545 -0
  44. unrealon_bridge/server/handlers/session.py +66 -0
  45. unrealon_browser/__init__.py +61 -18
  46. unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
  47. unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
  48. unrealon_browser/{src/core → core}/browser_manager.py +2 -2
  49. unrealon_browser/{src/managers → managers}/captcha.py +1 -1
  50. unrealon_browser/{src/managers → managers}/cookies.py +1 -1
  51. unrealon_browser/managers/logger_bridge.py +231 -0
  52. unrealon_browser/{src/managers → managers}/profile.py +1 -1
  53. unrealon_driver/__init__.py +73 -19
  54. unrealon_driver/browser/__init__.py +8 -0
  55. unrealon_driver/browser/config.py +74 -0
  56. unrealon_driver/browser/manager.py +416 -0
  57. unrealon_driver/exceptions.py +28 -0
  58. unrealon_driver/parser/__init__.py +55 -0
  59. unrealon_driver/parser/cli_manager.py +141 -0
  60. unrealon_driver/parser/daemon_manager.py +227 -0
  61. unrealon_driver/parser/managers/__init__.py +46 -0
  62. unrealon_driver/parser/managers/browser.py +51 -0
  63. unrealon_driver/parser/managers/config.py +281 -0
  64. unrealon_driver/parser/managers/error.py +412 -0
  65. unrealon_driver/parser/managers/html.py +732 -0
  66. unrealon_driver/parser/managers/logging.py +609 -0
  67. unrealon_driver/parser/managers/result.py +321 -0
  68. unrealon_driver/parser/parser_manager.py +628 -0
  69. unrealon/sdk_config.py +0 -88
  70. unrealon-1.0.9.dist-info/METADATA +0 -810
  71. unrealon-1.0.9.dist-info/RECORD +0 -246
  72. unrealon_browser/pyproject.toml +0 -182
  73. unrealon_browser/src/__init__.py +0 -62
  74. unrealon_browser/src/managers/logger_bridge.py +0 -395
  75. unrealon_driver/README.md +0 -204
  76. unrealon_driver/pyproject.toml +0 -187
  77. unrealon_driver/src/__init__.py +0 -90
  78. unrealon_driver/src/cli/__init__.py +0 -10
  79. unrealon_driver/src/cli/main.py +0 -66
  80. unrealon_driver/src/cli/simple.py +0 -510
  81. unrealon_driver/src/config/__init__.py +0 -11
  82. unrealon_driver/src/config/auto_config.py +0 -478
  83. unrealon_driver/src/core/__init__.py +0 -18
  84. unrealon_driver/src/core/exceptions.py +0 -289
  85. unrealon_driver/src/core/parser.py +0 -638
  86. unrealon_driver/src/dto/__init__.py +0 -66
  87. unrealon_driver/src/dto/cli.py +0 -119
  88. unrealon_driver/src/dto/config.py +0 -18
  89. unrealon_driver/src/dto/events.py +0 -237
  90. unrealon_driver/src/dto/execution.py +0 -313
  91. unrealon_driver/src/dto/services.py +0 -311
  92. unrealon_driver/src/execution/__init__.py +0 -23
  93. unrealon_driver/src/execution/daemon_mode.py +0 -317
  94. unrealon_driver/src/execution/interactive_mode.py +0 -88
  95. unrealon_driver/src/execution/modes.py +0 -45
  96. unrealon_driver/src/execution/scheduled_mode.py +0 -209
  97. unrealon_driver/src/execution/test_mode.py +0 -250
  98. unrealon_driver/src/logging/__init__.py +0 -24
  99. unrealon_driver/src/logging/driver_logger.py +0 -512
  100. unrealon_driver/src/services/__init__.py +0 -24
  101. unrealon_driver/src/services/browser_service.py +0 -726
  102. unrealon_driver/src/services/llm/__init__.py +0 -15
  103. unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
  104. unrealon_driver/src/services/llm/llm.py +0 -195
  105. unrealon_driver/src/services/logger_service.py +0 -232
  106. unrealon_driver/src/services/metrics_service.py +0 -185
  107. unrealon_driver/src/services/scheduler_service.py +0 -489
  108. unrealon_driver/src/services/websocket_service.py +0 -362
  109. unrealon_driver/src/utils/__init__.py +0 -16
  110. unrealon_driver/src/utils/service_factory.py +0 -317
  111. unrealon_driver/src/utils/time_formatter.py +0 -338
  112. unrealon_llm/README.md +0 -44
  113. unrealon_llm/__init__.py +0 -26
  114. unrealon_llm/pyproject.toml +0 -154
  115. unrealon_llm/src/__init__.py +0 -228
  116. unrealon_llm/src/cli/__init__.py +0 -0
  117. unrealon_llm/src/core/__init__.py +0 -11
  118. unrealon_llm/src/core/smart_client.py +0 -438
  119. unrealon_llm/src/dto/__init__.py +0 -155
  120. unrealon_llm/src/dto/models/__init__.py +0 -0
  121. unrealon_llm/src/dto/models/config.py +0 -343
  122. unrealon_llm/src/dto/models/core.py +0 -328
  123. unrealon_llm/src/dto/models/enums.py +0 -123
  124. unrealon_llm/src/dto/models/html_analysis.py +0 -345
  125. unrealon_llm/src/dto/models/statistics.py +0 -473
  126. unrealon_llm/src/dto/models/translation.py +0 -383
  127. unrealon_llm/src/dto/models/type_conversion.py +0 -462
  128. unrealon_llm/src/dto/schemas/__init__.py +0 -0
  129. unrealon_llm/src/exceptions.py +0 -392
  130. unrealon_llm/src/llm_config/__init__.py +0 -20
  131. unrealon_llm/src/llm_config/logging_config.py +0 -178
  132. unrealon_llm/src/llm_logging/__init__.py +0 -42
  133. unrealon_llm/src/llm_logging/llm_events.py +0 -107
  134. unrealon_llm/src/llm_logging/llm_logger.py +0 -466
  135. unrealon_llm/src/managers/__init__.py +0 -15
  136. unrealon_llm/src/managers/cache_manager.py +0 -67
  137. unrealon_llm/src/managers/cost_manager.py +0 -107
  138. unrealon_llm/src/managers/request_manager.py +0 -298
  139. unrealon_llm/src/modules/__init__.py +0 -0
  140. unrealon_llm/src/modules/html_processor/__init__.py +0 -25
  141. unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
  142. unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
  143. unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
  144. unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
  145. unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
  146. unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
  147. unrealon_llm/src/modules/html_processor/processor.py +0 -102
  148. unrealon_llm/src/modules/llm/__init__.py +0 -0
  149. unrealon_llm/src/modules/translator/__init__.py +0 -0
  150. unrealon_llm/src/provider.py +0 -116
  151. unrealon_llm/src/utils/__init__.py +0 -95
  152. unrealon_llm/src/utils/common.py +0 -64
  153. unrealon_llm/src/utils/data_extractor.py +0 -188
  154. unrealon_llm/src/utils/html_cleaner.py +0 -767
  155. unrealon_llm/src/utils/language_detector.py +0 -308
  156. unrealon_llm/src/utils/models_cache.py +0 -592
  157. unrealon_llm/src/utils/smart_counter.py +0 -229
  158. unrealon_llm/src/utils/token_counter.py +0 -189
  159. unrealon_sdk/README.md +0 -25
  160. unrealon_sdk/__init__.py +0 -30
  161. unrealon_sdk/pyproject.toml +0 -231
  162. unrealon_sdk/src/__init__.py +0 -150
  163. unrealon_sdk/src/cli/__init__.py +0 -12
  164. unrealon_sdk/src/cli/commands/__init__.py +0 -22
  165. unrealon_sdk/src/cli/commands/benchmark.py +0 -42
  166. unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
  167. unrealon_sdk/src/cli/commands/health.py +0 -46
  168. unrealon_sdk/src/cli/commands/integration.py +0 -498
  169. unrealon_sdk/src/cli/commands/reports.py +0 -43
  170. unrealon_sdk/src/cli/commands/security.py +0 -36
  171. unrealon_sdk/src/cli/commands/server.py +0 -483
  172. unrealon_sdk/src/cli/commands/servers.py +0 -56
  173. unrealon_sdk/src/cli/commands/tests.py +0 -55
  174. unrealon_sdk/src/cli/main.py +0 -126
  175. unrealon_sdk/src/cli/utils/reporter.py +0 -519
  176. unrealon_sdk/src/clients/openapi.yaml +0 -3347
  177. unrealon_sdk/src/clients/python_http/__init__.py +0 -3
  178. unrealon_sdk/src/clients/python_http/api_config.py +0 -228
  179. unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
  180. unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
  181. unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
  182. unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
  183. unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
  184. unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
  185. unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
  186. unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
  187. unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
  188. unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
  189. unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
  190. unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
  191. unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
  192. unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
  193. unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
  194. unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
  195. unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
  196. unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
  197. unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
  198. unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
  199. unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
  200. unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
  201. unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
  202. unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
  203. unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
  204. unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
  205. unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
  206. unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
  207. unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
  208. unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
  209. unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
  210. unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
  211. unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
  212. unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
  213. unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
  214. unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
  215. unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
  216. unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
  217. unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
  218. unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
  219. unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
  220. unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
  221. unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
  222. unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
  223. unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
  224. unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
  225. unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
  226. unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
  227. unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
  228. unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
  229. unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
  230. unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
  231. unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
  232. unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
  233. unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
  234. unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
  235. unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
  236. unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
  237. unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
  238. unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
  239. unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
  240. unrealon_sdk/src/clients/python_websocket/client.py +0 -490
  241. unrealon_sdk/src/clients/python_websocket/events.py +0 -732
  242. unrealon_sdk/src/clients/python_websocket/example.py +0 -136
  243. unrealon_sdk/src/clients/python_websocket/types.py +0 -871
  244. unrealon_sdk/src/core/__init__.py +0 -64
  245. unrealon_sdk/src/core/client.py +0 -556
  246. unrealon_sdk/src/core/config.py +0 -465
  247. unrealon_sdk/src/core/exceptions.py +0 -239
  248. unrealon_sdk/src/core/metadata.py +0 -191
  249. unrealon_sdk/src/core/models.py +0 -142
  250. unrealon_sdk/src/core/types.py +0 -68
  251. unrealon_sdk/src/dto/__init__.py +0 -268
  252. unrealon_sdk/src/dto/authentication.py +0 -108
  253. unrealon_sdk/src/dto/cache.py +0 -208
  254. unrealon_sdk/src/dto/common.py +0 -19
  255. unrealon_sdk/src/dto/concurrency.py +0 -393
  256. unrealon_sdk/src/dto/events.py +0 -108
  257. unrealon_sdk/src/dto/health.py +0 -339
  258. unrealon_sdk/src/dto/load_balancing.py +0 -336
  259. unrealon_sdk/src/dto/logging.py +0 -230
  260. unrealon_sdk/src/dto/performance.py +0 -165
  261. unrealon_sdk/src/dto/rate_limiting.py +0 -295
  262. unrealon_sdk/src/dto/resource_pooling.py +0 -128
  263. unrealon_sdk/src/dto/structured_logging.py +0 -112
  264. unrealon_sdk/src/dto/task_scheduling.py +0 -121
  265. unrealon_sdk/src/dto/websocket.py +0 -55
  266. unrealon_sdk/src/enterprise/__init__.py +0 -59
  267. unrealon_sdk/src/enterprise/authentication.py +0 -401
  268. unrealon_sdk/src/enterprise/cache_manager.py +0 -578
  269. unrealon_sdk/src/enterprise/error_recovery.py +0 -494
  270. unrealon_sdk/src/enterprise/event_system.py +0 -549
  271. unrealon_sdk/src/enterprise/health_monitor.py +0 -747
  272. unrealon_sdk/src/enterprise/load_balancer.py +0 -964
  273. unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
  274. unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
  275. unrealon_sdk/src/enterprise/logging/development.py +0 -744
  276. unrealon_sdk/src/enterprise/logging/service.py +0 -410
  277. unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
  278. unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
  279. unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
  280. unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
  281. unrealon_sdk/src/enterprise/resource_pool.py +0 -763
  282. unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
  283. unrealon_sdk/src/internal/__init__.py +0 -10
  284. unrealon_sdk/src/internal/command_router.py +0 -497
  285. unrealon_sdk/src/internal/connection_manager.py +0 -397
  286. unrealon_sdk/src/internal/http_client.py +0 -446
  287. unrealon_sdk/src/internal/websocket_client.py +0 -420
  288. unrealon_sdk/src/provider.py +0 -471
  289. unrealon_sdk/src/utils.py +0 -234
  290. /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
  291. /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
  292. /unrealon_browser/{src/cli → cli}/main.py +0 -0
  293. /unrealon_browser/{src/core → core}/__init__.py +0 -0
  294. /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
  295. /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
  296. /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
  297. /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
  298. /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
  299. /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
  300. /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
  301. /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
  302. /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
@@ -1,415 +0,0 @@
1
- """
2
- Base HTML Processor
3
-
4
- Universal base class for HTML pattern extraction processors.
5
- Provides common functionality for listing and details processors.
6
- """
7
-
8
- from abc import ABC, abstractmethod
9
- import json
10
- import random
11
- from typing import Type
12
- import traceback
13
- import re
14
-
15
- from unrealon_llm.src.core import SmartLLMClient
16
- from unrealon_llm.src.dto import ChatMessage, MessageRole
17
- from unrealon_llm.src.utils.html_cleaner import SmartHTMLCleaner
18
- from unrealon_llm.src.utils.data_extractor import SmartDataExtractor
19
- from unrealon_llm.src.llm_logging import (
20
- get_llm_logger,
21
- initialize_development_logger,
22
- initialize_llm_logger,
23
- )
24
-
25
- from .models import (
26
- UniversalExtractionSchema,
27
- ProcessingInfo,
28
- ExtractionResult,
29
- )
30
-
31
- # Ensure loggers are initialized
32
- logger = get_llm_logger()
33
- if logger is None:
34
- try:
35
- initialize_development_logger()
36
- initialize_llm_logger()
37
- logger = get_llm_logger()
38
- except:
39
- logger = None
40
-
41
-
42
- class BaseHTMLProcessor(ABC):
43
- """Base class for HTML pattern extraction processors"""
44
-
45
- def __init__(self, llm_client: SmartLLMClient):
46
- """
47
- Initialize base processor
48
-
49
- Args:
50
- llm_client: LLM client for AI analysis
51
- """
52
- self.llm_client = llm_client
53
- self.cleaner = SmartHTMLCleaner()
54
- self.data_extractor = SmartDataExtractor()
55
-
56
- # Get processor-specific configuration
57
- self.processor_type = self.get_processor_type()
58
- self.schema_class = self.get_schema_class()
59
-
60
- logger.log_html_analysis_start(
61
- html_size_bytes=0, # Will be filled when processing
62
- target_elements=[self.processor_type],
63
- details={"processor_class": self.__class__.__name__},
64
- )
65
-
66
- @abstractmethod
67
- def get_processor_type(self) -> str:
68
- """Return processor type identifier"""
69
- pass
70
-
71
- @abstractmethod
72
- def get_schema_class(self) -> Type:
73
- """Return Pydantic schema class for this processor"""
74
- pass
75
-
76
- @abstractmethod
77
- def get_extraction_prompt_template(self) -> str:
78
- """Return extraction prompt template for this processor type"""
79
- pass
80
-
81
- def _trim_system_prompt(self, system_prompt: str) -> str:
82
- """Trim system prompt to remove empty lines"""
83
- return "\n".join(system_prompt.split("\n")[1:])
84
-
85
- async def extract_patterns(self, html_content: str) -> ExtractionResult:
86
- """
87
- Extract patterns from HTML using LLM intelligence
88
-
89
- Args:
90
- html_content: Raw HTML content
91
-
92
- Returns:
93
- ExtractionResult: Validated Pydantic result with extraction patterns and processing metadata
94
- """
95
- logger.log_html_analysis_start(
96
- html_size_bytes=len(html_content),
97
- target_elements=[self.processor_type],
98
- details={"processor_type": self.processor_type},
99
- )
100
-
101
- # Clean HTML first with aggressive cleaning for LLM analysis
102
- cleaned_html, extracted_data = self.cleaner.clean_html(
103
- html_content, preserve_js_data=True, aggressive_cleaning=True
104
- )
105
-
106
- cleaning_stats = self.cleaner.get_cleaning_stats(html_content, cleaned_html)
107
- logger.log_html_cleaning(
108
- original_size_bytes=len(html_content),
109
- cleaned_size_bytes=len(cleaned_html),
110
- optimization_type="aggressive",
111
- details=cleaning_stats,
112
- )
113
-
114
- # Build extraction prompt
115
- prompt = self._build_extraction_prompt(cleaned_html)
116
-
117
- # Log the full prompt for debugging
118
- logger.log_llm_request_start(
119
- provider="debug",
120
- model="prompt_debug",
121
- prompt_tokens=0,
122
- details={
123
- "full_prompt": prompt[:2000] + "..." if len(prompt) > 2000 else prompt,
124
- "schema_json": json.dumps(
125
- self.schema_class.model_json_schema(), indent=2
126
- ),
127
- },
128
- )
129
-
130
- # Add critical format requirements to the prompt
131
- SYSTEM_PROMPT = f"""
132
- You are an HTML-to-JSON expert at analyzing {self.processor_type} pages.
133
- You MUST return JSON that EXACTLY matches the Pydantic schema provided.
134
- RESPOND ONLY WITH VALID JSON.
135
- NO EXPLANATIONS, NO TEXT, ONLY JSON!
136
- Include ALL required fields from the schema!
137
- CRITICAL: The 'selectors' field must be a DICTIONARY/OBJECT, not a list!
138
- """
139
-
140
- # Prepare LLM messages
141
- messages = [
142
- ChatMessage(
143
- role=MessageRole.SYSTEM,
144
- content=self._trim_system_prompt(SYSTEM_PROMPT),
145
- ),
146
- ChatMessage(
147
- role=MessageRole.USER,
148
- content=prompt
149
- + "\n\nRESPOND ONLY WITH JSON! START WITH { AND END WITH }. NO OTHER TEXT!",
150
- ),
151
- ]
152
-
153
- logger.log_llm_request_start(
154
- provider="openrouter",
155
- model=getattr(self.llm_client, "model", "unknown"),
156
- prompt_tokens=len(prompt) // 4, # rough estimate
157
- details={"processor_type": self.processor_type},
158
- )
159
-
160
- response = None
161
- try:
162
- # Call LLM
163
- response = await self.llm_client.chat_completion(
164
- messages, response_model=self.schema_class
165
- )
166
-
167
- # Log full LLM response for debugging
168
- logger.log_llm_response_received(
169
- provider="openrouter",
170
- model=getattr(response, "model", "unknown"),
171
- completion_tokens=(
172
- getattr(response.usage, "completion_tokens", 0)
173
- if hasattr(response, "usage")
174
- else 0
175
- ),
176
- total_tokens=(
177
- getattr(response.usage, "total_tokens", 0)
178
- if hasattr(response, "usage")
179
- else 0
180
- ),
181
- cost_usd=getattr(response, "cost_usd", 0.0),
182
- details={"raw_response_full": response.content},
183
- )
184
-
185
- # Use the validated model from LLM response
186
- if hasattr(response, "extracted_model") and response.extracted_model:
187
- validated_model = response.extracted_model
188
- validated_result = validated_model.model_dump()
189
- logger.log_html_analysis_completed(
190
- selectors_generated=len(str(validated_result)),
191
- confidence_score=validated_result.get("confidence", 0.0),
192
- details={
193
- "processor_type": self.processor_type,
194
- "validation_success": True,
195
- "schema_matched": True,
196
- },
197
- )
198
- else:
199
- # Fallback: parse manually if no model provided
200
- result_data = self.data_extractor.extract_json(response.content)
201
- try:
202
- validated_model = self.schema_class(**result_data)
203
- validated_result = validated_model.model_dump()
204
- logger.log_html_analysis_completed(
205
- selectors_generated=len(str(result_data)),
206
- confidence_score=result_data.get("confidence", 0.0),
207
- details={
208
- "processor_type": self.processor_type,
209
- "validation_success": True,
210
- "schema_matched": True,
211
- },
212
- )
213
- except Exception as e:
214
- logger.log_html_analysis_failed(
215
- error_message=f"Pydantic validation failed: {str(e)}",
216
- details={
217
- "processor_type": self.processor_type,
218
- "validation_error": str(e),
219
- "raw_llm_response": result_data,
220
- },
221
- )
222
-
223
- # 🔥 SMART FALLBACK: Try to fix common LLM format issues
224
- try:
225
- fixed_data = self._fix_llm_response_format(result_data, str(e))
226
- validated_model = self.schema_class(**fixed_data)
227
- validated_result = validated_model.model_dump()
228
- logger.log_html_analysis_completed(
229
- selectors_generated=len(str(fixed_data)),
230
- confidence_score=fixed_data.get("confidence", 0.0),
231
- details={
232
- "processor_type": self.processor_type,
233
- "validation_success": True,
234
- "schema_matched": True,
235
- "format_fixed": True,
236
- },
237
- )
238
- except Exception as fix_error:
239
- logger.log_html_analysis_failed(
240
- error_message=f"Format fixing also failed: {str(fix_error)}",
241
- details={
242
- "processor_type": self.processor_type,
243
- "validation_error": str(e),
244
- "fix_error": str(fix_error),
245
- "raw_llm_response": result_data,
246
- },
247
- )
248
- # Final fallback: create minimal valid structure
249
- validated_result = self._create_fallback_result(
250
- result_data, str(e)
251
- )
252
-
253
- # Create Pydantic processing metadata
254
- processing_info = ProcessingInfo(
255
- original_html_size=len(html_content),
256
- cleaned_html_size=len(cleaned_html),
257
- cleaning_stats=cleaning_stats,
258
- extracted_js_data=extracted_data,
259
- processor_type=self.processor_type,
260
- llm_model=getattr(response, "model", "unknown"),
261
- tokens_used=(
262
- getattr(response.usage, "total_tokens", 0)
263
- if hasattr(response, "usage")
264
- else 0
265
- ),
266
- cost_usd=getattr(response, "cost_usd", 0.0),
267
- )
268
-
269
- # Return validated Pydantic result
270
- return ExtractionResult(
271
- extraction_result=validated_result,
272
- processing_info=processing_info,
273
- )
274
-
275
- except Exception as e:
276
- logger.log_html_analysis_failed(
277
- error_message=str(e),
278
- details={
279
- "processor_type": self.processor_type,
280
- "raw_response": getattr(response, "content", "No response"),
281
- "traceback": traceback.format_exc(),
282
- },
283
- )
284
- raise
285
-
286
- def _build_extraction_prompt(self, cleaned_html: str) -> str:
287
- """Build extraction prompt using processor-specific template"""
288
- # Processors handle their own prompt construction with schema and HTML
289
- # Just get the template and let it handle the details
290
- prompt_template = self.get_extraction_prompt_template()
291
-
292
- # Use more content for better analysis, but still respect token limits
293
- html_limit = 50000 # Increase from 15K to 50K characters
294
-
295
- # Build full prompt with auto-generated Pydantic 2 schema
296
- schema_json = json.dumps(self.schema_class.model_json_schema(), indent=2)
297
-
298
- # Add random number to bypass any caching
299
- cache_buster = random.randint(100000, 999999)
300
-
301
- schema_prompt = f"""
302
- PYDANTIC 2 SCHEMA (Request #{cache_buster}):
303
- {schema_json}
304
-
305
- 🚨 CRITICAL FORMAT REQUIREMENTS:
306
- 1. Return JSON that EXACTLY matches this schema structure!
307
- 2. The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation
308
- 3. The "selectors" field MUST be a DICTIONARY/OBJECT with field names as keys and arrays of CSS selectors as values
309
- 4. Example: "selectors": {{"title": ["h1.title", ".product-name"], "price": [".price", ".cost"]}}
310
- 5. DO NOT return "selectors" as a list: ❌ ["h1.title", ".price"]
311
- 6. DO return "selectors" as a dictionary: ✅ {{"title": ["h1.title"], "price": [".price"]}}
312
- """
313
- schema_prompt = self._trim_system_prompt(schema_prompt)
314
-
315
- return prompt_template.format(
316
- processor_type=self.processor_type,
317
- html_content=cleaned_html[:html_limit]
318
- + ("..." if len(cleaned_html) > html_limit else ""),
319
- schema=schema_prompt,
320
- )
321
-
322
- def get_cost_estimate(self, html_content: str) -> float:
323
- """
324
- Estimate cost for processing HTML content
325
-
326
- Args:
327
- html_content: HTML content to estimate
328
-
329
- Returns:
330
- Estimated cost in USD
331
- """
332
- # Clean HTML to get realistic token count
333
- cleaned_html, _ = self.cleaner.clean_html(
334
- html_content, aggressive_cleaning=True
335
- )
336
-
337
- # Rough token estimation (1 token ≈ 4 characters)
338
- estimated_tokens = len(cleaned_html) / 4
339
-
340
- # Add prompt overhead (approximately 500 tokens)
341
- total_tokens = estimated_tokens + 500
342
-
343
- # Estimate cost (Claude Haiku: ~$0.25 per 1M input tokens)
344
- estimated_cost = (total_tokens / 1_000_000) * 0.25
345
-
346
- return estimated_cost
347
-
348
- def _fix_llm_response_format(self, result_data: dict, error_message: str) -> dict:
349
- """Fix common LLM response format issues."""
350
- fixed_data = result_data.copy()
351
-
352
- # Fix selectors if it's a list instead of dict
353
- if "selectors" in fixed_data and isinstance(fixed_data["selectors"], list):
354
- logger.log_html_analysis_failed(
355
- error_message="Fixing selectors format: list -> dict",
356
- details={
357
- "processor_type": self.processor_type,
358
- "original_selectors": fixed_data["selectors"],
359
- },
360
- )
361
-
362
- # Convert list to dict with generic field names
363
- selectors_list = fixed_data["selectors"]
364
- fixed_data["selectors"] = {}
365
-
366
- # Try to intelligently map list items to field names
367
- field_names = ["item", "title", "price", "description", "image", "link"]
368
- for i, selector in enumerate(selectors_list):
369
- if i < len(field_names):
370
- field_name = field_names[i]
371
- else:
372
- field_name = f"field_{i+1}"
373
-
374
- # Convert single selector to list
375
- if isinstance(selector, str):
376
- fixed_data["selectors"][field_name] = [selector]
377
- elif isinstance(selector, list):
378
- fixed_data["selectors"][field_name] = selector
379
- else:
380
- fixed_data["selectors"][field_name] = [str(selector)]
381
-
382
- # Ensure all required fields exist
383
- required_fields = [
384
- "detected_item_type",
385
- "extraction_strategy",
386
- "confidence",
387
- "selectors",
388
- "documentation",
389
- ]
390
- for field in required_fields:
391
- if field not in fixed_data:
392
- if field == "detected_item_type":
393
- fixed_data[field] = "unknown"
394
- elif field == "extraction_strategy":
395
- fixed_data[field] = "fallback_strategy"
396
- elif field == "confidence":
397
- fixed_data[field] = 0.1
398
- elif field == "selectors":
399
- fixed_data[field] = {}
400
- elif field == "documentation":
401
- fixed_data[field] = (
402
- "Extraction completed with fallback processing due to format issues."
403
- )
404
-
405
- return fixed_data
406
-
407
- def _create_fallback_result(self, result_data: dict, error_message: str) -> dict:
408
- """Create a minimal valid result when all else fails."""
409
- return {
410
- "detected_item_type": "unknown",
411
- "extraction_strategy": "fallback_strategy",
412
- "confidence": 0.1,
413
- "selectors": {},
414
- "documentation": f"Extraction failed due to validation error: {error_message}. Raw data: {str(result_data)[:500]}...",
415
- }
@@ -1,85 +0,0 @@
1
- """
2
- Details Processor
3
-
4
- Universal processor for detail/product/item pages.
5
- Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
6
- """
7
-
8
- from typing import Type
9
-
10
- from .base_processor import BaseHTMLProcessor
11
- from .models import UniversalExtractionSchema
12
-
13
-
14
- class DetailsProcessor(BaseHTMLProcessor):
15
- """Universal details page pattern extractor"""
16
-
17
- def get_processor_type(self) -> str:
18
- """Return processor type identifier"""
19
- return "details"
20
-
21
- def get_schema_class(self) -> Type[UniversalExtractionSchema]:
22
- """Return Pydantic schema class for details extraction"""
23
- return UniversalExtractionSchema
24
-
25
- def get_extraction_prompt_template(self) -> str:
26
- """Return details-specific extraction prompt template"""
27
-
28
- prompt = """{schema}
29
- [__TASK_DESCRIPTION__]
30
- Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
31
- PROCESSOR TYPE: {processor_type}
32
- THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
33
- [/__TASK_DESCRIPTION__]
34
-
35
- [__CRITICAL_FORMAT_REQUIREMENTS__]
36
- 🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
37
- Example of CORRECT format:
38
- "selectors": {{
39
- "title": ["h1.product-title", "h1.page-title", ".item-name"],
40
- "price": [".price", ".cost", "span[data-price]", ".product-price"],
41
- "description": [".description", ".product-desc", ".item-details"],
42
- "images": ["img.product-image", ".gallery img", "img[src*='product']"],
43
- "specifications": [".specs", ".product-specs", ".item-specifications"],
44
- "reviews": [".reviews", ".product-reviews", ".customer-reviews"]
45
- }}
46
-
47
- ❌ WRONG format (DO NOT USE):
48
- "selectors": ["h1.title", ".price", ".description"]
49
-
50
- ✅ CORRECT format (USE THIS):
51
- "selectors": {{
52
- "title": ["h1.title", ".product-name", "h1[itemprop='name']"],
53
- "price": [".price", ".cost", "span[data-price]"],
54
- "description": [".description", ".product-desc", ".item-details"]
55
- }}
56
- [/__CRITICAL_FORMAT_REQUIREMENTS__]
57
-
58
- [__INSTRUCTIONS__]
59
- YOUR TASK:
60
- Analyze this details page and generate extraction patterns for ANY type of item.
61
- This could be: product details, service info, article content, job description, real estate listing, person profile, etc.
62
-
63
- CRITICAL REQUIREMENTS:
64
- 1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
65
- 2. Include comprehensive markdown documentation
66
- 3. Provide real examples from the actual HTML
67
- 4. Explain the page structure and best extraction approach
68
- 5. Include confidence scores and fallback strategies
69
- 6. Document any special handling needed
70
-
71
- ANALYZE THE HTML AND DETERMINE:
72
- - What type of item this page describes
73
- - What information is available (specs, pricing, reviews, etc.)
74
- - How content is structured and organized
75
- - What actions are possible (buy, contact, etc.)
76
- - Best extraction strategy for this specific page
77
- [/__INSTRUCTIONS__]
78
-
79
- [__HTML_CONTENT__]
80
- HTML CONTENT (first 50KB):
81
- {html_content}
82
- [/__HTML_CONTENT__]
83
- """
84
-
85
- return self._trim_system_prompt(prompt)
@@ -1,91 +0,0 @@
1
- """
2
- Listing Processor
3
-
4
- Universal processor for listing/catalog pages.
5
- Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
6
- """
7
-
8
- from typing import Type
9
-
10
- from .base_processor import BaseHTMLProcessor
11
- from .models import UniversalExtractionSchema
12
-
13
-
14
- class ListingProcessor(BaseHTMLProcessor):
15
- """Universal listing page pattern extractor"""
16
-
17
- def get_processor_type(self) -> str:
18
- """Return processor type identifier"""
19
- return "listing"
20
-
21
- def get_schema_class(self) -> Type[UniversalExtractionSchema]:
22
- """Return Pydantic schema class for listing extraction"""
23
- return UniversalExtractionSchema
24
-
25
- def get_extraction_prompt_template(self) -> str:
26
- """Return listing-specific extraction prompt template"""
27
-
28
- prompt = """{schema}
29
-
30
- [__TASK_DESCRIPTION__]
31
- Analyze this LISTING/CATALOG page and generate universal extraction patterns.
32
- PROCESSOR TYPE: {processor_type}
33
- THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
34
- [/__TASK_DESCRIPTION__]
35
-
36
- [__CRITICAL_FORMAT_REQUIREMENTS__]
37
- 🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
38
- Example of CORRECT format:
39
- "selectors": {{
40
- "items_container": ["div.product-grid", "ul.product-list", "div.items"],
41
- "item_title": ["h3.product-title", "a.product-link", ".item-name"],
42
- "item_price": [".price", ".cost", "span[data-price]"],
43
- "item_image": ["img.product-image", ".item-img", "img[src*='product']"],
44
- "pagination": [".pagination", ".page-nav", "nav[aria-label='pagination']"]
45
- }}
46
-
47
- ❌ WRONG format (DO NOT USE):
48
- "selectors": ["div.product", "h3.title", ".price"]
49
-
50
- ✅ CORRECT format (USE THIS):
51
- "selectors": {{
52
- "items": ["div.product", "li.item", ".product-card"],
53
- "titles": ["h3.title", ".product-name", "a[title]"],
54
- "prices": [".price", ".cost", "span[data-price]"]
55
- }}
56
- [/__CRITICAL_FORMAT_REQUIREMENTS__]
57
-
58
- [__INSTRUCTIONS__]
59
- YOUR TASK:
60
- Analyze this listing page and generate extraction patterns for ANY type of items.
61
- This could be: products, services, articles, jobs, real estate, people, cars, etc.
62
-
63
- CRITICAL REQUIREMENTS:
64
- 1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
65
- 2. This is a LISTING PAGE with multiple items
66
- 3. Focus on identifying item containers and individual item patterns
67
- 4. Detect ANY type of items - not just products!
68
- 5. Provide multiple fallback selectors for reliability
69
- 6. Include pagination and navigation patterns
70
- 7. Use realistic confidence scores (0.1-1.0)
71
- 8. Auto-detect what type of content this listing contains
72
- 9. Provide extraction strategy advice
73
- 10. Look for structured data (JSON-LD, microdata)
74
- 11. Generate patterns that work with BeautifulSoup4 .select() method
75
- 12. RETURN JSON that EXACTLY matches the Pydantic schema above!
76
-
77
- ANALYZE THE HTML AND DETERMINE:
78
- - What type of items are listed (products, services, articles, etc.)
79
- - How items are structured and contained
80
- - What navigation elements exist
81
- - What metadata is available
82
- - Best extraction strategy for this specific page
83
- [/__INSTRUCTIONS__]
84
-
85
- [__HTML_CONTENT__]
86
- HTML CONTENT (first 50KB):
87
- {html_content}
88
- [/__HTML_CONTENT__]
89
- """
90
-
91
- return self._trim_system_prompt(prompt)
@@ -1,20 +0,0 @@
1
- """
2
- HTML Processor Models
3
-
4
- Simplified universal model for HTML pattern extraction with markdown documentation.
5
- """
6
-
7
- # Universal model
8
- from .universal_model import UniversalExtractionSchema
9
-
10
- # Processing models
11
- from .processing_models import ProcessingInfo, ExtractionResult
12
-
13
- __all__ = [
14
- # Universal model
15
- "UniversalExtractionSchema",
16
-
17
- # Processing models
18
- "ProcessingInfo",
19
- "ExtractionResult",
20
- ]
@@ -1,40 +0,0 @@
1
- """
2
- Processing Models for HTML Processing
3
-
4
- Pydantic models for processing metadata and results.
5
- """
6
-
7
- from typing import Dict, Any
8
- from pydantic import BaseModel, Field, ConfigDict
9
-
10
-
11
- class ProcessingInfo(BaseModel):
12
- """Processing metadata and statistics"""
13
-
14
- model_config = ConfigDict(
15
- validate_assignment=True,
16
- extra="forbid",
17
- title="Processing Information"
18
- )
19
-
20
- original_html_size: int = Field(..., description="Original HTML size in bytes")
21
- cleaned_html_size: int = Field(..., description="Cleaned HTML size in bytes")
22
- cleaning_stats: Dict[str, Any] = Field(..., description="HTML cleaning statistics")
23
- extracted_js_data: Dict[str, Any] = Field(..., description="Extracted JavaScript data")
24
- processor_type: str = Field(..., description="Type of processor used")
25
- llm_model: str = Field(..., description="LLM model used for extraction")
26
- tokens_used: int = Field(..., description="Total tokens used in LLM request")
27
- cost_usd: float = Field(..., description="Cost of LLM request in USD")
28
-
29
-
30
- class ExtractionResult(BaseModel):
31
- """Complete extraction result with metadata"""
32
-
33
- model_config = ConfigDict(
34
- validate_assignment=True,
35
- extra="forbid",
36
- title="Extraction Result"
37
- )
38
-
39
- extraction_result: Dict[str, Any] = Field(..., description="Raw extraction patterns")
40
- processing_info: ProcessingInfo = Field(..., description="Processing metadata")