unrealon 1.0.5__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. {unrealon-1.0.5 → unrealon-1.0.6}/PKG-INFO +1 -1
  2. {unrealon-1.0.5 → unrealon-1.0.6}/pyproject.toml +1 -1
  3. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon/sdk_config.py +1 -1
  4. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/base_processor.py +134 -13
  5. unrealon-1.0.6/unrealon_llm/src/modules/html_processor/details_processor.py +85 -0
  6. unrealon-1.0.6/unrealon_llm/src/modules/html_processor/listing_processor.py +91 -0
  7. unrealon-1.0.5/unrealon_llm/src/modules/html_processor/details_processor.py +0 -61
  8. unrealon-1.0.5/unrealon_llm/src/modules/html_processor/listing_processor.py +0 -67
  9. {unrealon-1.0.5 → unrealon-1.0.6}/LICENSE +0 -0
  10. {unrealon-1.0.5 → unrealon-1.0.6}/README.md +0 -0
  11. {unrealon-1.0.5 → unrealon-1.0.6}/requirements-dev.txt +0 -0
  12. {unrealon-1.0.5 → unrealon-1.0.6}/requirements.txt +0 -0
  13. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon/__init__.py +0 -0
  14. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/README.md +0 -0
  15. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/__init__.py +0 -0
  16. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/pyproject.toml +0 -0
  17. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/__init__.py +0 -0
  18. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/__init__.py +0 -0
  19. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/browser_cli.py +0 -0
  20. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/cookies_cli.py +0 -0
  21. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/interactive_mode.py +0 -0
  22. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/main.py +0 -0
  23. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/core/__init__.py +0 -0
  24. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/core/browser_manager.py +0 -0
  25. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/__init__.py +0 -0
  26. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/config.py +0 -0
  27. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/core.py +0 -0
  28. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/dataclasses.py +0 -0
  29. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/detection.py +0 -0
  30. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/enums.py +0 -0
  31. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/statistics.py +0 -0
  32. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/__init__.py +0 -0
  33. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/captcha.py +0 -0
  34. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/cookies.py +0 -0
  35. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/logger_bridge.py +0 -0
  36. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/profile.py +0 -0
  37. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/stealth.py +0 -0
  38. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/README.md +0 -0
  39. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/__init__.py +0 -0
  40. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/pyproject.toml +0 -0
  41. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/__init__.py +0 -0
  42. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/__init__.py +0 -0
  43. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/main.py +0 -0
  44. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/simple.py +0 -0
  45. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/config/__init__.py +0 -0
  46. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/config/auto_config.py +0 -0
  47. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/__init__.py +0 -0
  48. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/exceptions.py +0 -0
  49. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/parser.py +0 -0
  50. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/__init__.py +0 -0
  51. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/cli.py +0 -0
  52. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/config.py +0 -0
  53. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/events.py +0 -0
  54. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/execution.py +0 -0
  55. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/services.py +0 -0
  56. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/__init__.py +0 -0
  57. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/daemon_mode.py +0 -0
  58. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/interactive_mode.py +0 -0
  59. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/modes.py +0 -0
  60. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/scheduled_mode.py +0 -0
  61. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/test_mode.py +0 -0
  62. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/logging/__init__.py +0 -0
  63. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/logging/driver_logger.py +0 -0
  64. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/__init__.py +0 -0
  65. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/browser_service.py +0 -0
  66. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/__init__.py +0 -0
  67. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/browser_llm_service.py +0 -0
  68. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/llm.py +0 -0
  69. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/logger_service.py +0 -0
  70. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/metrics_service.py +0 -0
  71. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/scheduler_service.py +0 -0
  72. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/websocket_service.py +0 -0
  73. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/__init__.py +0 -0
  74. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/service_factory.py +0 -0
  75. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/time_formatter.py +0 -0
  76. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/README.md +0 -0
  77. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/__init__.py +0 -0
  78. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/pyproject.toml +0 -0
  79. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/__init__.py +0 -0
  80. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/cli/__init__.py +0 -0
  81. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/core/__init__.py +0 -0
  82. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/core/smart_client.py +0 -0
  83. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/__init__.py +0 -0
  84. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/__init__.py +0 -0
  85. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/config.py +0 -0
  86. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/core.py +0 -0
  87. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/enums.py +0 -0
  88. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/html_analysis.py +0 -0
  89. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/statistics.py +0 -0
  90. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/translation.py +0 -0
  91. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/type_conversion.py +0 -0
  92. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/schemas/__init__.py +0 -0
  93. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/exceptions.py +0 -0
  94. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_config/__init__.py +0 -0
  95. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_config/logging_config.py +0 -0
  96. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/__init__.py +0 -0
  97. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/llm_events.py +0 -0
  98. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/llm_logger.py +0 -0
  99. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/__init__.py +0 -0
  100. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/cache_manager.py +0 -0
  101. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/cost_manager.py +0 -0
  102. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/request_manager.py +0 -0
  103. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/__init__.py +0 -0
  104. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/__init__.py +0 -0
  105. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/__init__.py +0 -0
  106. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -0
  107. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -0
  108. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/processor.py +0 -0
  109. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/llm/__init__.py +0 -0
  110. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/translator/__init__.py +0 -0
  111. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/provider.py +0 -0
  112. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/__init__.py +0 -0
  113. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/common.py +0 -0
  114. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/data_extractor.py +0 -0
  115. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/html_cleaner.py +0 -0
  116. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/language_detector.py +0 -0
  117. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/models_cache.py +0 -0
  118. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/smart_counter.py +0 -0
  119. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/token_counter.py +0 -0
  120. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/README.md +0 -0
  121. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/__init__.py +0 -0
  122. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/pyproject.toml +0 -0
  123. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/__init__.py +0 -0
  124. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/__init__.py +0 -0
  125. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/__init__.py +0 -0
  126. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/benchmark.py +0 -0
  127. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/diagnostics.py +0 -0
  128. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/health.py +0 -0
  129. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/integration.py +0 -0
  130. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/reports.py +0 -0
  131. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/security.py +0 -0
  132. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/server.py +0 -0
  133. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/servers.py +0 -0
  134. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/tests.py +0 -0
  135. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/main.py +0 -0
  136. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/utils/reporter.py +0 -0
  137. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/openapi.yaml +0 -0
  138. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/__init__.py +0 -0
  139. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/api_config.py +0 -0
  140. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -0
  141. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -0
  142. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -0
  143. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -0
  144. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -0
  145. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -0
  146. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -0
  147. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -0
  148. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -0
  149. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -0
  150. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -0
  151. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -0
  152. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -0
  153. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -0
  154. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -0
  155. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -0
  156. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -0
  157. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -0
  158. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -0
  159. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -0
  160. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -0
  161. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -0
  162. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -0
  163. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -0
  164. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -0
  165. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -0
  166. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -0
  167. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -0
  168. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -0
  169. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -0
  170. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -0
  171. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -0
  172. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -0
  173. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -0
  174. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -0
  175. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -0
  176. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -0
  177. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -0
  178. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -0
  179. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -0
  180. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -0
  181. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -0
  182. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -0
  183. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -0
  184. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/__init__.py +0 -0
  185. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/None_service.py +0 -0
  186. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -0
  187. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -0
  188. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -0
  189. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -0
  190. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -0
  191. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
  192. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -0
  193. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -0
  194. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -0
  195. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -0
  196. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -0
  197. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -0
  198. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -0
  199. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -0
  200. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/__init__.py +0 -0
  201. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/client.py +0 -0
  202. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/events.py +0 -0
  203. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/example.py +0 -0
  204. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/types.py +0 -0
  205. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/__init__.py +0 -0
  206. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/client.py +0 -0
  207. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/config.py +0 -0
  208. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/exceptions.py +0 -0
  209. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/metadata.py +0 -0
  210. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/models.py +0 -0
  211. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/types.py +0 -0
  212. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/__init__.py +0 -0
  213. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/authentication.py +0 -0
  214. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/cache.py +0 -0
  215. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/common.py +0 -0
  216. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/concurrency.py +0 -0
  217. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/events.py +0 -0
  218. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/health.py +0 -0
  219. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/load_balancing.py +0 -0
  220. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/logging.py +0 -0
  221. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/performance.py +0 -0
  222. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/rate_limiting.py +0 -0
  223. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/resource_pooling.py +0 -0
  224. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/structured_logging.py +0 -0
  225. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/task_scheduling.py +0 -0
  226. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/websocket.py +0 -0
  227. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/__init__.py +0 -0
  228. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/authentication.py +0 -0
  229. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/cache_manager.py +0 -0
  230. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/error_recovery.py +0 -0
  231. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/event_system.py +0 -0
  232. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/health_monitor.py +0 -0
  233. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/load_balancer.py +0 -0
  234. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/__init__.py +0 -0
  235. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/cleanup.py +0 -0
  236. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/development.py +0 -0
  237. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/service.py +0 -0
  238. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/multithreading_manager.py +0 -0
  239. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/performance_monitor.py +0 -0
  240. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/proxy_manager.py +0 -0
  241. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/rate_limiter.py +0 -0
  242. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/resource_pool.py +0 -0
  243. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/task_scheduler.py +0 -0
  244. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/__init__.py +0 -0
  245. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/command_router.py +0 -0
  246. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/connection_manager.py +0 -0
  247. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/http_client.py +0 -0
  248. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/websocket_client.py +0 -0
  249. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/provider.py +0 -0
  250. {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: unrealon
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: AI-powered web scraping platform with real-time orchestration
5
5
  License: MIT
6
6
  Author: Unrealon Team
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "unrealon"
7
- version = "1.0.5"
7
+ version = "1.0.6"
8
8
  description = "AI-powered web scraping platform with real-time orchestration"
9
9
  authors = ["Unrealon Team <dev@unrealon.com>"]
10
10
  readme = "README.md"
@@ -8,7 +8,7 @@ import os
8
8
  from pydantic import BaseModel, Field, ConfigDict
9
9
 
10
10
  # Simple version constants
11
- VERSION = "1.0.5"
11
+ VERSION = "1.0.6"
12
12
 
13
13
  # Project info
14
14
  AUTHOR = "UnrealOn Team"
@@ -10,6 +10,7 @@ import json
10
10
  import random
11
11
  from typing import Type
12
12
  import traceback
13
+ import re
13
14
 
14
15
  from unrealon_llm.src.core import SmartLLMClient
15
16
  from unrealon_llm.src.dto import ChatMessage, MessageRole
@@ -77,6 +78,10 @@ class BaseHTMLProcessor(ABC):
77
78
  """Return extraction prompt template for this processor type"""
78
79
  pass
79
80
 
81
+ def _trim_system_prompt(self, system_prompt: str) -> str:
82
+ """Trim system prompt to remove empty lines"""
83
+ return "\n".join(system_prompt.split("\n")[1:])
84
+
80
85
  async def extract_patterns(self, html_content: str) -> ExtractionResult:
81
86
  """
82
87
  Extract patterns from HTML using LLM intelligence
@@ -116,15 +121,27 @@ class BaseHTMLProcessor(ABC):
116
121
  prompt_tokens=0,
117
122
  details={
118
123
  "full_prompt": prompt[:2000] + "..." if len(prompt) > 2000 else prompt,
119
- "schema_json": json.dumps(self.schema_class.model_json_schema(), indent=2)
120
- }
124
+ "schema_json": json.dumps(
125
+ self.schema_class.model_json_schema(), indent=2
126
+ ),
127
+ },
121
128
  )
122
129
 
130
+ # Add critical format requirements to the prompt
131
+ SYSTEM_PROMPT = f"""
132
+ You are an HTML-to-JSON expert at analyzing {self.processor_type} pages.
133
+ You MUST return JSON that EXACTLY matches the Pydantic schema provided.
134
+ RESPOND ONLY WITH VALID JSON.
135
+ NO EXPLANATIONS, NO TEXT, ONLY JSON!
136
+ Include ALL required fields from the schema!
137
+ CRITICAL: The 'selectors' field must be a DICTIONARY/OBJECT, not a list!
138
+ """
139
+
123
140
  # Prepare LLM messages
124
141
  messages = [
125
142
  ChatMessage(
126
143
  role=MessageRole.SYSTEM,
127
- content=f"You are an HTML-to-JSON expert at analyzing {self.processor_type} pages. You MUST return JSON that EXACTLY matches the Pydantic schema provided. RESPOND ONLY WITH VALID JSON. NO EXPLANATIONS, NO TEXT, ONLY JSON! Include ALL required fields from the schema!",
144
+ content=self._trim_system_prompt(SYSTEM_PROMPT),
128
145
  ),
129
146
  ChatMessage(
130
147
  role=MessageRole.USER,
@@ -144,8 +161,7 @@ class BaseHTMLProcessor(ABC):
144
161
  try:
145
162
  # Call LLM
146
163
  response = await self.llm_client.chat_completion(
147
- messages,
148
- response_model=self.schema_class
164
+ messages, response_model=self.schema_class
149
165
  )
150
166
 
151
167
  # Log full LLM response for debugging
@@ -167,7 +183,7 @@ class BaseHTMLProcessor(ABC):
167
183
  )
168
184
 
169
185
  # Use the validated model from LLM response
170
- if hasattr(response, 'extracted_model') and response.extracted_model:
186
+ if hasattr(response, "extracted_model") and response.extracted_model:
171
187
  validated_model = response.extracted_model
172
188
  validated_result = validated_model.model_dump()
173
189
  logger.log_html_analysis_completed(
@@ -203,8 +219,36 @@ class BaseHTMLProcessor(ABC):
203
219
  "raw_llm_response": result_data,
204
220
  },
205
221
  )
206
- # Fall back to raw data
207
- validated_result = result_data
222
+
223
+ # 🔥 SMART FALLBACK: Try to fix common LLM format issues
224
+ try:
225
+ fixed_data = self._fix_llm_response_format(result_data, str(e))
226
+ validated_model = self.schema_class(**fixed_data)
227
+ validated_result = validated_model.model_dump()
228
+ logger.log_html_analysis_completed(
229
+ selectors_generated=len(str(fixed_data)),
230
+ confidence_score=fixed_data.get("confidence", 0.0),
231
+ details={
232
+ "processor_type": self.processor_type,
233
+ "validation_success": True,
234
+ "schema_matched": True,
235
+ "format_fixed": True,
236
+ },
237
+ )
238
+ except Exception as fix_error:
239
+ logger.log_html_analysis_failed(
240
+ error_message=f"Format fixing also failed: {str(fix_error)}",
241
+ details={
242
+ "processor_type": self.processor_type,
243
+ "validation_error": str(e),
244
+ "fix_error": str(fix_error),
245
+ "raw_llm_response": result_data,
246
+ },
247
+ )
248
+ # Final fallback: create minimal valid structure
249
+ validated_result = self._create_fallback_result(
250
+ result_data, str(e)
251
+ )
208
252
 
209
253
  # Create Pydantic processing metadata
210
254
  processing_info = ProcessingInfo(
@@ -253,12 +297,20 @@ class BaseHTMLProcessor(ABC):
253
297
 
254
298
  # Add random number to bypass any caching
255
299
  cache_buster = random.randint(100000, 999999)
256
-
257
- schema_prompt = f"""PYDANTIC 2 SCHEMA (Request #{cache_buster}):
258
- {schema_json}
259
300
 
260
- CRITICAL: Return JSON that EXACTLY matches this schema structure!
261
- The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation."""
301
+ schema_prompt = f"""
302
+ PYDANTIC 2 SCHEMA (Request #{cache_buster}):
303
+ {schema_json}
304
+
305
+ 🚨 CRITICAL FORMAT REQUIREMENTS:
306
+ 1. Return JSON that EXACTLY matches this schema structure!
307
+ 2. The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation
308
+ 3. The "selectors" field MUST be a DICTIONARY/OBJECT with field names as keys and arrays of CSS selectors as values
309
+ 4. Example: "selectors": {{"title": ["h1.title", ".product-name"], "price": [".price", ".cost"]}}
310
+ 5. DO NOT return "selectors" as a list: ❌ ["h1.title", ".price"]
311
+ 6. DO return "selectors" as a dictionary: ✅ {{"title": ["h1.title"], "price": [".price"]}}
312
+ """
313
+ schema_prompt = self._trim_system_prompt(schema_prompt)
262
314
 
263
315
  return prompt_template.format(
264
316
  processor_type=self.processor_type,
@@ -292,3 +344,72 @@ The response must include ALL required fields: detected_item_type, extraction_st
292
344
  estimated_cost = (total_tokens / 1_000_000) * 0.25
293
345
 
294
346
  return estimated_cost
347
+
348
+ def _fix_llm_response_format(self, result_data: dict, error_message: str) -> dict:
349
+ """Fix common LLM response format issues."""
350
+ fixed_data = result_data.copy()
351
+
352
+ # Fix selectors if it's a list instead of dict
353
+ if "selectors" in fixed_data and isinstance(fixed_data["selectors"], list):
354
+ logger.log_html_analysis_failed(
355
+ error_message="Fixing selectors format: list -> dict",
356
+ details={
357
+ "processor_type": self.processor_type,
358
+ "original_selectors": fixed_data["selectors"],
359
+ },
360
+ )
361
+
362
+ # Convert list to dict with generic field names
363
+ selectors_list = fixed_data["selectors"]
364
+ fixed_data["selectors"] = {}
365
+
366
+ # Try to intelligently map list items to field names
367
+ field_names = ["item", "title", "price", "description", "image", "link"]
368
+ for i, selector in enumerate(selectors_list):
369
+ if i < len(field_names):
370
+ field_name = field_names[i]
371
+ else:
372
+ field_name = f"field_{i+1}"
373
+
374
+ # Convert single selector to list
375
+ if isinstance(selector, str):
376
+ fixed_data["selectors"][field_name] = [selector]
377
+ elif isinstance(selector, list):
378
+ fixed_data["selectors"][field_name] = selector
379
+ else:
380
+ fixed_data["selectors"][field_name] = [str(selector)]
381
+
382
+ # Ensure all required fields exist
383
+ required_fields = [
384
+ "detected_item_type",
385
+ "extraction_strategy",
386
+ "confidence",
387
+ "selectors",
388
+ "documentation",
389
+ ]
390
+ for field in required_fields:
391
+ if field not in fixed_data:
392
+ if field == "detected_item_type":
393
+ fixed_data[field] = "unknown"
394
+ elif field == "extraction_strategy":
395
+ fixed_data[field] = "fallback_strategy"
396
+ elif field == "confidence":
397
+ fixed_data[field] = 0.1
398
+ elif field == "selectors":
399
+ fixed_data[field] = {}
400
+ elif field == "documentation":
401
+ fixed_data[field] = (
402
+ "Extraction completed with fallback processing due to format issues."
403
+ )
404
+
405
+ return fixed_data
406
+
407
+ def _create_fallback_result(self, result_data: dict, error_message: str) -> dict:
408
+ """Create a minimal valid result when all else fails."""
409
+ return {
410
+ "detected_item_type": "unknown",
411
+ "extraction_strategy": "fallback_strategy",
412
+ "confidence": 0.1,
413
+ "selectors": {},
414
+ "documentation": f"Extraction failed due to validation error: {error_message}. Raw data: {str(result_data)[:500]}...",
415
+ }
@@ -0,0 +1,85 @@
1
+ """
2
+ Details Processor
3
+
4
+ Universal processor for detail/product/item pages.
5
+ Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
6
+ """
7
+
8
+ from typing import Type
9
+
10
+ from .base_processor import BaseHTMLProcessor
11
+ from .models import UniversalExtractionSchema
12
+
13
+
14
class DetailsProcessor(BaseHTMLProcessor):
    """Pattern extractor for pages that describe a single item."""

    def get_processor_type(self) -> str:
        """Identify this processor as the ``"details"`` variant."""
        return "details"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Return the schema class used for details extraction results."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Build the prompt template for details pages.

        The template exposes three placeholders: ``{schema}``,
        ``{processor_type}`` and ``{html_content}``. Literal JSON braces
        are doubled (``{{`` / ``}}``), presumably because the caller
        fills the template with ``str.format`` -- confirm against
        the base class.

        Returns:
            The template after it has been passed through
            ``self._trim_system_prompt``.
        """
        # NOTE(review): the line layout inside this literal mirrors the
        # text as it is visible in the package diff; verify in-string
        # indentation against the packaged file before relying on it.
        template = """{schema}
[__TASK_DESCRIPTION__]
Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
[/__TASK_DESCRIPTION__]

[__CRITICAL_FORMAT_REQUIREMENTS__]
🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
Example of CORRECT format:
"selectors": {{
"title": ["h1.product-title", "h1.page-title", ".item-name"],
"price": [".price", ".cost", "span[data-price]", ".product-price"],
"description": [".description", ".product-desc", ".item-details"],
"images": ["img.product-image", ".gallery img", "img[src*='product']"],
"specifications": [".specs", ".product-specs", ".item-specifications"],
"reviews": [".reviews", ".product-reviews", ".customer-reviews"]
}}

❌ WRONG format (DO NOT USE):
"selectors": ["h1.title", ".price", ".description"]

✅ CORRECT format (USE THIS):
"selectors": {{
"title": ["h1.title", ".product-name", "h1[itemprop='name']"],
"price": [".price", ".cost", "span[data-price]"],
"description": [".description", ".product-desc", ".item-details"]
}}
[/__CRITICAL_FORMAT_REQUIREMENTS__]

[__INSTRUCTIONS__]
YOUR TASK:
Analyze this details page and generate extraction patterns for ANY type of item.
This could be: product details, service info, article content, job description, real estate listing, person profile, etc.

CRITICAL REQUIREMENTS:
1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
2. Include comprehensive markdown documentation
3. Provide real examples from the actual HTML
4. Explain the page structure and best extraction approach
5. Include confidence scores and fallback strategies
6. Document any special handling needed

ANALYZE THE HTML AND DETERMINE:
- What type of item this page describes
- What information is available (specs, pricing, reviews, etc.)
- How content is structured and organized
- What actions are possible (buy, contact, etc.)
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]

[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        trimmed = self._trim_system_prompt(template)
        return trimmed
@@ -0,0 +1,91 @@
1
+ """
2
+ Listing Processor
3
+
4
+ Universal processor for listing/catalog pages.
5
+ Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
6
+ """
7
+
8
+ from typing import Type
9
+
10
+ from .base_processor import BaseHTMLProcessor
11
+ from .models import UniversalExtractionSchema
12
+
13
+
14
class ListingProcessor(BaseHTMLProcessor):
    """Pattern extractor for pages that present many items at once."""

    def get_processor_type(self) -> str:
        """Identify this processor as the ``"listing"`` variant."""
        return "listing"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Return the schema class used for listing extraction results."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Build the prompt template for listing/catalog pages.

        The template exposes three placeholders: ``{schema}``,
        ``{processor_type}`` and ``{html_content}``. Literal JSON braces
        are doubled (``{{`` / ``}}``), presumably because the caller
        fills the template with ``str.format`` -- confirm against
        the base class.

        Returns:
            The template after it has been passed through
            ``self._trim_system_prompt``.
        """
        # NOTE(review): the line layout inside this literal mirrors the
        # text as it is visible in the package diff; verify in-string
        # indentation against the packaged file before relying on it.
        template = """{schema}

[__TASK_DESCRIPTION__]
Analyze this LISTING/CATALOG page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
[/__TASK_DESCRIPTION__]

[__CRITICAL_FORMAT_REQUIREMENTS__]
🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
Example of CORRECT format:
"selectors": {{
"items_container": ["div.product-grid", "ul.product-list", "div.items"],
"item_title": ["h3.product-title", "a.product-link", ".item-name"],
"item_price": [".price", ".cost", "span[data-price]"],
"item_image": ["img.product-image", ".item-img", "img[src*='product']"],
"pagination": [".pagination", ".page-nav", "nav[aria-label='pagination']"]
}}

❌ WRONG format (DO NOT USE):
"selectors": ["div.product", "h3.title", ".price"]

✅ CORRECT format (USE THIS):
"selectors": {{
"items": ["div.product", "li.item", ".product-card"],
"titles": ["h3.title", ".product-name", "a[title]"],
"prices": [".price", ".cost", "span[data-price]"]
}}
[/__CRITICAL_FORMAT_REQUIREMENTS__]

[__INSTRUCTIONS__]
YOUR TASK:
Analyze this listing page and generate extraction patterns for ANY type of items.
This could be: products, services, articles, jobs, real estate, people, cars, etc.

CRITICAL REQUIREMENTS:
1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
2. This is a LISTING PAGE with multiple items
3. Focus on identifying item containers and individual item patterns
4. Detect ANY type of items - not just products!
5. Provide multiple fallback selectors for reliability
6. Include pagination and navigation patterns
7. Use realistic confidence scores (0.1-1.0)
8. Auto-detect what type of content this listing contains
9. Provide extraction strategy advice
10. Look for structured data (JSON-LD, microdata)
11. Generate patterns that work with BeautifulSoup4 .select() method
12. RETURN JSON that EXACTLY matches the Pydantic schema above!

ANALYZE THE HTML AND DETERMINE:
- What type of items are listed (products, services, articles, etc.)
- How items are structured and contained
- What navigation elements exist
- What metadata is available
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]

[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        trimmed = self._trim_system_prompt(template)
        return trimmed
@@ -1,61 +0,0 @@
1
- """
2
- Details Processor
3
-
4
- Universal processor for detail/product/item pages.
5
- Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
6
- """
7
-
8
- from typing import Type
9
-
10
- from .base_processor import BaseHTMLProcessor
11
- from .models import UniversalExtractionSchema
12
-
13
-
14
- class DetailsProcessor(BaseHTMLProcessor):
15
- """Universal details page pattern extractor"""
16
-
17
- def get_processor_type(self) -> str:
18
- """Return processor type identifier"""
19
- return "details"
20
-
21
- def get_schema_class(self) -> Type[UniversalExtractionSchema]:
22
- """Return Pydantic schema class for details extraction"""
23
- return UniversalExtractionSchema
24
-
25
- def get_extraction_prompt_template(self) -> str:
26
- """Return details-specific extraction prompt template"""
27
-
28
- return """{schema}
29
-
30
- [__TASK_DESCRIPTION__]
31
- Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
32
- PROCESSOR TYPE: {processor_type}
33
- THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
34
- [/__TASK_DESCRIPTION__]
35
-
36
- [__INSTRUCTIONS__]
37
- YOUR TASK:
38
- Analyze this details page and generate extraction patterns for ANY type of item.
39
- This could be: product details, service info, article content, job description, real estate listing, person profile, etc.
40
-
41
- CRITICAL REQUIREMENTS:
42
- 1. Return simple CSS selectors in the "selectors" object
43
- 2. Include comprehensive markdown documentation
44
- 3. Provide real examples from the actual HTML
45
- 4. Explain the page structure and best extraction approach
46
- 5. Include confidence scores and fallback strategies
47
- 6. Document any special handling needed
48
-
49
- ANALYZE THE HTML AND DETERMINE:
50
- - What type of item this page describes
51
- - What information is available (specs, pricing, reviews, etc.)
52
- - How content is structured and organized
53
- - What actions are possible (buy, contact, etc.)
54
- - Best extraction strategy for this specific page
55
- [/__INSTRUCTIONS__]
56
-
57
- [__HTML_CONTENT__]
58
- HTML CONTENT (first 50KB):
59
- {html_content}
60
- [/__HTML_CONTENT__]
61
- """
@@ -1,67 +0,0 @@
1
- """
2
- Listing Processor
3
-
4
- Universal processor for listing/catalog pages.
5
- Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
6
- """
7
-
8
- from typing import Type
9
-
10
- from .base_processor import BaseHTMLProcessor
11
- from .models import UniversalExtractionSchema
12
-
13
-
14
- class ListingProcessor(BaseHTMLProcessor):
15
- """Universal listing page pattern extractor"""
16
-
17
- def get_processor_type(self) -> str:
18
- """Return processor type identifier"""
19
- return "listing"
20
-
21
- def get_schema_class(self) -> Type[UniversalExtractionSchema]:
22
- """Return Pydantic schema class for listing extraction"""
23
- return UniversalExtractionSchema
24
-
25
- def get_extraction_prompt_template(self) -> str:
26
- """Return listing-specific extraction prompt template"""
27
-
28
- return """{schema}
29
-
30
- [__TASK_DESCRIPTION__]
31
- Analyze this LISTING/CATALOG page and generate universal extraction patterns.
32
- PROCESSOR TYPE: {processor_type}
33
- THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
34
- [/__TASK_DESCRIPTION__]
35
-
36
- [__INSTRUCTIONS__]
37
- YOUR TASK:
38
- Analyze this listing page and generate extraction patterns for ANY type of items.
39
- This could be: products, services, articles, jobs, real estate, people, cars, etc.
40
-
41
- CRITICAL REQUIREMENTS:
42
- 1. Return simple CSS selectors in the "selectors" object
43
- 2. This is a LISTING PAGE with multiple items
44
- 3. Focus on identifying item containers and individual item patterns
45
- 4. Detect ANY type of items - not just products!
46
- 5. Provide multiple fallback selectors for reliability
47
- 6. Include pagination and navigation patterns
48
- 7. Use realistic confidence scores (0.1-1.0)
49
- 8. Auto-detect what type of content this listing contains
50
- 9. Provide extraction strategy advice
51
- 10. Look for structured data (JSON-LD, microdata)
52
- 11. Generate patterns that work with BeautifulSoup4 .select() method
53
- 12. RETURN JSON that EXACTLY matches the Pydantic schema above!
54
-
55
- ANALYZE THE HTML AND DETERMINE:
56
- - What type of items are listed (products, services, articles, etc.)
57
- - How items are structured and contained
58
- - What navigation elements exist
59
- - What metadata is available
60
- - Best extraction strategy for this specific page
61
- [/__INSTRUCTIONS__]
62
-
63
- [__HTML_CONTENT__]
64
- HTML CONTENT (first 50KB):
65
- {html_content}
66
- [/__HTML_CONTENT__]
67
- """
File without changes
File without changes
File without changes
File without changes
File without changes