unrealon-2.0.33.tar.gz → unrealon-2.0.35.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {unrealon-2.0.33/unrealon.egg-info → unrealon-2.0.35}/PKG-INFO +1 -1
  2. {unrealon-2.0.33 → unrealon-2.0.35}/pyproject.toml +1 -1
  3. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/core/browser_manager.py +6 -16
  4. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/statistics.py +0 -1
  5. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/__init__.py +2 -0
  6. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/captcha.py +3 -3
  7. unrealon-2.0.35/unrealon-browser/src/unrealon_browser/managers/data_extraction_manager.py +266 -0
  8. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/logger_bridge.py +0 -12
  9. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/__init__.py +21 -0
  10. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/core/driver.py +2 -1
  11. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/factory/manager_factory.py +14 -1
  12. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/__init__.py +2 -1
  13. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/browser.py +37 -1
  14. unrealon-2.0.35/unrealon-driver/src/unrealon_driver/managers/http.py +211 -0
  15. unrealon-2.0.35/unrealon-driver/src/unrealon_driver/managers/threading.py +95 -0
  16. {unrealon-2.0.33 → unrealon-2.0.35/unrealon.egg-info}/PKG-INFO +1 -1
  17. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon.egg-info/SOURCES.txt +1 -0
  18. unrealon-2.0.33/unrealon-driver/src/unrealon_driver/managers/http.py +0 -107
  19. unrealon-2.0.33/unrealon-driver/src/unrealon_driver/managers/threading.py +0 -54
  20. {unrealon-2.0.33 → unrealon-2.0.35}/LICENSE +0 -0
  21. {unrealon-2.0.33 → unrealon-2.0.35}/MANIFEST.in +0 -0
  22. {unrealon-2.0.33 → unrealon-2.0.35}/README.md +0 -0
  23. {unrealon-2.0.33 → unrealon-2.0.35}/setup.cfg +0 -0
  24. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/README.md +0 -0
  25. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/__init__.py +0 -0
  26. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/cli/__init__.py +0 -0
  27. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/cli/browser_cli.py +0 -0
  28. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/cli/cookies_cli.py +0 -0
  29. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/cli/interactive_mode.py +0 -0
  30. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/cli/main.py +0 -0
  31. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/core/__init__.py +0 -0
  32. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/__init__.py +0 -0
  33. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/bot_detection.py +0 -0
  34. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/config.py +0 -0
  35. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/core.py +0 -0
  36. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/dataclasses.py +0 -0
  37. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/detection.py +0 -0
  38. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/dto/models/enums.py +0 -0
  39. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/cookies.py +0 -0
  40. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/page_wait_manager.py +0 -0
  41. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/profile.py +0 -0
  42. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/managers/script_manager.py +0 -0
  43. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/__init__.py +0 -0
  44. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/bypass_techniques.py +0 -0
  45. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/manager.py +0 -0
  46. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/nodriver_stealth.py +0 -0
  47. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/playwright_stealth.py +0 -0
  48. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/scanner_tester.py +0 -0
  49. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-browser/src/unrealon_browser/stealth/undetected_chrome.py +0 -0
  50. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/__init__.py +0 -0
  51. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/config/__init__.py +0 -0
  52. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/config/environment.py +0 -0
  53. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/config/urls.py +0 -0
  54. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/enums/__init__.py +0 -0
  55. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/enums/events.py +0 -0
  56. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/enums/jobs.py +0 -0
  57. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/enums/status.py +0 -0
  58. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/enums/types.py +0 -0
  59. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/error_handling/__init__.py +0 -0
  60. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/error_handling/circuit_breaker.py +0 -0
  61. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/error_handling/error_context.py +0 -0
  62. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/error_handling/recovery.py +0 -0
  63. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/error_handling/retry.py +0 -0
  64. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/__init__.py +0 -0
  65. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/base.py +0 -0
  66. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/communication.py +0 -0
  67. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/driver.py +0 -0
  68. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/proxy.py +0 -0
  69. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/task.py +0 -0
  70. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/exceptions/validation.py +0 -0
  71. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/__init__.py +0 -0
  72. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/arq_context.py +0 -0
  73. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/arq_responses.py +0 -0
  74. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/authentication.py +0 -0
  75. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/base.py +0 -0
  76. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/bridge_stats.py +0 -0
  77. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/communication.py +0 -0
  78. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/connection_stats.py +0 -0
  79. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/driver.py +0 -0
  80. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/driver_details.py +0 -0
  81. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/logging.py +0 -0
  82. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/task.py +0 -0
  83. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/typed_responses.py +0 -0
  84. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/__init__.py +0 -0
  85. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/base.py +0 -0
  86. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/broadcast.py +0 -0
  87. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/config.py +0 -0
  88. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/driver.py +0 -0
  89. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/errors.py +0 -0
  90. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/heartbeat.py +0 -0
  91. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/logging.py +0 -0
  92. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/proxy.py +0 -0
  93. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/tasks.py +0 -0
  94. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket/utils.py +0 -0
  95. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/models/websocket_session.py +0 -0
  96. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/monitoring/__init__.py +0 -0
  97. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/monitoring/alerts.py +0 -0
  98. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/monitoring/dashboard.py +0 -0
  99. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/monitoring/health_check.py +0 -0
  100. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/monitoring/metrics.py +0 -0
  101. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/utils/__init__.py +0 -0
  102. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/utils/time.py +0 -0
  103. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-core/src/unrealon_core/version.py +0 -0
  104. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/__init__.py +0 -0
  105. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/base.py +0 -0
  106. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/config.py +0 -0
  107. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/event_manager.py +0 -0
  108. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/protocols.py +0 -0
  109. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/core_module/registry.py +0 -0
  110. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/decorators/__init__.py +0 -0
  111. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/decorators/retry.py +0 -0
  112. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/decorators/schedule.py +0 -0
  113. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/decorators/task.py +0 -0
  114. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/decorators/timing.py +0 -0
  115. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/__init__.py +0 -0
  116. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/communication/__init__.py +0 -0
  117. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/communication/session.py +0 -0
  118. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/communication/websocket_client.py +0 -0
  119. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/core/__init__.py +0 -0
  120. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/core/config.py +0 -0
  121. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/factory/__init__.py +0 -0
  122. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/lifecycle/__init__.py +0 -0
  123. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/lifecycle/daemon.py +0 -0
  124. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/lifecycle/initialization.py +0 -0
  125. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/lifecycle/shutdown.py +0 -0
  126. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/monitoring/__init__.py +0 -0
  127. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/monitoring/health.py +0 -0
  128. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/utilities/__init__.py +0 -0
  129. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/utilities/logging.py +0 -0
  130. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/driver/utilities/serialization.py +0 -0
  131. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/installer/__init__.py +0 -0
  132. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/installer/platform.py +0 -0
  133. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/base.py +0 -0
  134. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/cache.py +0 -0
  135. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/logger.py +0 -0
  136. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/proxy.py +0 -0
  137. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/registry.py +0 -0
  138. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/managers/update.py +0 -0
  139. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/utils/__init__.py +0 -0
  140. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon-driver/src/unrealon_driver/utils/time.py +0 -0
  141. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon.egg-info/dependency_links.txt +0 -0
  142. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon.egg-info/entry_points.txt +0 -0
  143. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon.egg-info/requires.txt +0 -0
  144. {unrealon-2.0.33 → unrealon-2.0.35}/unrealon.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unrealon
3
- Version: 2.0.33
3
+ Version: 2.0.35
4
4
  Summary: Enterprise-grade web scraping platform with AI-powered automation and real-time orchestration capabilities
5
5
  Author-email: UnrealOn Team <team@unrealon.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "unrealon"
7
- version = "2.0.33"
7
+ version = "2.0.35"
8
8
  description = "Enterprise-grade web scraping platform with AI-powered automation and real-time orchestration capabilities"
9
9
  authors = [
10
10
  {name = "UnrealOn Team", email = "team@unrealon.com"}
@@ -31,6 +31,7 @@ from unrealon_browser.managers import (
31
31
  create_browser_logger_bridge,
32
32
  PageWaitManager,
33
33
  ScriptManager,
34
+ DataExtractionManager,
34
35
  )
35
36
 
36
37
 
@@ -71,6 +72,7 @@ class BrowserManager:
71
72
  self.captcha_manager = CaptchaDetector()
72
73
  self.page_wait = PageWaitManager(None, self.logger_bridge)
73
74
  self.script_manager = ScriptManager(None, self.logger_bridge)
75
+ self.data_extraction = DataExtractionManager(None, self.logger_bridge)
74
76
 
75
77
  # Signal handlers for graceful shutdown
76
78
  self._setup_signal_handlers()
@@ -251,6 +253,9 @@ class BrowserManager:
251
253
 
252
254
  # Update script manager with new page
253
255
  self.script_manager.update_page(self._page)
256
+
257
+ # Update data extraction manager with new page
258
+ self.data_extraction.update_page(self._page)
254
259
 
255
260
  # 🔥 STEALTH ALWAYS APPLIED TO EVERY PAGE!
256
261
  stealth_success = await self.stealth_manager.apply_stealth(self._page)
@@ -476,17 +481,6 @@ class BrowserManager:
476
481
  "error": str(e),
477
482
  }
478
483
 
479
- async def get_page_content_async(self) -> Optional[str]:
480
- """Get current page content"""
481
- if not self._page:
482
- return None
483
-
484
- try:
485
- return await self._page.content()
486
- except Exception as e:
487
- self.logger_bridge.log_error(f"❌ Failed to get page content: {e}")
488
- return None
489
-
490
484
  async def execute_script_async(self, script: str) -> Any:
491
485
  """Execute JavaScript on current page"""
492
486
  if not self._page:
@@ -603,11 +597,6 @@ class BrowserManager:
603
597
  resolution_result = await self.captcha_manager.handle_captcha_interactive(self, detection_result, timeout_seconds)
604
598
 
605
599
  if resolution_result["success"]:
606
- # Log successful captcha resolution
607
- if hasattr(self, "_current_proxy") and self._current_proxy:
608
- proxy_host = self._current_proxy.get("host", "unknown")
609
- proxy_port = self._current_proxy.get("port", 0)
610
- self.logger_bridge.log_captcha_solved(proxy_host, proxy_port, manual=True)
611
600
 
612
601
  # Update session status back to active
613
602
  self.session_metadata.current_status = BrowserSessionStatus.ACTIVE
@@ -708,6 +697,7 @@ class BrowserManager:
708
697
  self._page = None
709
698
  self.page_wait.update_page(None)
710
699
  self.script_manager.update_page(None)
700
+ self.data_extraction.update_page(None)
711
701
 
712
702
  # Close context with safety checks
713
703
  if self._context:
@@ -62,7 +62,6 @@ class BrowserStatistics(BaseModel):
62
62
 
63
63
  # Captcha metrics
64
64
  captcha_encounters: int = Field(default=0)
65
- captcha_solved: int = Field(default=0)
66
65
  captcha_timeouts: int = Field(default=0)
67
66
 
68
67
  # Performance
@@ -9,6 +9,7 @@ from .cookies import CookieManager
9
9
  from .captcha import CaptchaDetector
10
10
  from .page_wait_manager import PageWaitManager
11
11
  from .script_manager import ScriptManager
12
+ from .data_extraction_manager import DataExtractionManager
12
13
 
13
14
 
14
15
  __all__ = [
@@ -20,4 +21,5 @@ __all__ = [
20
21
  "CaptchaDetector",
21
22
  "PageWaitManager",
22
23
  "ScriptManager",
24
+ "DataExtractionManager",
23
25
  ]
@@ -310,7 +310,7 @@ class CaptchaDetector:
310
310
 
311
311
  if cookies_saved:
312
312
  self._logger("💾 Cookies saved after captcha resolution", "info")
313
- self._captchas_solved += 1
313
+ # self._captchas_solved += 1 # Solving disabled
314
314
  else:
315
315
  self._logger("⚠️ Failed to save cookies after captcha resolution", "warning")
316
316
 
@@ -499,8 +499,8 @@ class CaptchaDetector:
499
499
 
500
500
  print(f"\n🤖 Captcha Detection Statistics:")
501
501
  print(f" Captchas detected: {stats['captchas_detected']}")
502
- print(f" Captchas solved: {stats['captchas_solved']}")
503
- print(f" Success rate: {stats['success_rate']:.1f}%")
502
+ print(f" Captchas solved: {stats['captchas_solved']} (solving disabled)")
503
+ print(f" Success rate: N/A (solving disabled)")
504
504
  print(f" Detection history: {stats['detection_history_count']} events")
505
505
  print(f" Supported types: {', '.join(stats['supported_types'])}")
506
506
 
@@ -0,0 +1,266 @@
1
+ """
2
+ Data Extraction Manager - Extract different types of data from pages
3
+ """
4
+ import json
5
+ from typing import Optional, Dict, Any, Union
6
+ from playwright.async_api import Page
7
+
8
+ from .logger_bridge import BrowserLoggerBridge as LoggingBridge
9
+
10
+
11
+ class DataExtractionManager:
12
+ """Manager for extracting different types of data from web pages"""
13
+
14
+ def __init__(self, page: Optional[Page], logger_bridge: LoggingBridge):
15
+ self._page = page
16
+ self.logger_bridge = logger_bridge
17
+
18
+ def update_page(self, page: Optional[Page]):
19
+ """Update the page reference"""
20
+ self._page = page
21
+
22
+ async def get_json_content(self) -> Optional[Dict[str, Any]]:
23
+ """Extract JSON content from current page (for API endpoints)."""
24
+ if not self._page:
25
+ self.logger_bridge.log_error("No page available for JSON extraction")
26
+ return None
27
+
28
+ try:
29
+ self.logger_bridge.log_info("🔍 Extracting JSON content from page...")
30
+
31
+ # JavaScript to extract JSON from different page formats
32
+ script = """
33
+ (() => {
34
+ try {
35
+ // Method 1: Try to get from document.body.textContent (for API responses)
36
+ const bodyText = document.body.textContent || document.body.innerText || '';
37
+ const cleanBodyText = bodyText.trim();
38
+
39
+ if (cleanBodyText && (cleanBodyText.startsWith('{') || cleanBodyText.startsWith('['))) {
40
+ return {
41
+ success: true,
42
+ data: JSON.parse(cleanBodyText),
43
+ method: 'body_text'
44
+ };
45
+ }
46
+
47
+ // Method 2: Try to get from <pre> tag (common for JSON APIs)
48
+ const preElement = document.querySelector('pre');
49
+ if (preElement) {
50
+ const preText = (preElement.textContent || preElement.innerText || '').trim();
51
+ if (preText && (preText.startsWith('{') || preText.startsWith('['))) {
52
+ return {
53
+ success: true,
54
+ data: JSON.parse(preText),
55
+ method: 'pre_element'
56
+ };
57
+ }
58
+ }
59
+
60
+ // Method 3: Check if entire document is JSON
61
+ const docText = (document.documentElement.textContent || document.documentElement.innerText || '').trim();
62
+ if (docText && (docText.startsWith('{') || docText.startsWith('['))) {
63
+ return {
64
+ success: true,
65
+ data: JSON.parse(docText),
66
+ method: 'document_text'
67
+ };
68
+ }
69
+
70
+ // Method 4: Look for JSON in script tags
71
+ const scriptTags = document.querySelectorAll('script[type="application/json"]');
72
+ for (const script of scriptTags) {
73
+ const scriptText = (script.textContent || script.innerText || '').trim();
74
+ if (scriptText && (scriptText.startsWith('{') || scriptText.startsWith('['))) {
75
+ return {
76
+ success: true,
77
+ data: JSON.parse(scriptText),
78
+ method: 'script_tag'
79
+ };
80
+ }
81
+ }
82
+
83
+ return {
84
+ success: false,
85
+ error: 'No JSON content found',
86
+ page_text_preview: cleanBodyText.substring(0, 200)
87
+ };
88
+
89
+ } catch (e) {
90
+ return {
91
+ success: false,
92
+ error: 'JSON parse failed: ' + e.message,
93
+ page_text_preview: (document.body.textContent || '').substring(0, 200)
94
+ };
95
+ }
96
+ })();
97
+ """
98
+
99
+ result = await self._page.evaluate(script)
100
+
101
+ if result.get('success'):
102
+ method = result.get('method', 'unknown')
103
+ self.logger_bridge.log_info(f"✅ JSON extracted successfully using method: {method}")
104
+ return result.get('data')
105
+ else:
106
+ error = result.get('error', 'Unknown error')
107
+ preview = result.get('page_text_preview', '')
108
+ self.logger_bridge.log_warning(f"❌ JSON extraction failed: {error}")
109
+ if preview:
110
+ self.logger_bridge.log_info(f"📄 Page preview: {preview}...")
111
+ return None
112
+
113
+ except Exception as e:
114
+ self.logger_bridge.log_error(f"JSON extraction error: {e}")
115
+ return None
116
+
117
+ async def get_page_text(self) -> Optional[str]:
118
+ """Get plain text content from current page."""
119
+ if not self._page:
120
+ return None
121
+
122
+ try:
123
+ self.logger_bridge.log_info("📄 Extracting plain text content...")
124
+
125
+ script = """
126
+ (() => {
127
+ return {
128
+ body_text: document.body.textContent || document.body.innerText || '',
129
+ title: document.title || '',
130
+ url: window.location.href
131
+ };
132
+ })();
133
+ """
134
+
135
+ result = await self._page.evaluate(script)
136
+ text = result.get('body_text', '').strip()
137
+
138
+ if text:
139
+ self.logger_bridge.log_info(f"✅ Text extracted: {len(text)} characters")
140
+ return text
141
+ else:
142
+ self.logger_bridge.log_warning("❌ No text content found")
143
+ return None
144
+
145
+ except Exception as e:
146
+ self.logger_bridge.log_error(f"Text extraction error: {e}")
147
+ return None
148
+
149
+ async def get_structured_data(self) -> Optional[Dict[str, Any]]:
150
+ """Get structured data including JSON, text, and metadata."""
151
+ if not self._page:
152
+ return None
153
+
154
+ try:
155
+ self.logger_bridge.log_info("🔍 Extracting structured data...")
156
+
157
+ # Try JSON first
158
+ json_data = await self.get_json_content()
159
+
160
+ # Get page metadata
161
+ script = """
162
+ (() => {
163
+ return {
164
+ url: window.location.href,
165
+ title: document.title || '',
166
+ content_type: document.contentType || '',
167
+ charset: document.characterSet || '',
168
+ ready_state: document.readyState,
169
+ has_pre_element: !!document.querySelector('pre'),
170
+ body_text_length: (document.body.textContent || '').length
171
+ };
172
+ })();
173
+ """
174
+
175
+ metadata = await self._page.evaluate(script)
176
+
177
+ result = {
178
+ "extraction_success": json_data is not None,
179
+ "json_data": json_data,
180
+ "metadata": metadata,
181
+ "extracted_at": self._get_timestamp()
182
+ }
183
+
184
+ if json_data:
185
+ self.logger_bridge.log_info("✅ Structured data extraction successful")
186
+ else:
187
+ self.logger_bridge.log_warning("⚠️ No JSON data found, but metadata extracted")
188
+
189
+ return result
190
+
191
+ except Exception as e:
192
+ self.logger_bridge.log_error(f"Structured data extraction error: {e}")
193
+ return None
194
+
195
+ async def detect_content_type(self) -> str:
196
+ """Detect the type of content on the current page."""
197
+ if not self._page:
198
+ return "unknown"
199
+
200
+ try:
201
+ script = """
202
+ (() => {
203
+ const bodyText = (document.body.textContent || '').trim();
204
+ const contentType = document.contentType || '';
205
+ const hasPreElement = !!document.querySelector('pre');
206
+
207
+ // Check for JSON
208
+ if (bodyText.startsWith('{') || bodyText.startsWith('[')) {
209
+ return 'json';
210
+ }
211
+
212
+ // Check for XML
213
+ if (bodyText.startsWith('<') && contentType.includes('xml')) {
214
+ return 'xml';
215
+ }
216
+
217
+ // Check for HTML
218
+ if (document.querySelector('html') && document.querySelector('body') && !hasPreElement) {
219
+ return 'html';
220
+ }
221
+
222
+ // Check for plain text
223
+ if (hasPreElement || contentType.includes('text/plain')) {
224
+ return 'text';
225
+ }
226
+
227
+ return 'unknown';
228
+ })();
229
+ """
230
+
231
+ content_type = await self._page.evaluate(script)
232
+ self.logger_bridge.log_info(f"🔍 Detected content type: {content_type}")
233
+ return content_type
234
+
235
+ except Exception as e:
236
+ self.logger_bridge.log_error(f"Content type detection error: {e}")
237
+ return "unknown"
238
+
239
+ async def get_page_html(self) -> Optional[str]:
240
+ """Get full HTML content from current page."""
241
+ if not self._page:
242
+ return None
243
+
244
+ try:
245
+ self.logger_bridge.log_info("📄 Extracting HTML content...")
246
+ html = await self._page.content()
247
+
248
+ if html:
249
+ self.logger_bridge.log_info(f"✅ HTML extracted: {len(html)} characters")
250
+ return html
251
+ else:
252
+ self.logger_bridge.log_warning("❌ No HTML content found")
253
+ return None
254
+
255
+ except Exception as e:
256
+ self.logger_bridge.log_error(f"HTML extraction error: {e}")
257
+ return None
258
+
259
+ def _get_timestamp(self) -> str:
260
+ """Get current timestamp in ISO format."""
261
+ from datetime import datetime
262
+ return datetime.now().isoformat()
263
+
264
+
265
+ # Export
266
+ __all__ = ["DataExtractionManager"]
@@ -81,7 +81,6 @@ class BrowserLoggerBridge:
81
81
  "navigation_failed": 0,
82
82
  "stealth_applied": 0,
83
83
  "captcha_detected": 0,
84
- "captcha_solved": 0,
85
84
  "profile_created": 0,
86
85
  "cookies_saved": 0,
87
86
  }
@@ -194,17 +193,6 @@ class BrowserLoggerBridge:
194
193
  detected_at=result.detected_at.isoformat(),
195
194
  )
196
195
 
197
- def log_captcha_solved(self, proxy_host: str, proxy_port: int, manual: bool = True) -> None:
198
- """Log captcha resolution"""
199
- self._browser_events["captcha_solved"] += 1
200
- self._log_info(
201
- f"Captcha solved for proxy {proxy_host}:{proxy_port}",
202
- proxy_host=proxy_host,
203
- proxy_port=proxy_port,
204
- resolution_method="manual" if manual else "automatic",
205
- cookies_will_be_saved=True,
206
- )
207
-
208
196
  def log_profile_created(self, profile_name: str, proxy_info: Optional[Dict[str, Any]] = None) -> None:
209
197
  """Log profile creation"""
210
198
  self._browser_events["profile_created"] += 1
@@ -105,3 +105,24 @@ __all__ = [
105
105
  "timing",
106
106
 
107
107
  ]
108
+
109
+ # Auto-register platform fixes and cleanup for Windows
110
+ # This ensures all parsers get proper asyncio handling automatically
111
+ try:
112
+ from .platform import apply_platform_fixes, cleanup_asyncio_resources
113
+ import atexit
114
+ import platform
115
+
116
+ # Apply fixes immediately on import
117
+ apply_platform_fixes()
118
+
119
+ # Register cleanup for Windows
120
+ if platform.system() == "Windows":
121
+ atexit.register(cleanup_asyncio_resources)
122
+
123
+ except ImportError:
124
+ # platform module not available, skip platform fixes
125
+ pass
126
+ except Exception:
127
+ # Any other error, fail silently to not break imports
128
+ pass
@@ -21,7 +21,7 @@ from ..utilities.logging import LoggingUtility
21
21
  from ..utilities.serialization import SerializationUtility
22
22
 
23
23
  from ...managers import (
24
- LoggerManager, HttpManager, BrowserManager, CacheManager,
24
+ LoggerManager, HttpManager, HttpxManager, BrowserManager, CacheManager,
25
25
  ProxyManager, ThreadManager, UpdateManager, ManagerRegistry
26
26
  )
27
27
 
@@ -77,6 +77,7 @@ class UniversalDriver:
77
77
  self.manager_registry: Optional[ManagerRegistry] = None
78
78
  self.logger_manager: Optional[LoggerManager] = None
79
79
  self.http: Optional[HttpManager] = None
80
+ self.httpx: Optional[HttpxManager] = None
80
81
  self.browser: Optional[BrowserManager] = None
81
82
  self.cache: Optional[CacheManager] = None
82
83
  self.proxy: Optional[ProxyManager] = None
@@ -8,11 +8,12 @@ import logging
8
8
  from typing import TYPE_CHECKING
9
9
 
10
10
  from ...managers import (
11
- LoggerManager, HttpManager, BrowserManager, CacheManager,
11
+ LoggerManager, HttpManager, HttpxManager, BrowserManager, CacheManager,
12
12
  ProxyManager, ThreadManager, UpdateManager, ManagerRegistry
13
13
  )
14
14
  from ...managers.logger import LoggerManagerConfig
15
15
  from ...managers.http import HttpManagerConfig
16
+ from ...managers.http import HttpxManagerConfig
16
17
  from ...managers.browser import BrowserManagerConfig
17
18
  from ...managers.cache import CacheManagerConfig
18
19
  from ...managers.proxy import ProxyManagerConfig
@@ -45,6 +46,7 @@ class ManagerFactory:
45
46
  # Setup each manager
46
47
  ManagerFactory._setup_logger_manager(driver, manager_registry)
47
48
  ManagerFactory._setup_http_manager(driver, manager_registry)
49
+ ManagerFactory._setup_httpx_manager(driver, manager_registry)
48
50
  ManagerFactory._setup_browser_manager(driver, manager_registry)
49
51
  ManagerFactory._setup_cache_manager(driver, manager_registry)
50
52
  ManagerFactory._setup_proxy_manager(driver, manager_registry)
@@ -78,6 +80,17 @@ class ManagerFactory:
78
80
  driver.http = HttpManager(http_config)
79
81
  registry.register(driver.http)
80
82
 
83
+ @staticmethod
84
+ def _setup_httpx_manager(driver: 'UniversalDriver', registry: ManagerRegistry):
85
+ """Setup HTTPx manager."""
86
+ httpx_config = HttpxManagerConfig(
87
+ enabled=True,
88
+ timeout=driver.config.http_timeout,
89
+ max_retries=driver.config.max_retries
90
+ )
91
+ driver.httpx = HttpxManager(httpx_config)
92
+ registry.register(driver.httpx)
93
+
81
94
  @staticmethod
82
95
  def _setup_browser_manager(driver: 'UniversalDriver', registry: ManagerRegistry):
83
96
  """Setup browser manager."""
@@ -4,7 +4,7 @@ Clean manager system for UnrealOn Driver.
4
4
 
5
5
  from .base import BaseManager, ManagerConfig, ManagerStatus
6
6
  from .logger import LoggerManager, LoggerManagerConfig
7
- from .http import HttpManager, HttpManagerConfig
7
+ from .http import HttpManager, HttpManagerConfig, HttpxManager, HttpxManagerConfig
8
8
  from .browser import BrowserManager, BrowserManagerConfig
9
9
  from .cache import CacheManager, CacheManagerConfig
10
10
  from .proxy import ProxyManager, ProxyManagerConfig
@@ -21,6 +21,7 @@ __all__ = [
21
21
  # Managers
22
22
  "LoggerManager", "LoggerManagerConfig",
23
23
  "HttpManager", "HttpManagerConfig",
24
+ "HttpxManager", "HttpxManagerConfig",
24
25
  "BrowserManager", "BrowserManagerConfig",
25
26
  "CacheManager", "CacheManagerConfig",
26
27
  "ProxyManager", "ProxyManagerConfig",
@@ -113,7 +113,7 @@ class BrowserManager(BaseManager):
113
113
  raise RuntimeError("Failed to initialize browser")
114
114
 
115
115
  try:
116
- html = await self.browser.get_page_content_async()
116
+ html = await self.browser.data_extraction.get_page_html()
117
117
  self.stats.record_operation(True, 0.0)
118
118
  return html
119
119
  except Exception as e:
@@ -121,6 +121,36 @@ class BrowserManager(BaseManager):
121
121
  self.stats.record_operation(False, 0.0)
122
122
  return None
123
123
 
124
+ async def get_json_content(self) -> Optional[dict]:
125
+ """Extract JSON content from current page via DataExtractionManager."""
126
+ # Ensure browser is initialized
127
+ if not await self._ensure_browser_initialized():
128
+ raise RuntimeError("Failed to initialize browser")
129
+
130
+ try:
131
+ result = await self.browser.data_extraction.get_json_content()
132
+ self.stats.record_operation(True, 0.0)
133
+ return result
134
+ except Exception as e:
135
+ self.logger.error(f"JSON extraction failed: {e}")
136
+ self.stats.record_operation(False, 0.0)
137
+ return None
138
+
139
+ async def get_page_text(self) -> Optional[str]:
140
+ """Get plain text content from current page via DataExtractionManager."""
141
+ # Ensure browser is initialized
142
+ if not await self._ensure_browser_initialized():
143
+ raise RuntimeError("Failed to initialize browser")
144
+
145
+ try:
146
+ result = await self.browser.data_extraction.get_page_text()
147
+ self.stats.record_operation(True, 0.0)
148
+ return result
149
+ except Exception as e:
150
+ self.logger.error(f"Text extraction failed: {e}")
151
+ self.stats.record_operation(False, 0.0)
152
+ return None
153
+
124
154
  async def execute_script_async(self, script: str) -> any:
125
155
  """Execute JavaScript on current page via ScriptManager."""
126
156
  # Ensure browser is initialized
@@ -136,3 +166,9 @@ class BrowserManager(BaseManager):
136
166
  self.logger.error(f"Script execution failed: {e}")
137
167
  self.stats.record_operation(False, 0.0)
138
168
  raise
169
+
170
+ async def cleanup(self) -> None:
171
+ """Cleanup browser resources."""
172
+ if self.browser:
173
+ await self.browser.close_async()
174
+ self.browser = None