unrealon-2.0.34-py3-none-any.whl → unrealon-2.0.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unrealon
-Version: 2.0.34
+Version: 2.0.35
 Summary: Enterprise-grade web scraping platform with AI-powered automation and real-time orchestration capabilities
 Author-email: UnrealOn Team <team@unrealon.com>
 License: MIT
@@ -6,7 +6,7 @@ unrealon_browser/cli/cookies_cli.py,sha256=yhZvGrg8bknlH4zlySdi8ue-25Ue-1rI_u1G0
 unrealon_browser/cli/interactive_mode.py,sha256=gLn9bMH0h0tPX3dP4i4QQxQK4Htkyg5r4KcqdMBaP6Q,12125
 unrealon_browser/cli/main.py,sha256=XCYcTxJUqaz320KCU_JPKizYMk6bdljb8Boyok3uO-4,1353
 unrealon_browser/core/__init__.py,sha256=uVL_t4sZelUzflWPdgrwoXGnAkSV1WNQ98-eu0QB2eM,151
-unrealon_browser/core/browser_manager.py,sha256=vPWQjoh_QAoKWFlKVQCPIuehNsWIY9HIN7z3QeGoxAc,32398
+unrealon_browser/core/browser_manager.py,sha256=gKx7M5eQQU7B8FYhQsrvJaB11An2ZH1Yc1PnED1HV1Y,31977
 unrealon_browser/dto/__init__.py,sha256=bApqcLz-KanEi0_MCiFPrQmGBoX3VBijP7XtBUyIfjo,1636
 unrealon_browser/dto/bot_detection.py,sha256=qXfC0HghV7m4L6qA87t3STi-166jM-QgoP6OYbCb4o4,6884
 unrealon_browser/dto/models/config.py,sha256=Why5H3rtFclmwbdczuDfhlgf-LDz72Aa8LhDX4_ayfw,1752
@@ -14,11 +14,12 @@ unrealon_browser/dto/models/core.py,sha256=HvbwYG27rmmWtp401uws7lfalN_9QPad0M6ce
 unrealon_browser/dto/models/dataclasses.py,sha256=zqhJVyzp4CvtuTBsZwm6n6TodVWrZf9gkdDG-0_tgeA,2571
 unrealon_browser/dto/models/detection.py,sha256=ma9ZNIjPR7HnjqZaAj6ZoskiewPFiSn_FgFXSkgiQc8,2715
 unrealon_browser/dto/models/enums.py,sha256=Q4WzHdfSKf7dhKyX00i_Pvl2U8w3lBsxOYfSIoaQY3Q,1219
-unrealon_browser/dto/models/statistics.py,sha256=aIzJNV5r23VBxjhEoja4tXwI1Z7_UCw5zOaxuPya2E8,2728
-unrealon_browser/managers/__init__.py,sha256=lpa93ggEN93ucoi4FqnCG_sn-_aRlP1As7DBRogDSsQ,591
-unrealon_browser/managers/captcha.py,sha256=KGBO7sfq9XusAlcPByUFdIg-v6rlruzS2oHx-Zx28wo,21453
+unrealon_browser/dto/models/statistics.py,sha256=RbiMChC6EumFvzIoxfWp2eIqjkW4yOpWZTKsOHNtok8,2685
+unrealon_browser/managers/__init__.py,sha256=YDNpfdA-cRqn1xnX9xurgHC1x1zw0nLiPo7rPjikuzQ,679
+unrealon_browser/managers/captcha.py,sha256=JsAG1gjfwrOrNZd1N1HALtzOuJ6loEhCYZVKTylKubU,21488
 unrealon_browser/managers/cookies.py,sha256=r4VVnKLXH82vhU7qgtY-dF7KPf0Ie3QxGD3FEi6geFA,15085
-unrealon_browser/managers/logger_bridge.py,sha256=b6H9aq6AGv0s_g5bTH6ZFYoFINClsUejevlef3s1srw,10864
+unrealon_browser/managers/data_extraction_manager.py,sha256=dbbNgrqGvtMCgSxpliLxkD0PrAN0NrRrVpbN7iqcQKQ,10575
+unrealon_browser/managers/logger_bridge.py,sha256=aCaDVRS7ZksXYtIKCCQIBqsmD5n0cPpc__0o4c3Iah0,10366
 unrealon_browser/managers/page_wait_manager.py,sha256=UyZqiSfkjzahrxp9x1odXFIT_sFhZGvdECxWuIMCVBY,7876
 unrealon_browser/managers/profile.py,sha256=HjddlSeUry_65WPtF8CMkT7cfJ6X3Jap9kJaaZpwtAA,18956
 unrealon_browser/managers/script_manager.py,sha256=hVnEWDb2LM1rfnptFo1MtE0SGcYCoFA66udykmb5e1g,11581
@@ -101,9 +102,9 @@ unrealon_driver/driver/communication/session.py,sha256=DYN_Q3Qm3XuOi-dM8aNihJfQJ
 unrealon_driver/driver/communication/websocket_client.py,sha256=VPsICBvGHunuCGZvorvPCF01Qdvp7QWyTDX0hkYXRwo,7910
 unrealon_driver/driver/core/__init__.py,sha256=ZvJQp1zO7pj6tBNYTJk2fj-0ZMiQTQEk-I9hXalNsfg,235
 unrealon_driver/driver/core/config.py,sha256=jWJjRll19VlL4iM5Q-J3o9qwYeH89Iuj1_3KayM6fCk,5914
-unrealon_driver/driver/core/driver.py,sha256=NI-pdhnduRyHLsfFr8HmP2gp7pR1pWB4vBIJkMJ2cls,7886
+unrealon_driver/driver/core/driver.py,sha256=8rufKfvE7M1axB7ZyK28OINqsrBJcUyL0HVdqjK47ps,7950
 unrealon_driver/driver/factory/__init__.py,sha256=XrjBhOaLvC3MIG5PAFIYS_xYXFDz5JizpFvmQcwA7mU,189
-unrealon_driver/driver/factory/manager_factory.py,sha256=b-3tWKsnicTNygZ3zIDhBlSbGySPRPT1GYrFY32QQgo,5103
+unrealon_driver/driver/factory/manager_factory.py,sha256=BII7cH6-X8i_DMoHvcE72AakD3wuD44gWEK5K8WJJOg,5644
 unrealon_driver/driver/lifecycle/__init__.py,sha256=KnkXklezAOIbXcCzEU_XSOt32z7tz1zIGclXYXTkO8k,286
 unrealon_driver/driver/lifecycle/daemon.py,sha256=KHAzpiWFu3HRElRtzSEStmI74bMivFjfCAFlXha87KU,2609
 unrealon_driver/driver/lifecycle/initialization.py,sha256=R4MgfkSNnfAdMO0Kp1Cx42cfNqq8VIxj_mGX7ECXad4,4406
@@ -115,21 +116,21 @@ unrealon_driver/driver/utilities/logging.py,sha256=2my2QnkAa6Hdw-TfO4oOQ94yGc-Cj
 unrealon_driver/driver/utilities/serialization.py,sha256=wTCSVrEloykiGN4K1JXbk2aqNKm7W90aWXmzhcLyAZc,2123
 unrealon_driver/installer/__init__.py,sha256=PraOjOg-cN1zOtuhPSTE5vCGPSMzWtEBYU8A05GWEf8,227
 unrealon_driver/installer/platform.py,sha256=U_8FJZk0C8M0ujpfzcpOPWEoUrT6asTNEIhsN0n2bCg,5081
-unrealon_driver/managers/__init__.py,sha256=zJJsOb6Oodg7l00v4ncKUytnyeaZM887pHY8-eSuWdU,981
+unrealon_driver/managers/__init__.py,sha256=LZUQXwpgqjStC5B4fIdwBJTwlB-4om_eQPhdGs7SJeo,1057
 unrealon_driver/managers/base.py,sha256=GkuXillg9uqqnx6RL682fmKgK-7JyqYlH6DFUgyN4F8,5445
-unrealon_driver/managers/browser.py,sha256=bc6O2NyC4FV82mb9sat48_k8s1c3IGY4i90ddMVWRIo,5432
+unrealon_driver/managers/browser.py,sha256=_b6YEOLqrgcD83eiVJOYvYqC5OJw_Nr4b4FmOa0uHaE,6906
 unrealon_driver/managers/cache.py,sha256=c0tPKQ5KFd_Un1U8mw3j1WPuycxg863MMWNMveVF_2I,3506
-unrealon_driver/managers/http.py,sha256=EjlpoTRuhpsgzzrEARxRlbGczzua7hnKFVq06bvCgTM,3624
+unrealon_driver/managers/http.py,sha256=NZ8VRRpVX2EsE_LMV0AYqb3HriOQCY8Qfkojg3pV7sE,7387
 unrealon_driver/managers/logger.py,sha256=PL3rA9ZQl12jJU0EiPAkLwJ6eDHQfIzr8-nc8bVivKQ,10526
 unrealon_driver/managers/proxy.py,sha256=b2w6DteMJWnwxZmL3NfwBMdE_mscchoMwPs-XFKNwnU,3855
 unrealon_driver/managers/registry.py,sha256=--oNPU-65e8J21ubJufyEOc1TirnzJIvpvuY_j7rH7Q,2666
-unrealon_driver/managers/threading.py,sha256=djw5cSC99dfBKmep3IJ_8IgxQceMXtNvCp5fIxHM0TY,1702
+unrealon_driver/managers/threading.py,sha256=yw2RlWxc2MBn0ZbPJ9h3eLIC5OFCvVkr8DQM7DhWO8M,3498
 unrealon_driver/managers/update.py,sha256=-hohVxGXpj5bZ6ZTQN6NH1RK9Pd6GVzCMtu3GS2SdcQ,3582
 unrealon_driver/utils/__init__.py,sha256=2Sz3eats5q4O2fDmefDuJt8M_zkN6xrS-9xXntWZWFc,168
 unrealon_driver/utils/time.py,sha256=Oxk1eicKeZl8ZWbf7gu1Ll716k6CpXmVj67FHSnPIsA,184
-unrealon-2.0.34.dist-info/LICENSE,sha256=eEH8mWZW49YMpl4Sh5MtKqkZ8aVTzKQXiNPEnvL14ns,1070
-unrealon-2.0.34.dist-info/METADATA,sha256=xZkorKxeW0f0dLynonoXpTlvHYvhbuPCiWzoHFep2Y4,15689
-unrealon-2.0.34.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
-unrealon-2.0.34.dist-info/entry_points.txt,sha256=k0qM-eotpajkKUq-almJmxj9afhXprZ6IkvQkSdcKhI,104
-unrealon-2.0.34.dist-info/top_level.txt,sha256=Gu8IeIfIVfUxdi-h-F0nKMQxo15pjhHZ0aTadXTpRE8,47
-unrealon-2.0.34.dist-info/RECORD,,
+unrealon-2.0.35.dist-info/LICENSE,sha256=eEH8mWZW49YMpl4Sh5MtKqkZ8aVTzKQXiNPEnvL14ns,1070
+unrealon-2.0.35.dist-info/METADATA,sha256=5oIiWxcda_6_DCik53Vt5Xd4CsuyGA1yARkZXSvkOIY,15689
+unrealon-2.0.35.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
+unrealon-2.0.35.dist-info/entry_points.txt,sha256=k0qM-eotpajkKUq-almJmxj9afhXprZ6IkvQkSdcKhI,104
+unrealon-2.0.35.dist-info/top_level.txt,sha256=Gu8IeIfIVfUxdi-h-F0nKMQxo15pjhHZ0aTadXTpRE8,47
+unrealon-2.0.35.dist-info/RECORD,,
@@ -31,6 +31,7 @@ from unrealon_browser.managers import (
     create_browser_logger_bridge,
     PageWaitManager,
     ScriptManager,
+    DataExtractionManager,
 )
 
 
@@ -71,6 +72,7 @@ class BrowserManager:
         self.captcha_manager = CaptchaDetector()
         self.page_wait = PageWaitManager(None, self.logger_bridge)
         self.script_manager = ScriptManager(None, self.logger_bridge)
+        self.data_extraction = DataExtractionManager(None, self.logger_bridge)
 
         # Signal handlers for graceful shutdown
         self._setup_signal_handlers()
@@ -251,6 +253,9 @@ class BrowserManager:
 
         # Update script manager with new page
         self.script_manager.update_page(self._page)
+
+        # Update data extraction manager with new page
+        self.data_extraction.update_page(self._page)
 
         # 🔥 STEALTH ALWAYS APPLIED TO EVERY PAGE!
         stealth_success = await self.stealth_manager.apply_stealth(self._page)
@@ -476,17 +481,6 @@ class BrowserManager:
                 "error": str(e),
             }
 
-    async def get_page_content_async(self) -> Optional[str]:
-        """Get current page content"""
-        if not self._page:
-            return None
-
-        try:
-            return await self._page.content()
-        except Exception as e:
-            self.logger_bridge.log_error(f"❌ Failed to get page content: {e}")
-            return None
-
     async def execute_script_async(self, script: str) -> Any:
         """Execute JavaScript on current page"""
         if not self._page:
@@ -603,11 +597,6 @@ class BrowserManager:
        resolution_result = await self.captcha_manager.handle_captcha_interactive(self, detection_result, timeout_seconds)

        if resolution_result["success"]:
-            # Log successful captcha resolution
-            if hasattr(self, "_current_proxy") and self._current_proxy:
-                proxy_host = self._current_proxy.get("host", "unknown")
-                proxy_port = self._current_proxy.get("port", 0)
-                self.logger_bridge.log_captcha_solved(proxy_host, proxy_port, manual=True)

            # Update session status back to active
            self.session_metadata.current_status = BrowserSessionStatus.ACTIVE
@@ -708,6 +697,7 @@ class BrowserManager:
            self._page = None
            self.page_wait.update_page(None)
            self.script_manager.update_page(None)
+            self.data_extraction.update_page(None)

        # Close context with safety checks
        if self._context:
@@ -62,7 +62,6 @@ class BrowserStatistics(BaseModel):
 
     # Captcha metrics
     captcha_encounters: int = Field(default=0)
-    captcha_solved: int = Field(default=0)
     captcha_timeouts: int = Field(default=0)
 
     # Performance
@@ -9,6 +9,7 @@ from .cookies import CookieManager
 from .captcha import CaptchaDetector
 from .page_wait_manager import PageWaitManager
 from .script_manager import ScriptManager
+from .data_extraction_manager import DataExtractionManager
 
 
 __all__ = [
@@ -20,4 +21,5 @@ __all__ = [
     "CaptchaDetector",
     "PageWaitManager",
     "ScriptManager",
+    "DataExtractionManager",
 ]
@@ -310,7 +310,7 @@ class CaptchaDetector:
 
            if cookies_saved:
                self._logger("💾 Cookies saved after captcha resolution", "info")
-                self._captchas_solved += 1
+                # self._captchas_solved += 1 # Solving disabled
            else:
                self._logger("⚠️ Failed to save cookies after captcha resolution", "warning")
 
@@ -499,8 +499,8 @@ class CaptchaDetector:
 
        print(f"\n🤖 Captcha Detection Statistics:")
        print(f" Captchas detected: {stats['captchas_detected']}")
-        print(f" Captchas solved: {stats['captchas_solved']}")
-        print(f" Success rate: {stats['success_rate']:.1f}%")
+        print(f" Captchas solved: {stats['captchas_solved']} (solving disabled)")
+        print(f" Success rate: N/A (solving disabled)")
        print(f" Detection history: {stats['detection_history_count']} events")
        print(f" Supported types: {', '.join(stats['supported_types'])}")
 
@@ -0,0 +1,266 @@
+"""
+Data Extraction Manager - Extract different types of data from pages
+"""
+import json
+from typing import Optional, Dict, Any, Union
+from playwright.async_api import Page
+
+from .logger_bridge import BrowserLoggerBridge as LoggingBridge
+
+
+class DataExtractionManager:
+    """Manager for extracting different types of data from web pages"""
+
+    def __init__(self, page: Optional[Page], logger_bridge: LoggingBridge):
+        self._page = page
+        self.logger_bridge = logger_bridge
+
+    def update_page(self, page: Optional[Page]):
+        """Update the page reference"""
+        self._page = page
+
+    async def get_json_content(self) -> Optional[Dict[str, Any]]:
+        """Extract JSON content from current page (for API endpoints)."""
+        if not self._page:
+            self.logger_bridge.log_error("No page available for JSON extraction")
+            return None
+
+        try:
+            self.logger_bridge.log_info("🔍 Extracting JSON content from page...")
+
+            # JavaScript to extract JSON from different page formats
+            script = """
+            (() => {
+                try {
+                    // Method 1: Try to get from document.body.textContent (for API responses)
+                    const bodyText = document.body.textContent || document.body.innerText || '';
+                    const cleanBodyText = bodyText.trim();
+
+                    if (cleanBodyText && (cleanBodyText.startsWith('{') || cleanBodyText.startsWith('['))) {
+                        return {
+                            success: true,
+                            data: JSON.parse(cleanBodyText),
+                            method: 'body_text'
+                        };
+                    }
+
+                    // Method 2: Try to get from <pre> tag (common for JSON APIs)
+                    const preElement = document.querySelector('pre');
+                    if (preElement) {
+                        const preText = (preElement.textContent || preElement.innerText || '').trim();
+                        if (preText && (preText.startsWith('{') || preText.startsWith('['))) {
+                            return {
+                                success: true,
+                                data: JSON.parse(preText),
+                                method: 'pre_element'
+                            };
+                        }
+                    }
+
+                    // Method 3: Check if entire document is JSON
+                    const docText = (document.documentElement.textContent || document.documentElement.innerText || '').trim();
+                    if (docText && (docText.startsWith('{') || docText.startsWith('['))) {
+                        return {
+                            success: true,
+                            data: JSON.parse(docText),
+                            method: 'document_text'
+                        };
+                    }
+
+                    // Method 4: Look for JSON in script tags
+                    const scriptTags = document.querySelectorAll('script[type="application/json"]');
+                    for (const script of scriptTags) {
+                        const scriptText = (script.textContent || script.innerText || '').trim();
+                        if (scriptText && (scriptText.startsWith('{') || scriptText.startsWith('['))) {
+                            return {
+                                success: true,
+                                data: JSON.parse(scriptText),
+                                method: 'script_tag'
+                            };
+                        }
+                    }
+
+                    return {
+                        success: false,
+                        error: 'No JSON content found',
+                        page_text_preview: cleanBodyText.substring(0, 200)
+                    };
+
+                } catch (e) {
+                    return {
+                        success: false,
+                        error: 'JSON parse failed: ' + e.message,
+                        page_text_preview: (document.body.textContent || '').substring(0, 200)
+                    };
+                }
+            })();
+            """
+
+            result = await self._page.evaluate(script)
+
+            if result.get('success'):
+                method = result.get('method', 'unknown')
+                self.logger_bridge.log_info(f"✅ JSON extracted successfully using method: {method}")
+                return result.get('data')
+            else:
+                error = result.get('error', 'Unknown error')
+                preview = result.get('page_text_preview', '')
+                self.logger_bridge.log_warning(f"❌ JSON extraction failed: {error}")
+                if preview:
+                    self.logger_bridge.log_info(f"📄 Page preview: {preview}...")
+                return None
+
+        except Exception as e:
+            self.logger_bridge.log_error(f"JSON extraction error: {e}")
+            return None
+
+    async def get_page_text(self) -> Optional[str]:
+        """Get plain text content from current page."""
+        if not self._page:
+            return None
+
+        try:
+            self.logger_bridge.log_info("📄 Extracting plain text content...")
+
+            script = """
+            (() => {
+                return {
+                    body_text: document.body.textContent || document.body.innerText || '',
+                    title: document.title || '',
+                    url: window.location.href
+                };
+            })();
+            """
+
+            result = await self._page.evaluate(script)
+            text = result.get('body_text', '').strip()
+
+            if text:
+                self.logger_bridge.log_info(f"✅ Text extracted: {len(text)} characters")
+                return text
+            else:
+                self.logger_bridge.log_warning("❌ No text content found")
+                return None
+
+        except Exception as e:
+            self.logger_bridge.log_error(f"Text extraction error: {e}")
+            return None
+
+    async def get_structured_data(self) -> Optional[Dict[str, Any]]:
+        """Get structured data including JSON, text, and metadata."""
+        if not self._page:
+            return None
+
+        try:
+            self.logger_bridge.log_info("🔍 Extracting structured data...")
+
+            # Try JSON first
+            json_data = await self.get_json_content()
+
+            # Get page metadata
+            script = """
+            (() => {
+                return {
+                    url: window.location.href,
+                    title: document.title || '',
+                    content_type: document.contentType || '',
+                    charset: document.characterSet || '',
+                    ready_state: document.readyState,
+                    has_pre_element: !!document.querySelector('pre'),
+                    body_text_length: (document.body.textContent || '').length
+                };
+            })();
+            """
+
+            metadata = await self._page.evaluate(script)
+
+            result = {
+                "extraction_success": json_data is not None,
+                "json_data": json_data,
+                "metadata": metadata,
+                "extracted_at": self._get_timestamp()
+            }
+
+            if json_data:
+                self.logger_bridge.log_info("✅ Structured data extraction successful")
+            else:
+                self.logger_bridge.log_warning("⚠️ No JSON data found, but metadata extracted")
+
+            return result
+
+        except Exception as e:
+            self.logger_bridge.log_error(f"Structured data extraction error: {e}")
+            return None
+
+    async def detect_content_type(self) -> str:
+        """Detect the type of content on the current page."""
+        if not self._page:
+            return "unknown"
+
+        try:
+            script = """
+            (() => {
+                const bodyText = (document.body.textContent || '').trim();
+                const contentType = document.contentType || '';
+                const hasPreElement = !!document.querySelector('pre');
+
+                // Check for JSON
+                if (bodyText.startsWith('{') || bodyText.startsWith('[')) {
+                    return 'json';
+                }
+
+                // Check for XML
+                if (bodyText.startsWith('<') && contentType.includes('xml')) {
+                    return 'xml';
+                }
+
+                // Check for HTML
+                if (document.querySelector('html') && document.querySelector('body') && !hasPreElement) {
+                    return 'html';
+                }
+
+                // Check for plain text
+                if (hasPreElement || contentType.includes('text/plain')) {
+                    return 'text';
+                }
+
+                return 'unknown';
+            })();
+            """
+
+            content_type = await self._page.evaluate(script)
+            self.logger_bridge.log_info(f"🔍 Detected content type: {content_type}")
+            return content_type
+
+        except Exception as e:
+            self.logger_bridge.log_error(f"Content type detection error: {e}")
+            return "unknown"
+
+    async def get_page_html(self) -> Optional[str]:
+        """Get full HTML content from current page."""
+        if not self._page:
+            return None
+
+        try:
+            self.logger_bridge.log_info("📄 Extracting HTML content...")
+            html = await self._page.content()
+
+            if html:
+                self.logger_bridge.log_info(f"✅ HTML extracted: {len(html)} characters")
+                return html
+            else:
+                self.logger_bridge.log_warning("❌ No HTML content found")
+                return None
+
+        except Exception as e:
+            self.logger_bridge.log_error(f"HTML extraction error: {e}")
+            return None
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp in ISO format."""
+        from datetime import datetime
+        return datetime.now().isoformat()
+
+
+# Export
+__all__ = ["DataExtractionManager"]
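Note: the following is a minimal usage sketch for the DataExtractionManager added above, assuming a Playwright async page obtained as shown; the stub logger stands in for BrowserLoggerBridge and is not part of the package.

# Hedged sketch, not part of the release: exercises the new manager against a JSON endpoint.
import asyncio
from playwright.async_api import async_playwright
from unrealon_browser.managers import DataExtractionManager

class _StubBridge:
    # Minimal stand-in for BrowserLoggerBridge; only the methods the manager calls.
    def log_info(self, msg): print(msg)
    def log_warning(self, msg): print(msg)
    def log_error(self, msg): print(msg)

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://httpbin.org/json")

        extractor = DataExtractionManager(page, _StubBridge())
        print(await extractor.detect_content_type())   # expected: 'json'
        print(await extractor.get_json_content())      # parsed dict, or None on failure
        await browser.close()

asyncio.run(main())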
@@ -81,7 +81,6 @@ class BrowserLoggerBridge:
             "navigation_failed": 0,
             "stealth_applied": 0,
             "captcha_detected": 0,
-            "captcha_solved": 0,
             "profile_created": 0,
             "cookies_saved": 0,
         }
@@ -194,17 +193,6 @@ class BrowserLoggerBridge:
             detected_at=result.detected_at.isoformat(),
         )
 
-    def log_captcha_solved(self, proxy_host: str, proxy_port: int, manual: bool = True) -> None:
-        """Log captcha resolution"""
-        self._browser_events["captcha_solved"] += 1
-        self._log_info(
-            f"Captcha solved for proxy {proxy_host}:{proxy_port}",
-            proxy_host=proxy_host,
-            proxy_port=proxy_port,
-            resolution_method="manual" if manual else "automatic",
-            cookies_will_be_saved=True,
-        )
-
     def log_profile_created(self, profile_name: str, proxy_info: Optional[Dict[str, Any]] = None) -> None:
         """Log profile creation"""
         self._browser_events["profile_created"] += 1
@@ -21,7 +21,7 @@ from ..utilities.logging import LoggingUtility
 from ..utilities.serialization import SerializationUtility
 
 from ...managers import (
-    LoggerManager, HttpManager, BrowserManager, CacheManager,
+    LoggerManager, HttpManager, HttpxManager, BrowserManager, CacheManager,
     ProxyManager, ThreadManager, UpdateManager, ManagerRegistry
 )
 
@@ -77,6 +77,7 @@ class UniversalDriver:
         self.manager_registry: Optional[ManagerRegistry] = None
         self.logger_manager: Optional[LoggerManager] = None
         self.http: Optional[HttpManager] = None
+        self.httpx: Optional[HttpxManager] = None
         self.browser: Optional[BrowserManager] = None
         self.cache: Optional[CacheManager] = None
         self.proxy: Optional[ProxyManager] = None
@@ -8,11 +8,12 @@ import logging
 from typing import TYPE_CHECKING
 
 from ...managers import (
-    LoggerManager, HttpManager, BrowserManager, CacheManager,
+    LoggerManager, HttpManager, HttpxManager, BrowserManager, CacheManager,
     ProxyManager, ThreadManager, UpdateManager, ManagerRegistry
 )
 from ...managers.logger import LoggerManagerConfig
 from ...managers.http import HttpManagerConfig
+from ...managers.http import HttpxManagerConfig
 from ...managers.browser import BrowserManagerConfig
 from ...managers.cache import CacheManagerConfig
 from ...managers.proxy import ProxyManagerConfig
@@ -45,6 +46,7 @@ class ManagerFactory:
         # Setup each manager
         ManagerFactory._setup_logger_manager(driver, manager_registry)
         ManagerFactory._setup_http_manager(driver, manager_registry)
+        ManagerFactory._setup_httpx_manager(driver, manager_registry)
         ManagerFactory._setup_browser_manager(driver, manager_registry)
         ManagerFactory._setup_cache_manager(driver, manager_registry)
         ManagerFactory._setup_proxy_manager(driver, manager_registry)
@@ -78,6 +80,17 @@ class ManagerFactory:
         driver.http = HttpManager(http_config)
         registry.register(driver.http)
 
+    @staticmethod
+    def _setup_httpx_manager(driver: 'UniversalDriver', registry: ManagerRegistry):
+        """Setup HTTPx manager."""
+        httpx_config = HttpxManagerConfig(
+            enabled=True,
+            timeout=driver.config.http_timeout,
+            max_retries=driver.config.max_retries
+        )
+        driver.httpx = HttpxManager(httpx_config)
+        registry.register(driver.httpx)
+
     @staticmethod
     def _setup_browser_manager(driver: 'UniversalDriver', registry: ManagerRegistry):
         """Setup browser manager."""
@@ -4,7 +4,7 @@ Clean manager system for UnrealOn Driver.
 
 from .base import BaseManager, ManagerConfig, ManagerStatus
 from .logger import LoggerManager, LoggerManagerConfig
-from .http import HttpManager, HttpManagerConfig
+from .http import HttpManager, HttpManagerConfig, HttpxManager, HttpxManagerConfig
 from .browser import BrowserManager, BrowserManagerConfig
 from .cache import CacheManager, CacheManagerConfig
 from .proxy import ProxyManager, ProxyManagerConfig
@@ -21,6 +21,7 @@ __all__ = [
     # Managers
     "LoggerManager", "LoggerManagerConfig",
     "HttpManager", "HttpManagerConfig",
+    "HttpxManager", "HttpxManagerConfig",
     "BrowserManager", "BrowserManagerConfig",
     "CacheManager", "CacheManagerConfig",
     "ProxyManager", "ProxyManagerConfig",
@@ -113,7 +113,7 @@ class BrowserManager(BaseManager):
            raise RuntimeError("Failed to initialize browser")

        try:
-            html = await self.browser.get_page_content_async()
+            html = await self.browser.data_extraction.get_page_html()
            self.stats.record_operation(True, 0.0)
            return html
        except Exception as e:
@@ -121,6 +121,36 @@
            self.stats.record_operation(False, 0.0)
            return None

+    async def get_json_content(self) -> Optional[dict]:
+        """Extract JSON content from current page via DataExtractionManager."""
+        # Ensure browser is initialized
+        if not await self._ensure_browser_initialized():
+            raise RuntimeError("Failed to initialize browser")
+
+        try:
+            result = await self.browser.data_extraction.get_json_content()
+            self.stats.record_operation(True, 0.0)
+            return result
+        except Exception as e:
+            self.logger.error(f"JSON extraction failed: {e}")
+            self.stats.record_operation(False, 0.0)
+            return None
+
+    async def get_page_text(self) -> Optional[str]:
+        """Get plain text content from current page via DataExtractionManager."""
+        # Ensure browser is initialized
+        if not await self._ensure_browser_initialized():
+            raise RuntimeError("Failed to initialize browser")
+
+        try:
+            result = await self.browser.data_extraction.get_page_text()
+            self.stats.record_operation(True, 0.0)
+            return result
+        except Exception as e:
+            self.logger.error(f"Text extraction failed: {e}")
+            self.stats.record_operation(False, 0.0)
+            return None
+
    async def execute_script_async(self, script: str) -> any:
        """Execute JavaScript on current page via ScriptManager."""
        # Ensure browser is initialized
@@ -136,3 +166,9 @@
            self.logger.error(f"Script execution failed: {e}")
            self.stats.record_operation(False, 0.0)
            raise
+
+    async def cleanup(self) -> None:
+        """Cleanup browser resources."""
+        if self.browser:
+            await self.browser.close_async()
+            self.browser = None
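Note: a rough sketch of how the new driver-level helpers might be called once a UniversalDriver has been constructed and its managers set up by ManagerFactory; driver construction and page navigation are assumed to happen elsewhere and are not shown in this diff.

# Hedged sketch, not part of the release: `driver` is an already-initialized UniversalDriver.
async def extract_current_page(driver) -> None:
    # Prefer structured JSON; fall back to plain text if the page is not an API response.
    data = await driver.browser.get_json_content()   # dict or None, via DataExtractionManager
    if data is not None:
        print("JSON keys:", list(data)[:10])
    else:
        text = await driver.browser.get_page_text()  # plain text or None
        print("No JSON; first 200 chars:", (text or "")[:200])

    # New in this release: explicit cleanup closes the underlying browser session.
    await driver.browser.cleanup()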
@@ -1,9 +1,10 @@
 """
-Clean HTTP manager for requests.
+Clean HTTP managers for requests - both aiohttp and httpx with HTTP/2 support.
 """
 
 import asyncio
 import aiohttp
+import httpx
 from typing import Dict, Any, Optional
 from pydantic import Field
 
@@ -17,8 +18,16 @@ class HttpManagerConfig(ManagerConfig):
     connector_limit: int = Field(default=30, description="Connector limit per host")
 
 
+class HttpxManagerConfig(ManagerConfig):
+    """HTTPx manager configuration with HTTP/2 support."""
+    user_agent: str = Field(default="UnrealOn-Driver/1.0", description="User agent string")
+    max_connections: int = Field(default=100, description="Max concurrent connections")
+    connector_limit: int = Field(default=30, description="Connector limit per host")
+    http2: bool = Field(default=True, description="Enable HTTP/2 support")
+
+
 class HttpManager(BaseManager):
-    """Clean HTTP manager with aiohttp."""
+    """Clean HTTP manager with aiohttp (original)."""
 
     def __init__(self, config: HttpManagerConfig):
         super().__init__(config, "http")
@@ -105,3 +114,98 @@
         finally:
             duration = asyncio.get_event_loop().time() - start_time
             self.stats.record_operation(success, duration)
+
+
+class HttpxManager(BaseManager):
+    """Modern HTTP manager with httpx and HTTP/2 support."""
+
+    def __init__(self, config: HttpxManagerConfig):
+        super().__init__(config, "httpx")
+        self.config: HttpxManagerConfig = config
+        self.client: Optional[httpx.AsyncClient] = None
+
+    async def _initialize(self) -> bool:
+        """Initialize HTTP client with HTTP/2 support."""
+        try:
+            # Create limits
+            limits = httpx.Limits(
+                max_keepalive_connections=self.config.max_connections,
+                max_connections=self.config.max_connections,
+                keepalive_expiry=300
+            )
+
+            # Create timeout
+            timeout = httpx.Timeout(self.config.timeout)
+
+            # Default headers
+            headers = {"User-Agent": self.config.user_agent}
+
+            # Create client with HTTP/2 support
+            self.client = httpx.AsyncClient(
+                limits=limits,
+                timeout=timeout,
+                headers=headers,
+                http2=self.config.http2,
+                verify=True,
+                follow_redirects=True
+            )
+
+            return True
+
+        except Exception as e:
+            self.logger.error(f"HTTPx manager initialization failed: {e}")
+            return False
+
+    async def _shutdown(self):
+        """Shutdown HTTP client."""
+        if self.client:
+            await self.client.aclose()
+            self.client = None
+
+    async def get(self, url: str, **kwargs) -> httpx.Response:
+        """Make GET request."""
+        if not self.client:
+            raise RuntimeError("HTTPx manager not initialized")
+
+        start_time = asyncio.get_event_loop().time()
+        success = False
+
+        try:
+            response = await self.client.get(url, **kwargs)
+            success = True
+            return response
+        finally:
+            duration = asyncio.get_event_loop().time() - start_time
+            self.stats.record_operation(success, duration)
+
+    async def post(self, url: str, **kwargs) -> httpx.Response:
+        """Make POST request."""
+        if not self.client:
+            raise RuntimeError("HTTPx manager not initialized")
+
+        start_time = asyncio.get_event_loop().time()
+        success = False
+
+        try:
+            response = await self.client.post(url, **kwargs)
+            success = True
+            return response
+        finally:
+            duration = asyncio.get_event_loop().time() - start_time
+            self.stats.record_operation(success, duration)
+
+    async def request(self, method: str, url: str, **kwargs) -> httpx.Response:
+        """Make generic request."""
+        if not self.client:
+            raise RuntimeError("HTTPx manager not initialized")
+
+        start_time = asyncio.get_event_loop().time()
+        success = False
+
+        try:
+            response = await self.client.request(method, url, **kwargs)
+            success = True
+            return response
+        finally:
+            duration = asyncio.get_event_loop().time() - start_time
+            self.stats.record_operation(success, duration)
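Note: a minimal standalone sketch of the new HttpxManager, assuming the BaseManager lifecycle hooks (_initialize/_shutdown) can be driven directly as shown; in normal use ManagerFactory wires this up as driver.httpx. HTTP/2 in httpx also requires the optional h2 dependency to be installed.

# Hedged sketch, not part of the release.
import asyncio
from unrealon_driver.managers.http import HttpxManager, HttpxManagerConfig

async def main():
    manager = HttpxManager(HttpxManagerConfig(timeout=30, http2=True))
    if not await manager._initialize():          # calling the hook directly is an assumption
        raise RuntimeError("httpx client failed to initialize")
    try:
        response = await manager.get("https://example.com")
        print(response.http_version, response.status_code)  # e.g. "HTTP/2" 200
    finally:
        await manager._shutdown()

asyncio.run(main())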
@@ -3,8 +3,9 @@ Clean threading manager.
 """
 
 import asyncio
+import inspect
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Callable, Optional, Dict
+from typing import Any, Callable, Optional, Dict, Coroutine, Union
 from pydantic import Field
 
 from .base import BaseManager, ManagerConfig
@@ -38,12 +39,52 @@ class ThreadManager(BaseManager):
         self.executor = None
 
     async def run_in_thread(self, func: Callable, *args, **kwargs) -> Any:
-        """Run function in thread pool."""
+        """Run function in thread pool (supports both sync and async functions)."""
         if not self.executor:
             raise RuntimeError("Thread manager not initialized")
 
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(self.executor, func, *args, **kwargs)
+        # Check if function is async
+        if inspect.iscoroutinefunction(func):
+            # For async functions, we need to run them in a new event loop in the thread
+            def run_async_in_thread():
+                # Create new event loop for this thread
+                new_loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(new_loop)
+                try:
+                    return new_loop.run_until_complete(func(*args, **kwargs))
+                finally:
+                    new_loop.close()
+
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(self.executor, run_async_in_thread)
+        else:
+            # For sync functions, use normal executor
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(self.executor, func, *args, **kwargs)
+
+    async def run_concurrent_async(self, async_funcs: list[Callable], max_concurrent: int = None) -> list[Any]:
+        """
+        Run multiple async functions concurrently using semaphore for control.
+
+        This is more efficient than ThreadManager for pure async operations,
+        but provides controlled concurrency.
+        """
+        if not async_funcs:
+            return []
+
+        # Use max_workers as default concurrency limit
+        max_concurrent = max_concurrent or self.config.max_workers
+
+        # Create semaphore to limit concurrency
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def run_with_semaphore(func):
+            async with semaphore:
+                return await func()
+
+        # Execute all functions concurrently with semaphore control
+        tasks = [run_with_semaphore(func) for func in async_funcs]
+        return await asyncio.gather(*tasks, return_exceptions=True)
 
     async def _health_check(self) -> Dict[str, Any]:
         """Thread manager health check."""