unrealon 1.1.1__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +16 -6
- unrealon-1.1.5.dist-info/METADATA +621 -0
- unrealon-1.1.5.dist-info/RECORD +54 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/entry_points.txt +1 -1
- unrealon_browser/__init__.py +3 -6
- unrealon_browser/core/browser_manager.py +86 -84
- unrealon_browser/dto/models/config.py +2 -0
- unrealon_browser/managers/captcha.py +165 -185
- unrealon_browser/managers/cookies.py +57 -28
- unrealon_browser/managers/logger_bridge.py +94 -34
- unrealon_browser/managers/profile.py +186 -158
- unrealon_browser/managers/stealth.py +58 -47
- unrealon_driver/__init__.py +8 -21
- unrealon_driver/exceptions.py +5 -0
- unrealon_driver/html_analyzer/__init__.py +32 -0
- unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
- unrealon_driver/html_analyzer/config.py +64 -0
- unrealon_driver/html_analyzer/manager.py +247 -0
- unrealon_driver/html_analyzer/models.py +115 -0
- unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
- unrealon_driver/models/__init__.py +31 -0
- unrealon_driver/models/websocket.py +98 -0
- unrealon_driver/parser/__init__.py +4 -23
- unrealon_driver/parser/cli_manager.py +6 -5
- unrealon_driver/parser/daemon_manager.py +242 -66
- unrealon_driver/parser/managers/__init__.py +0 -21
- unrealon_driver/parser/managers/config.py +15 -3
- unrealon_driver/parser/parser_manager.py +225 -395
- unrealon_driver/smart_logging/__init__.py +24 -0
- unrealon_driver/smart_logging/models.py +44 -0
- unrealon_driver/smart_logging/smart_logger.py +406 -0
- unrealon_driver/smart_logging/unified_logger.py +525 -0
- unrealon_driver/websocket/__init__.py +31 -0
- unrealon_driver/websocket/client.py +249 -0
- unrealon_driver/websocket/config.py +188 -0
- unrealon_driver/websocket/manager.py +90 -0
- unrealon-1.1.1.dist-info/METADATA +0 -722
- unrealon-1.1.1.dist-info/RECORD +0 -82
- unrealon_bridge/__init__.py +0 -114
- unrealon_bridge/cli.py +0 -316
- unrealon_bridge/client/__init__.py +0 -93
- unrealon_bridge/client/base.py +0 -78
- unrealon_bridge/client/commands.py +0 -89
- unrealon_bridge/client/connection.py +0 -90
- unrealon_bridge/client/events.py +0 -65
- unrealon_bridge/client/health.py +0 -38
- unrealon_bridge/client/html_parser.py +0 -146
- unrealon_bridge/client/logging.py +0 -139
- unrealon_bridge/client/proxy.py +0 -70
- unrealon_bridge/client/scheduler.py +0 -450
- unrealon_bridge/client/session.py +0 -70
- unrealon_bridge/configs/__init__.py +0 -14
- unrealon_bridge/configs/bridge_config.py +0 -212
- unrealon_bridge/configs/bridge_config.yaml +0 -39
- unrealon_bridge/models/__init__.py +0 -138
- unrealon_bridge/models/base.py +0 -28
- unrealon_bridge/models/command.py +0 -41
- unrealon_bridge/models/events.py +0 -40
- unrealon_bridge/models/html_parser.py +0 -79
- unrealon_bridge/models/logging.py +0 -55
- unrealon_bridge/models/parser.py +0 -63
- unrealon_bridge/models/proxy.py +0 -41
- unrealon_bridge/models/requests.py +0 -95
- unrealon_bridge/models/responses.py +0 -88
- unrealon_bridge/models/scheduler.py +0 -592
- unrealon_bridge/models/session.py +0 -28
- unrealon_bridge/server/__init__.py +0 -91
- unrealon_bridge/server/base.py +0 -171
- unrealon_bridge/server/handlers/__init__.py +0 -23
- unrealon_bridge/server/handlers/command.py +0 -110
- unrealon_bridge/server/handlers/html_parser.py +0 -139
- unrealon_bridge/server/handlers/logging.py +0 -95
- unrealon_bridge/server/handlers/parser.py +0 -95
- unrealon_bridge/server/handlers/proxy.py +0 -75
- unrealon_bridge/server/handlers/scheduler.py +0 -545
- unrealon_bridge/server/handlers/session.py +0 -66
- unrealon_driver/browser/__init__.py +0 -8
- unrealon_driver/browser/config.py +0 -74
- unrealon_driver/browser/manager.py +0 -416
- unrealon_driver/parser/managers/browser.py +0 -51
- unrealon_driver/parser/managers/logging.py +0 -609
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/WHEEL +0 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,110 +9,75 @@ Strict compliance with CRITICAL_REQUIREMENTS.md:
|
|
|
9
9
|
- No try blocks in imports
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
import asyncio
|
|
13
12
|
from datetime import datetime, timezone
|
|
14
|
-
from typing import Optional
|
|
15
|
-
from
|
|
16
|
-
from pydantic import BaseModel, Field, ConfigDict, field_validator
|
|
13
|
+
from typing import Optional
|
|
14
|
+
from pydantic import BaseModel, Field, ConfigDict
|
|
17
15
|
|
|
18
|
-
from
|
|
19
|
-
from unrealon_rpc.logging import get_logger
|
|
16
|
+
from .managers import ConfigManager, ParserConfig, ResultManager, ErrorManager, RetryConfig
|
|
20
17
|
|
|
21
|
-
from
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
18
|
+
# from unrealon_browser import BrowserManager, BrowserConfig # Temporary comment to avoid circular import
|
|
19
|
+
|
|
20
|
+
# Import UnifiedLogger and HTML Analyzer
|
|
21
|
+
from unrealon_driver.smart_logging import create_unified_logger, LogLevel
|
|
22
|
+
from unrealon_driver.html_analyzer import create_html_analyzer, HTMLCleaningConfig, HTMLParseResult
|
|
23
|
+
from unrealon_driver.websocket import websocket_manager, WebSocketConfig
|
|
24
|
+
from unrealon_browser.core import BrowserManager
|
|
25
|
+
from unrealon_browser.dto.models.config import BrowserConfig
|
|
29
26
|
|
|
30
27
|
|
|
31
28
|
class ParserManagerConfig(BaseModel):
|
|
32
29
|
"""Complete parser manager configuration"""
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
)
|
|
37
|
-
|
|
30
|
+
|
|
31
|
+
model_config = ConfigDict(validate_assignment=True, extra="forbid")
|
|
32
|
+
|
|
38
33
|
# Core configuration
|
|
39
|
-
parser_config: ParserConfig = Field(
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
html_config: HTMLCleaningConfig = Field(
|
|
50
|
-
default_factory=HTMLCleaningConfig,
|
|
51
|
-
description="HTML cleaning configuration"
|
|
52
|
-
)
|
|
53
|
-
browser_config: BrowserConfig = Field(
|
|
54
|
-
default_factory=BrowserConfig,
|
|
55
|
-
description="Browser configuration"
|
|
56
|
-
)
|
|
57
|
-
retry_config: RetryConfig = Field(
|
|
58
|
-
default_factory=RetryConfig,
|
|
59
|
-
description="Retry configuration"
|
|
60
|
-
)
|
|
61
|
-
|
|
34
|
+
parser_config: ParserConfig = Field(default_factory=ParserConfig, description="Core parser configuration")
|
|
35
|
+
|
|
36
|
+
# Logging configuration (simplified)
|
|
37
|
+
console_enabled: bool = Field(default=True, description="Enable console logging")
|
|
38
|
+
file_enabled: bool = Field(default=True, description="Enable file logging")
|
|
39
|
+
console_level: LogLevel = Field(default=LogLevel.INFO, description="Console log level")
|
|
40
|
+
file_level: LogLevel = Field(default=LogLevel.DEBUG, description="File log level")
|
|
41
|
+
html_config: HTMLCleaningConfig = Field(default_factory=HTMLCleaningConfig, description="HTML cleaning configuration")
|
|
42
|
+
retry_config: RetryConfig = Field(default_factory=RetryConfig, description="Retry configuration")
|
|
43
|
+
|
|
62
44
|
# Bridge settings
|
|
63
|
-
bridge_enabled: bool = Field(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
def model_post_init(self, __context) -> None:
|
|
73
|
-
"""Sync configurations across managers"""
|
|
74
|
-
# Sync parser name across all configs
|
|
75
|
-
parser_name = self.parser_config.parser_name
|
|
76
|
-
if hasattr(self.logging_config, 'parser_name'):
|
|
77
|
-
self.logging_config.parser_name = parser_name
|
|
78
|
-
|
|
79
|
-
# Sync system directories
|
|
80
|
-
system_dir = self.parser_config.system_dir
|
|
81
|
-
if system_dir:
|
|
82
|
-
self.logging_config.log_dir = system_dir / "logs"
|
|
83
|
-
self.browser_config.screenshots_dir = system_dir / "screenshots"
|
|
84
|
-
self.browser_config.cookies_file = system_dir / "cookies.json"
|
|
45
|
+
bridge_enabled: bool = Field(default=True, description="Enable bridge connection")
|
|
46
|
+
auto_register: bool = Field(default=True, description="Auto-register parser with bridge")
|
|
47
|
+
|
|
48
|
+
# SmartLogger settings
|
|
49
|
+
bridge_logs_url: Optional[str] = Field(default=None, description="Bridge logs WebSocket URL (ws://localhost:8001/logs)")
|
|
50
|
+
log_batch_interval: float = Field(default=5.0, description="Log batch interval in seconds")
|
|
51
|
+
daemon_mode: Optional[bool] = Field(default=None, description="Daemon mode for logging (None = auto-detect)")
|
|
85
52
|
|
|
86
53
|
|
|
87
54
|
class ParserStats(BaseModel):
|
|
88
55
|
"""Comprehensive parser statistics"""
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
)
|
|
93
|
-
|
|
56
|
+
|
|
57
|
+
model_config = ConfigDict(validate_assignment=True, extra="forbid")
|
|
58
|
+
|
|
94
59
|
parser_id: str = Field(...)
|
|
95
60
|
parser_name: str = Field(...)
|
|
96
61
|
session_id: Optional[str] = Field(default=None)
|
|
97
|
-
|
|
62
|
+
|
|
98
63
|
# Timing
|
|
99
64
|
session_start: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
100
65
|
session_duration: float = Field(default=0.0, ge=0.0)
|
|
101
|
-
|
|
66
|
+
|
|
102
67
|
# Operations
|
|
103
68
|
operations_completed: int = Field(default=0, ge=0)
|
|
104
69
|
operations_failed: int = Field(default=0, ge=0)
|
|
105
70
|
success_rate: float = Field(default=0.0, ge=0.0, le=100.0)
|
|
106
|
-
|
|
71
|
+
|
|
107
72
|
# Content processing
|
|
108
73
|
pages_processed: int = Field(default=0, ge=0)
|
|
109
74
|
html_cleaned_count: int = Field(default=0, ge=0)
|
|
110
75
|
total_html_reduction: float = Field(default=0.0, ge=0.0)
|
|
111
|
-
|
|
76
|
+
|
|
112
77
|
# Errors
|
|
113
78
|
total_errors: int = Field(default=0, ge=0)
|
|
114
79
|
retries_attempted: int = Field(default=0, ge=0)
|
|
115
|
-
|
|
80
|
+
|
|
116
81
|
# Bridge
|
|
117
82
|
bridge_connected: bool = Field(default=False)
|
|
118
83
|
bridge_messages_sent: int = Field(default=0, ge=0)
|
|
@@ -120,6 +85,7 @@ class ParserStats(BaseModel):
|
|
|
120
85
|
|
|
121
86
|
class ParserManagerError(Exception):
|
|
122
87
|
"""Base exception for parser manager"""
|
|
88
|
+
|
|
123
89
|
def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
|
|
124
90
|
self.message = message
|
|
125
91
|
self.operation = operation
|
|
@@ -129,18 +95,20 @@ class ParserManagerError(Exception):
|
|
|
129
95
|
|
|
130
96
|
class InitializationError(ParserManagerError):
|
|
131
97
|
"""Raised when parser manager initialization fails"""
|
|
98
|
+
|
|
132
99
|
pass
|
|
133
100
|
|
|
134
101
|
|
|
135
102
|
class OperationError(ParserManagerError):
|
|
136
103
|
"""Raised when parser operation fails"""
|
|
104
|
+
|
|
137
105
|
pass
|
|
138
106
|
|
|
139
107
|
|
|
140
108
|
class ParserManager:
|
|
141
109
|
"""
|
|
142
110
|
🚀 Parser Manager - Unified parser management system
|
|
143
|
-
|
|
111
|
+
|
|
144
112
|
Features:
|
|
145
113
|
- Unified Configuration: Single config for all managers
|
|
146
114
|
- Automatic Lifecycle: Handles initialization, execution, cleanup
|
|
@@ -148,422 +116,303 @@ class ParserManager:
|
|
|
148
116
|
- Performance Monitoring: Comprehensive statistics and metrics
|
|
149
117
|
- Bridge Integration: Seamless communication with Django
|
|
150
118
|
- Type Safety: Full Pydantic v2 compliance
|
|
151
|
-
|
|
119
|
+
|
|
152
120
|
Usage:
|
|
153
121
|
config = ParserManagerConfig(
|
|
154
122
|
parser_config=ParserConfig(parser_name="MyParser"),
|
|
155
123
|
bridge_enabled=True
|
|
156
124
|
)
|
|
157
|
-
|
|
125
|
+
|
|
158
126
|
async with ParserManager(config) as parser:
|
|
159
127
|
# Navigate and extract
|
|
160
128
|
html = await parser.get_html("https://example.com")
|
|
161
129
|
cleaned_html = await parser.clean_html(html)
|
|
162
130
|
result = await parser.analyze_html(cleaned_html)
|
|
163
|
-
|
|
131
|
+
|
|
164
132
|
# Results are automatically tracked
|
|
165
133
|
stats = parser.get_stats()
|
|
166
134
|
"""
|
|
167
|
-
|
|
135
|
+
|
|
168
136
|
def __init__(self, config: ParserManagerConfig):
|
|
169
137
|
self.config = config
|
|
170
|
-
|
|
171
|
-
|
|
138
|
+
|
|
172
139
|
# Initialize managers
|
|
173
140
|
self.config_manager = ConfigManager(self.config.parser_config)
|
|
174
141
|
self.result_manager = ResultManager(self.config.parser_config.parser_id)
|
|
175
|
-
self.error_manager = ErrorManager(
|
|
176
|
-
|
|
177
|
-
self.
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
142
|
+
self.error_manager = ErrorManager()
|
|
143
|
+
# Initialize HTML Analyzer (WebSocket URL auto-detected)
|
|
144
|
+
self.html_analyzer = create_html_analyzer(parser_id=self.config.parser_config.parser_id, api_key=self.config.parser_config.api_key, cleaning_config=self.config.html_config)
|
|
145
|
+
# Create default browser config
|
|
146
|
+
browser_config = BrowserConfig(parser_name=self.config.parser_config.parser_name)
|
|
147
|
+
self.browser_manager = BrowserManager(browser_config, parser_id=self.config.parser_config.parser_id)
|
|
148
|
+
|
|
149
|
+
# Initialize WebSocket connection config
|
|
150
|
+
if self.config.bridge_logs_url:
|
|
151
|
+
self._websocket_config = WebSocketConfig(url=self.config.bridge_logs_url, api_key=self.config.parser_config.api_key, parser_id=self.config.parser_config.parser_id)
|
|
152
|
+
else:
|
|
153
|
+
self._websocket_config = None
|
|
154
|
+
|
|
155
|
+
# Initialize UnifiedLogger
|
|
156
|
+
log_file = None
|
|
157
|
+
if self.config.parser_config.system_dir:
|
|
158
|
+
log_file = self.config.parser_config.system_dir / "logs" / f"{self.config.parser_config.parser_name}.log"
|
|
159
|
+
|
|
160
|
+
self.logger = create_unified_logger(
|
|
161
|
+
parser_id=self.config.parser_config.parser_id,
|
|
162
|
+
parser_name=self.config.parser_config.parser_name,
|
|
163
|
+
bridge_logs_url=self.config.bridge_logs_url,
|
|
164
|
+
log_file=log_file,
|
|
165
|
+
console_enabled=self.config.console_enabled,
|
|
166
|
+
file_enabled=self.config.file_enabled,
|
|
167
|
+
console_level=self.config.console_level,
|
|
168
|
+
file_level=self.config.file_level,
|
|
169
|
+
batch_interval=self.config.log_batch_interval,
|
|
170
|
+
daemon_mode=self.config.daemon_mode,
|
|
171
|
+
)
|
|
172
|
+
|
|
183
173
|
# State
|
|
184
174
|
self._is_initialized = False
|
|
185
175
|
self._session_id: Optional[str] = None
|
|
186
|
-
self._stats = ParserStats(
|
|
187
|
-
|
|
188
|
-
parser_name=self.config.parser_config.parser_name
|
|
189
|
-
)
|
|
190
|
-
|
|
176
|
+
self._stats = ParserStats(parser_id=self.config.parser_config.parser_id, parser_name=self.config.parser_config.parser_name)
|
|
177
|
+
|
|
191
178
|
# Register retry configurations
|
|
192
179
|
self._setup_retry_configs()
|
|
193
|
-
|
|
180
|
+
|
|
194
181
|
# ==========================================
|
|
195
182
|
# LIFECYCLE MANAGEMENT
|
|
196
183
|
# ==========================================
|
|
197
|
-
|
|
184
|
+
|
|
198
185
|
async def initialize(self) -> None:
|
|
199
186
|
"""Initialize all managers and establish connections"""
|
|
200
187
|
if self._is_initialized:
|
|
201
188
|
return
|
|
202
|
-
|
|
189
|
+
|
|
203
190
|
try:
|
|
204
|
-
self.
|
|
205
|
-
|
|
206
|
-
# Initialize
|
|
207
|
-
if self.
|
|
208
|
-
await self.
|
|
209
|
-
|
|
191
|
+
self.logger.info("🚀 Initializing parser manager...")
|
|
192
|
+
|
|
193
|
+
# Initialize WebSocket connection
|
|
194
|
+
if self._websocket_config:
|
|
195
|
+
await websocket_manager.initialize(self._websocket_config)
|
|
196
|
+
if websocket_manager.connected:
|
|
197
|
+
self.logger.info("🔌 WebSocket connected")
|
|
198
|
+
else:
|
|
199
|
+
self.logger.warning("🔌 WebSocket connection failed")
|
|
200
|
+
|
|
210
201
|
# Initialize browser
|
|
211
|
-
await self.browser_manager.
|
|
212
|
-
|
|
213
|
-
# Update logging manager with bridge client
|
|
214
|
-
if self.bridge_client:
|
|
215
|
-
self.logging_manager.update_bridge_client(self.bridge_client)
|
|
216
|
-
|
|
217
|
-
# Register parser if enabled
|
|
218
|
-
if self.config.auto_register and self.bridge_client:
|
|
219
|
-
await self._register_parser()
|
|
220
|
-
|
|
202
|
+
await self.browser_manager.initialize_async()
|
|
203
|
+
|
|
221
204
|
self._is_initialized = True
|
|
222
|
-
self.
|
|
223
|
-
|
|
205
|
+
self.logger.info("✅ Parser manager initialized successfully")
|
|
206
|
+
|
|
224
207
|
except Exception as e:
|
|
225
208
|
self.error_manager.record_error(e, "initialization")
|
|
226
|
-
raise InitializationError(
|
|
227
|
-
|
|
228
|
-
operation="initialization"
|
|
229
|
-
) from e
|
|
230
|
-
|
|
209
|
+
raise InitializationError(message=f"Failed to initialize parser manager: {e}", operation="initialization") from e
|
|
210
|
+
|
|
231
211
|
async def cleanup(self) -> None:
|
|
232
212
|
"""Clean up all resources"""
|
|
233
|
-
self.
|
|
234
|
-
|
|
213
|
+
self.logger.info("🧹 Cleaning up parser manager...")
|
|
214
|
+
|
|
235
215
|
cleanup_errors = []
|
|
236
|
-
|
|
216
|
+
|
|
237
217
|
# End session if active
|
|
238
|
-
if self._session_id
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
except Exception as e:
|
|
242
|
-
cleanup_errors.append(f"end_session: {e}")
|
|
243
|
-
|
|
218
|
+
if self._session_id:
|
|
219
|
+
await self.end_session()
|
|
220
|
+
|
|
244
221
|
# Cleanup browser
|
|
245
222
|
try:
|
|
246
|
-
await self.browser_manager.
|
|
223
|
+
await self.browser_manager.close_async()
|
|
247
224
|
except Exception as e:
|
|
248
225
|
cleanup_errors.append(f"browser_cleanup: {e}")
|
|
249
|
-
|
|
250
|
-
# Disconnect
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
226
|
+
|
|
227
|
+
# Disconnect WebSocket
|
|
228
|
+
try:
|
|
229
|
+
await websocket_manager.disconnect()
|
|
230
|
+
except Exception as e:
|
|
231
|
+
cleanup_errors.append(f"websocket_disconnect: {e}")
|
|
232
|
+
|
|
257
233
|
# Update final stats
|
|
258
234
|
self._update_session_stats()
|
|
259
|
-
|
|
235
|
+
|
|
236
|
+
# Cleanup UnifiedLogger
|
|
237
|
+
try:
|
|
238
|
+
await self.logger.close()
|
|
239
|
+
except Exception as e:
|
|
240
|
+
cleanup_errors.append(f"logger_cleanup: {e}")
|
|
241
|
+
|
|
260
242
|
# Log cleanup errors but don't raise
|
|
261
243
|
if cleanup_errors:
|
|
262
|
-
self.
|
|
263
|
-
|
|
264
|
-
self.
|
|
265
|
-
|
|
244
|
+
self.logger.warning(f"Cleanup errors: {'; '.join(cleanup_errors)}")
|
|
245
|
+
|
|
246
|
+
self.logger.info("✅ Parser manager cleanup completed")
|
|
247
|
+
|
|
266
248
|
# ==========================================
|
|
267
249
|
# CORE PARSING METHODS
|
|
268
250
|
# ==========================================
|
|
269
|
-
|
|
251
|
+
|
|
270
252
|
async def get_html(self, url: str) -> str:
|
|
271
253
|
"""Get HTML content from URL with error handling"""
|
|
272
254
|
if not self._is_initialized:
|
|
273
255
|
await self.initialize()
|
|
274
|
-
|
|
256
|
+
|
|
275
257
|
@self.error_manager.with_retry("get_html", self.config.retry_config)
|
|
276
258
|
async def _get_html_with_retry():
|
|
277
|
-
self.
|
|
259
|
+
self.logger.url_access(url, "fetching")
|
|
278
260
|
html = await self.browser_manager.get_html(url)
|
|
279
261
|
self._stats.pages_processed += 1
|
|
280
262
|
return html
|
|
281
|
-
|
|
263
|
+
|
|
282
264
|
try:
|
|
283
265
|
return await _get_html_with_retry()
|
|
284
266
|
except Exception as e:
|
|
285
267
|
self._stats.total_errors += 1
|
|
286
|
-
raise OperationError(
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
) from e
|
|
291
|
-
|
|
292
|
-
async def clean_html(self, html: str, **kwargs) -> str:
|
|
293
|
-
"""Clean HTML content for LLM analysis"""
|
|
294
|
-
try:
|
|
295
|
-
self.logging_manager.info(f"🧹 Cleaning HTML: {len(html)} characters")
|
|
296
|
-
|
|
297
|
-
cleaned_html = await self.html_manager.clean_html(html, **kwargs)
|
|
298
|
-
|
|
299
|
-
# Update stats
|
|
300
|
-
self._stats.html_cleaned_count += 1
|
|
301
|
-
stats = self.html_manager.get_cleaning_stats(html, cleaned_html)
|
|
302
|
-
self._stats.total_html_reduction += stats.size_reduction_percent
|
|
303
|
-
|
|
304
|
-
self.logging_manager.info(
|
|
305
|
-
f"✅ HTML cleaned: {len(html)} → {len(cleaned_html)} chars "
|
|
306
|
-
f"({stats.size_reduction_percent:.1f}% reduction)"
|
|
307
|
-
)
|
|
308
|
-
|
|
309
|
-
return cleaned_html
|
|
310
|
-
|
|
311
|
-
except Exception as e:
|
|
312
|
-
self._stats.total_errors += 1
|
|
313
|
-
raise OperationError(
|
|
314
|
-
message=f"Failed to clean HTML: {e}",
|
|
315
|
-
operation="clean_html"
|
|
316
|
-
) from e
|
|
317
|
-
|
|
318
|
-
async def analyze_html(
|
|
319
|
-
self,
|
|
320
|
-
html: str,
|
|
321
|
-
instructions: Optional[str] = None,
|
|
322
|
-
**kwargs
|
|
323
|
-
) -> dict[str, str]:
|
|
324
|
-
"""Analyze HTML content via bridge"""
|
|
325
|
-
if not self.bridge_client:
|
|
326
|
-
raise OperationError(
|
|
327
|
-
message="Bridge client not available for HTML analysis",
|
|
328
|
-
operation="analyze_html"
|
|
329
|
-
)
|
|
330
|
-
|
|
331
|
-
try:
|
|
332
|
-
self.logging_manager.info("🤖 Analyzing HTML with LLM...")
|
|
333
|
-
|
|
334
|
-
result = await self.bridge_client.parse_html(
|
|
335
|
-
html_content=html,
|
|
336
|
-
instructions=instructions,
|
|
337
|
-
parse_type="general",
|
|
338
|
-
timeout=kwargs.get("timeout", 60),
|
|
339
|
-
metadata=kwargs.get("metadata", {})
|
|
340
|
-
)
|
|
341
|
-
|
|
342
|
-
return {
|
|
343
|
-
"success": str(result.success),
|
|
344
|
-
"parsed_data": str(result.parsed_data),
|
|
345
|
-
"markdown": result.markdown or "",
|
|
346
|
-
"error_message": result.error_message or ""
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
except Exception as e:
|
|
350
|
-
self._stats.total_errors += 1
|
|
351
|
-
raise OperationError(
|
|
352
|
-
message=f"Failed to analyze HTML: {e}",
|
|
353
|
-
operation="analyze_html"
|
|
354
|
-
) from e
|
|
355
|
-
|
|
356
|
-
async def parse_url(
|
|
357
|
-
self,
|
|
358
|
-
url: str,
|
|
359
|
-
instructions: Optional[str] = None,
|
|
360
|
-
**kwargs
|
|
361
|
-
) -> dict[str, str]:
|
|
362
|
-
"""Complete parsing workflow: fetch → clean → analyze"""
|
|
268
|
+
raise OperationError(message=f"Failed to get HTML from {url}: {e}", operation="get_html", details={"url": url}) from e
|
|
269
|
+
|
|
270
|
+
async def parse_url(self, url: str, instructions: Optional[str] = None, **kwargs) -> HTMLParseResult:
|
|
271
|
+
"""Complete parsing workflow: fetch → clean → analyze via HTML Analyzer"""
|
|
363
272
|
operation = self.result_manager.start_operation()
|
|
364
|
-
|
|
273
|
+
|
|
365
274
|
try:
|
|
366
|
-
self.
|
|
367
|
-
|
|
275
|
+
self.logger.start_operation("parse_url")
|
|
276
|
+
|
|
368
277
|
# Fetch HTML
|
|
369
278
|
html = await self.get_html(url)
|
|
370
|
-
|
|
371
|
-
#
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
#
|
|
375
|
-
|
|
376
|
-
|
|
279
|
+
|
|
280
|
+
# Delegate complete HTML processing to HTML Analyzer
|
|
281
|
+
analysis_result = await self.html_analyzer.parse_html(html=html, url=url, instructions=instructions, session_id=self._session_id, **kwargs)
|
|
282
|
+
|
|
283
|
+
# Update stats from HTML Analyzer
|
|
284
|
+
html_stats = self.html_analyzer.get_stats()
|
|
285
|
+
self._stats.html_cleaned_count += html_stats.cleaned_count
|
|
286
|
+
self._stats.total_html_reduction += html_stats.total_reduction
|
|
287
|
+
|
|
377
288
|
# Complete operation
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
289
|
+
success = analysis_result.success == "true"
|
|
290
|
+
self.result_manager.complete_operation(data=[], source_urls=[url], success=success)
|
|
291
|
+
|
|
292
|
+
if success:
|
|
293
|
+
self._stats.operations_completed += 1
|
|
294
|
+
else:
|
|
295
|
+
self._stats.operations_failed += 1
|
|
296
|
+
|
|
297
|
+
self.logger.end_operation("parse_url", operation.duration_seconds)
|
|
298
|
+
|
|
387
299
|
return analysis_result
|
|
388
|
-
|
|
300
|
+
|
|
389
301
|
except Exception as e:
|
|
390
|
-
self.result_manager.complete_operation(
|
|
391
|
-
|
|
392
|
-
source_urls=[url],
|
|
393
|
-
success=False,
|
|
394
|
-
error_message=str(e)
|
|
395
|
-
)
|
|
396
|
-
|
|
302
|
+
self.result_manager.complete_operation(data=[], source_urls=[url], success=False, error_message=str(e))
|
|
303
|
+
|
|
397
304
|
self._stats.operations_failed += 1
|
|
398
|
-
self.
|
|
305
|
+
self.logger.error(f"❌ Failed parse_url: {str(e)}")
|
|
399
306
|
raise
|
|
400
|
-
|
|
307
|
+
|
|
401
308
|
# ==========================================
|
|
402
|
-
# SESSION MANAGEMENT
|
|
309
|
+
# SESSION MANAGEMENT (Simplified - Local Only)
|
|
403
310
|
# ==========================================
|
|
404
|
-
|
|
311
|
+
|
|
405
312
|
async def start_session(self, session_type: str = "parsing") -> str:
|
|
406
|
-
"""Start a new parsing session"""
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
"parser_name": self.config.parser_config.parser_name,
|
|
418
|
-
"parser_type": self.config.parser_config.parser_type
|
|
419
|
-
}
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
self._session_id = session_id
|
|
423
|
-
self._stats.session_id = session_id
|
|
424
|
-
self.logging_manager.set_session(session_id)
|
|
425
|
-
|
|
426
|
-
self.logging_manager.info(f"📋 Session started: {session_id}")
|
|
427
|
-
return session_id
|
|
428
|
-
|
|
429
|
-
except Exception as e:
|
|
430
|
-
raise OperationError(
|
|
431
|
-
message=f"Failed to start session: {e}",
|
|
432
|
-
operation="start_session"
|
|
433
|
-
) from e
|
|
434
|
-
|
|
313
|
+
"""Start a new parsing session (local only)"""
|
|
314
|
+
import uuid
|
|
315
|
+
|
|
316
|
+
session_id = f"{session_type}_{uuid.uuid4().hex[:8]}"
|
|
317
|
+
self._session_id = session_id
|
|
318
|
+
self._stats.session_id = session_id
|
|
319
|
+
self.logger.set_session(session_id)
|
|
320
|
+
|
|
321
|
+
self.logger.info(f"📋 Local session started: {session_id}")
|
|
322
|
+
return session_id
|
|
323
|
+
|
|
435
324
|
async def end_session(self) -> None:
|
|
436
325
|
"""End current parsing session"""
|
|
437
|
-
if not self._session_id
|
|
326
|
+
if not self._session_id:
|
|
438
327
|
return
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
self._stats.session_id = None
|
|
445
|
-
|
|
446
|
-
except Exception as e:
|
|
447
|
-
self.logging_manager.warning(f"Failed to end session: {e}")
|
|
448
|
-
|
|
328
|
+
|
|
329
|
+
self.logger.info(f"📋 Local session ended: {self._session_id}")
|
|
330
|
+
self._session_id = None
|
|
331
|
+
self._stats.session_id = None
|
|
332
|
+
|
|
449
333
|
# ==========================================
|
|
450
334
|
# STATISTICS AND MONITORING
|
|
451
335
|
# ==========================================
|
|
452
|
-
|
|
336
|
+
|
|
453
337
|
def get_stats(self) -> ParserStats:
|
|
454
338
|
"""Get comprehensive parser statistics"""
|
|
455
339
|
self._update_session_stats()
|
|
456
340
|
return ParserStats.model_validate(self._stats.model_dump())
|
|
457
|
-
|
|
341
|
+
|
|
458
342
|
def get_manager_stats(self) -> dict[str, dict[str, str]]:
|
|
459
343
|
"""Get statistics from all managers"""
|
|
460
344
|
return {
|
|
461
345
|
"result_manager": self.result_manager.get_stats(),
|
|
462
346
|
"error_manager": self.error_manager.get_error_stats(),
|
|
463
|
-
"browser_manager": self.browser_manager.get_stats().model_dump(mode=
|
|
464
|
-
|
|
347
|
+
"browser_manager": self.browser_manager.get_stats().model_dump(mode="json"),
|
|
348
|
+
# Logging stats removed - using UnifiedLogger now
|
|
465
349
|
}
|
|
466
|
-
|
|
350
|
+
|
|
467
351
|
async def health_check(self) -> dict[str, str]:
|
|
468
352
|
"""Comprehensive health check"""
|
|
469
|
-
health = {
|
|
470
|
-
|
|
471
|
-
"parser_id": self.config.parser_config.parser_id,
|
|
472
|
-
"parser_name": self.config.parser_config.parser_name,
|
|
473
|
-
"initialized": str(self._is_initialized),
|
|
474
|
-
"session_active": str(self._session_id is not None)
|
|
475
|
-
}
|
|
476
|
-
|
|
353
|
+
health = {"status": "healthy", "parser_id": self.config.parser_config.parser_id, "parser_name": self.config.parser_config.parser_name, "initialized": str(self._is_initialized), "session_active": str(self._session_id is not None)}
|
|
354
|
+
|
|
477
355
|
# Check browser health
|
|
478
356
|
try:
|
|
479
357
|
browser_health = await self.browser_manager.health_check()
|
|
480
358
|
health["browser_status"] = browser_health.get("status", "unknown")
|
|
481
359
|
except Exception as e:
|
|
482
360
|
health["browser_status"] = f"error: {e}"
|
|
483
|
-
|
|
484
|
-
# Check
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
else:
|
|
488
|
-
health["bridge_connected"] = "false"
|
|
489
|
-
|
|
361
|
+
|
|
362
|
+
# Check WebSocket connection health
|
|
363
|
+
health["websocket_connected"] = str(websocket_manager.connected)
|
|
364
|
+
|
|
490
365
|
return health
|
|
491
|
-
|
|
366
|
+
|
|
492
367
|
# ==========================================
|
|
493
368
|
# INTERNAL METHODS
|
|
494
369
|
# ==========================================
|
|
495
|
-
|
|
496
|
-
async def _initialize_bridge(self) -> None:
|
|
497
|
-
"""Initialize bridge client"""
|
|
498
|
-
self.bridge_client = ParserBridgeClient(
|
|
499
|
-
websocket_url=self.config.parser_config.websocket_url,
|
|
500
|
-
parser_type=self.config.parser_config.parser_type,
|
|
501
|
-
api_key=self.config.parser_config.api_key
|
|
502
|
-
)
|
|
503
|
-
|
|
504
|
-
await self.bridge_client.bridge_client.connect()
|
|
505
|
-
self._stats.bridge_connected = True
|
|
506
|
-
self.logging_manager.info("🔗 Bridge client connected")
|
|
507
|
-
|
|
508
|
-
async def _register_parser(self) -> None:
|
|
509
|
-
"""Register parser with bridge"""
|
|
510
|
-
if not self.bridge_client:
|
|
511
|
-
return
|
|
512
|
-
|
|
513
|
-
parser_info = await self.bridge_client.register_parser(
|
|
514
|
-
metadata={
|
|
515
|
-
"driver_version": "4.0.0",
|
|
516
|
-
"capabilities": "scraping,html_cleaning,llm_integration",
|
|
517
|
-
"managers": "config,result,error,logging,html,browser"
|
|
518
|
-
}
|
|
519
|
-
)
|
|
520
|
-
|
|
521
|
-
# Update parser ID
|
|
522
|
-
self.config.parser_config.parser_id = parser_info.parser_id
|
|
523
|
-
self._stats.parser_id = parser_info.parser_id
|
|
524
|
-
|
|
525
|
-
self.logging_manager.info(f"📝 Parser registered: {parser_info.parser_id}")
|
|
526
|
-
|
|
370
|
+
|
|
527
371
|
def _setup_retry_configs(self) -> None:
|
|
528
372
|
"""Setup retry configurations for different operations"""
|
|
529
373
|
# Navigation retry config
|
|
530
|
-
nav_config = RetryConfig(
|
|
531
|
-
max_attempts=3,
|
|
532
|
-
base_delay=2.0,
|
|
533
|
-
retry_on_exceptions=["NavigationError", "TimeoutError", "ConnectionError"]
|
|
534
|
-
)
|
|
374
|
+
nav_config = RetryConfig(max_attempts=3, base_delay=2.0, retry_on_exceptions=["NavigationError", "TimeoutError", "ConnectionError"])
|
|
535
375
|
self.error_manager.register_retry_config("get_html", nav_config)
|
|
536
|
-
|
|
376
|
+
|
|
537
377
|
# Bridge communication retry config
|
|
538
|
-
bridge_config = RetryConfig(
|
|
539
|
-
max_attempts=2,
|
|
540
|
-
base_delay=1.0,
|
|
541
|
-
retry_on_exceptions=["ConnectionError", "TimeoutError"]
|
|
542
|
-
)
|
|
378
|
+
bridge_config = RetryConfig(max_attempts=2, base_delay=1.0, retry_on_exceptions=["ConnectionError", "TimeoutError"])
|
|
543
379
|
self.error_manager.register_retry_config("analyze_html", bridge_config)
|
|
544
|
-
|
|
380
|
+
|
|
545
381
|
def _update_session_stats(self) -> None:
|
|
546
382
|
"""Update session statistics"""
|
|
547
383
|
self._stats.session_duration = (datetime.now(timezone.utc) - self._stats.session_start).total_seconds()
|
|
548
|
-
|
|
384
|
+
|
|
549
385
|
total_operations = self._stats.operations_completed + self._stats.operations_failed
|
|
550
386
|
if total_operations > 0:
|
|
551
387
|
self._stats.success_rate = (self._stats.operations_completed / total_operations) * 100.0
|
|
552
|
-
|
|
388
|
+
|
|
553
389
|
# ==========================================
|
|
554
390
|
# CONTEXT MANAGER SUPPORT
|
|
555
391
|
# ==========================================
|
|
556
|
-
|
|
392
|
+
|
|
557
393
|
async def __aenter__(self):
|
|
558
394
|
"""Async context manager entry"""
|
|
559
395
|
await self.initialize()
|
|
560
396
|
return self
|
|
561
|
-
|
|
397
|
+
|
|
562
398
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
563
399
|
"""Async context manager exit"""
|
|
564
400
|
await self.cleanup()
|
|
565
401
|
return False
|
|
566
|
-
|
|
402
|
+
|
|
403
|
+
# ==========================================
|
|
404
|
+
# LOGGING CONVENIENCE
|
|
405
|
+
# ==========================================
|
|
406
|
+
|
|
407
|
+
def set_session_id(self, session_id: str):
|
|
408
|
+
"""Set session ID for both internal tracking and logger"""
|
|
409
|
+
self._session_id = session_id
|
|
410
|
+
self.logger.set_session(session_id)
|
|
411
|
+
|
|
412
|
+
async def flush_logs(self):
|
|
413
|
+
"""Force flush all accumulated logs"""
|
|
414
|
+
await self.logger.flush()
|
|
415
|
+
|
|
567
416
|
def __repr__(self) -> str:
|
|
568
417
|
return f"<ParserManager(id='{self.config.parser_config.parser_id}', name='{self.config.parser_config.parser_name}')>"
|
|
569
418
|
|
|
@@ -572,55 +421,36 @@ class ParserManager:
|
|
|
572
421
|
# CONVENIENCE FUNCTIONS
|
|
573
422
|
# ==========================================
|
|
574
423
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
parser_type: str = "generic",
|
|
578
|
-
**kwargs
|
|
579
|
-
) -> ParserManager:
|
|
424
|
+
|
|
425
|
+
def get_parser_manager(parser_name: str, parser_type: str = "generic", **kwargs) -> ParserManager:
|
|
580
426
|
"""
|
|
581
427
|
Get a parser manager instance with minimal configuration
|
|
582
|
-
|
|
428
|
+
|
|
583
429
|
Args:
|
|
584
430
|
parser_name: Name of the parser
|
|
585
431
|
parser_type: Type of parser (generic, ecommerce, news, etc.)
|
|
586
432
|
**kwargs: Additional configuration options
|
|
587
|
-
|
|
433
|
+
|
|
588
434
|
Returns:
|
|
589
435
|
Configured ParserManager instance
|
|
590
436
|
"""
|
|
591
|
-
parser_config = ParserConfig(
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
)
|
|
596
|
-
|
|
597
|
-
# Create logging config with parser name
|
|
598
|
-
logging_config = LoggingConfig(parser_name=parser_name)
|
|
599
|
-
|
|
600
|
-
config = ParserManagerConfig(
|
|
601
|
-
parser_config=parser_config,
|
|
602
|
-
logging_config=logging_config,
|
|
603
|
-
**{k: v for k, v in kwargs.items() if k in ParserManagerConfig.model_fields and k not in ['parser_config', 'logging_config']}
|
|
604
|
-
)
|
|
605
|
-
|
|
437
|
+
parser_config = ParserConfig(parser_name=parser_name, parser_type=parser_type, **{k: v for k, v in kwargs.items() if k in ParserConfig.model_fields})
|
|
438
|
+
|
|
439
|
+
config = ParserManagerConfig(parser_config=parser_config, **{k: v for k, v in kwargs.items() if k in ParserManagerConfig.model_fields and k not in ["parser_config"]})
|
|
440
|
+
|
|
606
441
|
return ParserManager(config)
|
|
607
442
|
|
|
608
443
|
|
|
609
|
-
async def quick_parse(
|
|
610
|
-
url: str,
|
|
611
|
-
parser_name: str = "QuickParser",
|
|
612
|
-
instructions: Optional[str] = None,
|
|
613
|
-
**kwargs
|
|
614
|
-
) -> dict[str, str]:
|
|
444
|
+
async def quick_parse(url: str, parser_name: str = "QuickParser", instructions: Optional[str] = None, **kwargs) -> HTMLParseResult:
|
|
615
445
|
"""
|
|
616
446
|
Quick parsing convenience function
|
|
617
|
-
|
|
447
|
+
|
|
618
448
|
Args:
|
|
619
449
|
url: URL to parse
|
|
620
450
|
parser_name: Name for the parser
|
|
621
451
|
instructions: Optional parsing instructions
|
|
622
452
|
**kwargs: Additional configuration
|
|
623
|
-
|
|
453
|
+
|
|
624
454
|
Returns:
|
|
625
455
|
Parsing result
|
|
626
456
|
"""
|