unrealon 1.1.1__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. unrealon/__init__.py +16 -6
  2. unrealon-1.1.4.dist-info/METADATA +658 -0
  3. unrealon-1.1.4.dist-info/RECORD +54 -0
  4. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/entry_points.txt +1 -1
  5. unrealon_browser/__init__.py +3 -6
  6. unrealon_browser/core/browser_manager.py +86 -84
  7. unrealon_browser/dto/models/config.py +2 -0
  8. unrealon_browser/managers/captcha.py +165 -185
  9. unrealon_browser/managers/cookies.py +57 -28
  10. unrealon_browser/managers/logger_bridge.py +94 -34
  11. unrealon_browser/managers/profile.py +186 -158
  12. unrealon_browser/managers/stealth.py +58 -47
  13. unrealon_driver/__init__.py +8 -21
  14. unrealon_driver/exceptions.py +5 -0
  15. unrealon_driver/html_analyzer/__init__.py +32 -0
  16. unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
  17. unrealon_driver/html_analyzer/config.py +64 -0
  18. unrealon_driver/html_analyzer/manager.py +247 -0
  19. unrealon_driver/html_analyzer/models.py +115 -0
  20. unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
  21. unrealon_driver/models/__init__.py +31 -0
  22. unrealon_driver/models/websocket.py +98 -0
  23. unrealon_driver/parser/__init__.py +4 -23
  24. unrealon_driver/parser/cli_manager.py +6 -5
  25. unrealon_driver/parser/daemon_manager.py +242 -66
  26. unrealon_driver/parser/managers/__init__.py +0 -21
  27. unrealon_driver/parser/managers/config.py +15 -3
  28. unrealon_driver/parser/parser_manager.py +225 -395
  29. unrealon_driver/smart_logging/__init__.py +24 -0
  30. unrealon_driver/smart_logging/models.py +44 -0
  31. unrealon_driver/smart_logging/smart_logger.py +406 -0
  32. unrealon_driver/smart_logging/unified_logger.py +525 -0
  33. unrealon_driver/websocket/__init__.py +31 -0
  34. unrealon_driver/websocket/client.py +249 -0
  35. unrealon_driver/websocket/config.py +188 -0
  36. unrealon_driver/websocket/manager.py +90 -0
  37. unrealon-1.1.1.dist-info/METADATA +0 -722
  38. unrealon-1.1.1.dist-info/RECORD +0 -82
  39. unrealon_bridge/__init__.py +0 -114
  40. unrealon_bridge/cli.py +0 -316
  41. unrealon_bridge/client/__init__.py +0 -93
  42. unrealon_bridge/client/base.py +0 -78
  43. unrealon_bridge/client/commands.py +0 -89
  44. unrealon_bridge/client/connection.py +0 -90
  45. unrealon_bridge/client/events.py +0 -65
  46. unrealon_bridge/client/health.py +0 -38
  47. unrealon_bridge/client/html_parser.py +0 -146
  48. unrealon_bridge/client/logging.py +0 -139
  49. unrealon_bridge/client/proxy.py +0 -70
  50. unrealon_bridge/client/scheduler.py +0 -450
  51. unrealon_bridge/client/session.py +0 -70
  52. unrealon_bridge/configs/__init__.py +0 -14
  53. unrealon_bridge/configs/bridge_config.py +0 -212
  54. unrealon_bridge/configs/bridge_config.yaml +0 -39
  55. unrealon_bridge/models/__init__.py +0 -138
  56. unrealon_bridge/models/base.py +0 -28
  57. unrealon_bridge/models/command.py +0 -41
  58. unrealon_bridge/models/events.py +0 -40
  59. unrealon_bridge/models/html_parser.py +0 -79
  60. unrealon_bridge/models/logging.py +0 -55
  61. unrealon_bridge/models/parser.py +0 -63
  62. unrealon_bridge/models/proxy.py +0 -41
  63. unrealon_bridge/models/requests.py +0 -95
  64. unrealon_bridge/models/responses.py +0 -88
  65. unrealon_bridge/models/scheduler.py +0 -592
  66. unrealon_bridge/models/session.py +0 -28
  67. unrealon_bridge/server/__init__.py +0 -91
  68. unrealon_bridge/server/base.py +0 -171
  69. unrealon_bridge/server/handlers/__init__.py +0 -23
  70. unrealon_bridge/server/handlers/command.py +0 -110
  71. unrealon_bridge/server/handlers/html_parser.py +0 -139
  72. unrealon_bridge/server/handlers/logging.py +0 -95
  73. unrealon_bridge/server/handlers/parser.py +0 -95
  74. unrealon_bridge/server/handlers/proxy.py +0 -75
  75. unrealon_bridge/server/handlers/scheduler.py +0 -545
  76. unrealon_bridge/server/handlers/session.py +0 -66
  77. unrealon_driver/browser/__init__.py +0 -8
  78. unrealon_driver/browser/config.py +0 -74
  79. unrealon_driver/browser/manager.py +0 -416
  80. unrealon_driver/parser/managers/browser.py +0 -51
  81. unrealon_driver/parser/managers/logging.py +0 -609
  82. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/WHEEL +0 -0
  83. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/licenses/LICENSE +0 -0
@@ -9,110 +9,75 @@ Strict compliance with CRITICAL_REQUIREMENTS.md:
9
9
  - No try blocks in imports
10
10
  """
11
11
 
12
- import asyncio
13
12
  from datetime import datetime, timezone
14
- from typing import Optional, List, Union, Any
15
- from pathlib import Path
16
- from pydantic import BaseModel, Field, ConfigDict, field_validator
13
+ from typing import Optional
14
+ from pydantic import BaseModel, Field, ConfigDict
17
15
 
18
- from unrealon_bridge import ParserBridgeClient
19
- from unrealon_rpc.logging import get_logger
16
+ from .managers import ConfigManager, ParserConfig, ResultManager, ErrorManager, RetryConfig
20
17
 
21
- from .managers import (
22
- ConfigManager, ParserConfig,
23
- ResultManager, ParseResult, ParseMetrics,
24
- ErrorManager, RetryConfig, ErrorInfo,
25
- LoggingManager, LoggingConfig, LogLevel,
26
- HTMLManager, HTMLCleaningConfig,
27
- BrowserManager, BrowserConfig
28
- )
18
+ # from unrealon_browser import BrowserManager, BrowserConfig # Temporary comment to avoid circular import
19
+
20
+ # Import UnifiedLogger and HTML Analyzer
21
+ from unrealon_driver.smart_logging import create_unified_logger, LogLevel
22
+ from unrealon_driver.html_analyzer import create_html_analyzer, HTMLCleaningConfig, HTMLParseResult
23
+ from unrealon_driver.websocket import websocket_manager, WebSocketConfig
24
+ from unrealon_browser.core import BrowserManager
25
+ from unrealon_browser.dto.models.config import BrowserConfig
29
26
 
30
27
 
31
28
  class ParserManagerConfig(BaseModel):
32
29
  """Complete parser manager configuration"""
33
- model_config = ConfigDict(
34
- validate_assignment=True,
35
- extra="forbid"
36
- )
37
-
30
+
31
+ model_config = ConfigDict(validate_assignment=True, extra="forbid")
32
+
38
33
  # Core configuration
39
- parser_config: ParserConfig = Field(
40
- default_factory=ParserConfig,
41
- description="Core parser configuration"
42
- )
43
-
44
- # Manager configurations
45
- logging_config: LoggingConfig = Field(
46
- default_factory=LoggingConfig,
47
- description="Logging configuration"
48
- )
49
- html_config: HTMLCleaningConfig = Field(
50
- default_factory=HTMLCleaningConfig,
51
- description="HTML cleaning configuration"
52
- )
53
- browser_config: BrowserConfig = Field(
54
- default_factory=BrowserConfig,
55
- description="Browser configuration"
56
- )
57
- retry_config: RetryConfig = Field(
58
- default_factory=RetryConfig,
59
- description="Retry configuration"
60
- )
61
-
34
+ parser_config: ParserConfig = Field(default_factory=ParserConfig, description="Core parser configuration")
35
+
36
+ # Logging configuration (simplified)
37
+ console_enabled: bool = Field(default=True, description="Enable console logging")
38
+ file_enabled: bool = Field(default=True, description="Enable file logging")
39
+ console_level: LogLevel = Field(default=LogLevel.INFO, description="Console log level")
40
+ file_level: LogLevel = Field(default=LogLevel.DEBUG, description="File log level")
41
+ html_config: HTMLCleaningConfig = Field(default_factory=HTMLCleaningConfig, description="HTML cleaning configuration")
42
+ retry_config: RetryConfig = Field(default_factory=RetryConfig, description="Retry configuration")
43
+
62
44
  # Bridge settings
63
- bridge_enabled: bool = Field(
64
- default=True,
65
- description="Enable bridge connection"
66
- )
67
- auto_register: bool = Field(
68
- default=True,
69
- description="Auto-register parser with bridge"
70
- )
71
-
72
- def model_post_init(self, __context) -> None:
73
- """Sync configurations across managers"""
74
- # Sync parser name across all configs
75
- parser_name = self.parser_config.parser_name
76
- if hasattr(self.logging_config, 'parser_name'):
77
- self.logging_config.parser_name = parser_name
78
-
79
- # Sync system directories
80
- system_dir = self.parser_config.system_dir
81
- if system_dir:
82
- self.logging_config.log_dir = system_dir / "logs"
83
- self.browser_config.screenshots_dir = system_dir / "screenshots"
84
- self.browser_config.cookies_file = system_dir / "cookies.json"
45
+ bridge_enabled: bool = Field(default=True, description="Enable bridge connection")
46
+ auto_register: bool = Field(default=True, description="Auto-register parser with bridge")
47
+
48
+ # SmartLogger settings
49
+ bridge_logs_url: Optional[str] = Field(default=None, description="Bridge logs WebSocket URL (ws://localhost:8001/logs)")
50
+ log_batch_interval: float = Field(default=5.0, description="Log batch interval in seconds")
51
+ daemon_mode: Optional[bool] = Field(default=None, description="Daemon mode for logging (None = auto-detect)")
85
52
 
86
53
 
87
54
  class ParserStats(BaseModel):
88
55
  """Comprehensive parser statistics"""
89
- model_config = ConfigDict(
90
- validate_assignment=True,
91
- extra="forbid"
92
- )
93
-
56
+
57
+ model_config = ConfigDict(validate_assignment=True, extra="forbid")
58
+
94
59
  parser_id: str = Field(...)
95
60
  parser_name: str = Field(...)
96
61
  session_id: Optional[str] = Field(default=None)
97
-
62
+
98
63
  # Timing
99
64
  session_start: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
100
65
  session_duration: float = Field(default=0.0, ge=0.0)
101
-
66
+
102
67
  # Operations
103
68
  operations_completed: int = Field(default=0, ge=0)
104
69
  operations_failed: int = Field(default=0, ge=0)
105
70
  success_rate: float = Field(default=0.0, ge=0.0, le=100.0)
106
-
71
+
107
72
  # Content processing
108
73
  pages_processed: int = Field(default=0, ge=0)
109
74
  html_cleaned_count: int = Field(default=0, ge=0)
110
75
  total_html_reduction: float = Field(default=0.0, ge=0.0)
111
-
76
+
112
77
  # Errors
113
78
  total_errors: int = Field(default=0, ge=0)
114
79
  retries_attempted: int = Field(default=0, ge=0)
115
-
80
+
116
81
  # Bridge
117
82
  bridge_connected: bool = Field(default=False)
118
83
  bridge_messages_sent: int = Field(default=0, ge=0)
@@ -120,6 +85,7 @@ class ParserStats(BaseModel):
120
85
 
121
86
  class ParserManagerError(Exception):
122
87
  """Base exception for parser manager"""
88
+
123
89
  def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
124
90
  self.message = message
125
91
  self.operation = operation
@@ -129,18 +95,20 @@ class ParserManagerError(Exception):
129
95
 
130
96
  class InitializationError(ParserManagerError):
131
97
  """Raised when parser manager initialization fails"""
98
+
132
99
  pass
133
100
 
134
101
 
135
102
  class OperationError(ParserManagerError):
136
103
  """Raised when parser operation fails"""
104
+
137
105
  pass
138
106
 
139
107
 
140
108
  class ParserManager:
141
109
  """
142
110
  🚀 Parser Manager - Unified parser management system
143
-
111
+
144
112
  Features:
145
113
  - Unified Configuration: Single config for all managers
146
114
  - Automatic Lifecycle: Handles initialization, execution, cleanup
@@ -148,422 +116,303 @@ class ParserManager:
148
116
  - Performance Monitoring: Comprehensive statistics and metrics
149
117
  - Bridge Integration: Seamless communication with Django
150
118
  - Type Safety: Full Pydantic v2 compliance
151
-
119
+
152
120
  Usage:
153
121
  config = ParserManagerConfig(
154
122
  parser_config=ParserConfig(parser_name="MyParser"),
155
123
  bridge_enabled=True
156
124
  )
157
-
125
+
158
126
  async with ParserManager(config) as parser:
159
127
  # Navigate and extract
160
128
  html = await parser.get_html("https://example.com")
161
129
  cleaned_html = await parser.clean_html(html)
162
130
  result = await parser.analyze_html(cleaned_html)
163
-
131
+
164
132
  # Results are automatically tracked
165
133
  stats = parser.get_stats()
166
134
  """
167
-
135
+
168
136
  def __init__(self, config: ParserManagerConfig):
169
137
  self.config = config
170
- self.internal_logger = get_logger()
171
-
138
+
172
139
  # Initialize managers
173
140
  self.config_manager = ConfigManager(self.config.parser_config)
174
141
  self.result_manager = ResultManager(self.config.parser_config.parser_id)
175
- self.error_manager = ErrorManager(self.internal_logger)
176
- self.logging_manager = LoggingManager(self.config.logging_config)
177
- self.html_manager = HTMLManager(self.config.html_config)
178
- self.browser_manager = BrowserManager(self.config.browser_config)
179
-
180
- # Bridge client
181
- self.bridge_client: Optional[ParserBridgeClient] = None
182
-
142
+ self.error_manager = ErrorManager()
143
+ # Initialize HTML Analyzer (WebSocket URL auto-detected)
144
+ self.html_analyzer = create_html_analyzer(parser_id=self.config.parser_config.parser_id, api_key=self.config.parser_config.api_key, cleaning_config=self.config.html_config)
145
+ # Create default browser config
146
+ browser_config = BrowserConfig(parser_name=self.config.parser_config.parser_name)
147
+ self.browser_manager = BrowserManager(browser_config, parser_id=self.config.parser_config.parser_id)
148
+
149
+ # Initialize WebSocket connection config
150
+ if self.config.bridge_logs_url:
151
+ self._websocket_config = WebSocketConfig(url=self.config.bridge_logs_url, api_key=self.config.parser_config.api_key, parser_id=self.config.parser_config.parser_id)
152
+ else:
153
+ self._websocket_config = None
154
+
155
+ # Initialize UnifiedLogger
156
+ log_file = None
157
+ if self.config.parser_config.system_dir:
158
+ log_file = self.config.parser_config.system_dir / "logs" / f"{self.config.parser_config.parser_name}.log"
159
+
160
+ self.logger = create_unified_logger(
161
+ parser_id=self.config.parser_config.parser_id,
162
+ parser_name=self.config.parser_config.parser_name,
163
+ bridge_logs_url=self.config.bridge_logs_url,
164
+ log_file=log_file,
165
+ console_enabled=self.config.console_enabled,
166
+ file_enabled=self.config.file_enabled,
167
+ console_level=self.config.console_level,
168
+ file_level=self.config.file_level,
169
+ batch_interval=self.config.log_batch_interval,
170
+ daemon_mode=self.config.daemon_mode,
171
+ )
172
+
183
173
  # State
184
174
  self._is_initialized = False
185
175
  self._session_id: Optional[str] = None
186
- self._stats = ParserStats(
187
- parser_id=self.config.parser_config.parser_id,
188
- parser_name=self.config.parser_config.parser_name
189
- )
190
-
176
+ self._stats = ParserStats(parser_id=self.config.parser_config.parser_id, parser_name=self.config.parser_config.parser_name)
177
+
191
178
  # Register retry configurations
192
179
  self._setup_retry_configs()
193
-
180
+
194
181
  # ==========================================
195
182
  # LIFECYCLE MANAGEMENT
196
183
  # ==========================================
197
-
184
+
198
185
  async def initialize(self) -> None:
199
186
  """Initialize all managers and establish connections"""
200
187
  if self._is_initialized:
201
188
  return
202
-
189
+
203
190
  try:
204
- self.logging_manager.info("🚀 Initializing parser manager...")
205
-
206
- # Initialize bridge client
207
- if self.config.bridge_enabled:
208
- await self._initialize_bridge()
209
-
191
+ self.logger.info("🚀 Initializing parser manager...")
192
+
193
+ # Initialize WebSocket connection
194
+ if self._websocket_config:
195
+ await websocket_manager.initialize(self._websocket_config)
196
+ if websocket_manager.connected:
197
+ self.logger.info("🔌 WebSocket connected")
198
+ else:
199
+ self.logger.warning("🔌 WebSocket connection failed")
200
+
210
201
  # Initialize browser
211
- await self.browser_manager.initialize()
212
-
213
- # Update logging manager with bridge client
214
- if self.bridge_client:
215
- self.logging_manager.update_bridge_client(self.bridge_client)
216
-
217
- # Register parser if enabled
218
- if self.config.auto_register and self.bridge_client:
219
- await self._register_parser()
220
-
202
+ await self.browser_manager.initialize_async()
203
+
221
204
  self._is_initialized = True
222
- self.logging_manager.info("✅ Parser manager initialized successfully")
223
-
205
+ self.logger.info("✅ Parser manager initialized successfully")
206
+
224
207
  except Exception as e:
225
208
  self.error_manager.record_error(e, "initialization")
226
- raise InitializationError(
227
- message=f"Failed to initialize parser manager: {e}",
228
- operation="initialization"
229
- ) from e
230
-
209
+ raise InitializationError(message=f"Failed to initialize parser manager: {e}", operation="initialization") from e
210
+
231
211
  async def cleanup(self) -> None:
232
212
  """Clean up all resources"""
233
- self.logging_manager.info("🧹 Cleaning up parser manager...")
234
-
213
+ self.logger.info("🧹 Cleaning up parser manager...")
214
+
235
215
  cleanup_errors = []
236
-
216
+
237
217
  # End session if active
238
- if self._session_id and self.bridge_client:
239
- try:
240
- await self.bridge_client.end_session()
241
- except Exception as e:
242
- cleanup_errors.append(f"end_session: {e}")
243
-
218
+ if self._session_id:
219
+ await self.end_session()
220
+
244
221
  # Cleanup browser
245
222
  try:
246
- await self.browser_manager.cleanup()
223
+ await self.browser_manager.close_async()
247
224
  except Exception as e:
248
225
  cleanup_errors.append(f"browser_cleanup: {e}")
249
-
250
- # Disconnect bridge
251
- if self.bridge_client:
252
- try:
253
- await self.bridge_client.disconnect()
254
- except Exception as e:
255
- cleanup_errors.append(f"bridge_disconnect: {e}")
256
-
226
+
227
+ # Disconnect WebSocket
228
+ try:
229
+ await websocket_manager.disconnect()
230
+ except Exception as e:
231
+ cleanup_errors.append(f"websocket_disconnect: {e}")
232
+
257
233
  # Update final stats
258
234
  self._update_session_stats()
259
-
235
+
236
+ # Cleanup UnifiedLogger
237
+ try:
238
+ await self.logger.close()
239
+ except Exception as e:
240
+ cleanup_errors.append(f"logger_cleanup: {e}")
241
+
260
242
  # Log cleanup errors but don't raise
261
243
  if cleanup_errors:
262
- self.logging_manager.warning(f"Cleanup errors: {'; '.join(cleanup_errors)}")
263
-
264
- self.logging_manager.info("✅ Parser manager cleanup completed")
265
-
244
+ self.logger.warning(f"Cleanup errors: {'; '.join(cleanup_errors)}")
245
+
246
+ self.logger.info("✅ Parser manager cleanup completed")
247
+
266
248
  # ==========================================
267
249
  # CORE PARSING METHODS
268
250
  # ==========================================
269
-
251
+
270
252
  async def get_html(self, url: str) -> str:
271
253
  """Get HTML content from URL with error handling"""
272
254
  if not self._is_initialized:
273
255
  await self.initialize()
274
-
256
+
275
257
  @self.error_manager.with_retry("get_html", self.config.retry_config)
276
258
  async def _get_html_with_retry():
277
- self.logging_manager.url_access(url, "fetching")
259
+ self.logger.url_access(url, "fetching")
278
260
  html = await self.browser_manager.get_html(url)
279
261
  self._stats.pages_processed += 1
280
262
  return html
281
-
263
+
282
264
  try:
283
265
  return await _get_html_with_retry()
284
266
  except Exception as e:
285
267
  self._stats.total_errors += 1
286
- raise OperationError(
287
- message=f"Failed to get HTML from {url}: {e}",
288
- operation="get_html",
289
- details={"url": url}
290
- ) from e
291
-
292
- async def clean_html(self, html: str, **kwargs) -> str:
293
- """Clean HTML content for LLM analysis"""
294
- try:
295
- self.logging_manager.info(f"🧹 Cleaning HTML: {len(html)} characters")
296
-
297
- cleaned_html = await self.html_manager.clean_html(html, **kwargs)
298
-
299
- # Update stats
300
- self._stats.html_cleaned_count += 1
301
- stats = self.html_manager.get_cleaning_stats(html, cleaned_html)
302
- self._stats.total_html_reduction += stats.size_reduction_percent
303
-
304
- self.logging_manager.info(
305
- f"✅ HTML cleaned: {len(html)} → {len(cleaned_html)} chars "
306
- f"({stats.size_reduction_percent:.1f}% reduction)"
307
- )
308
-
309
- return cleaned_html
310
-
311
- except Exception as e:
312
- self._stats.total_errors += 1
313
- raise OperationError(
314
- message=f"Failed to clean HTML: {e}",
315
- operation="clean_html"
316
- ) from e
317
-
318
- async def analyze_html(
319
- self,
320
- html: str,
321
- instructions: Optional[str] = None,
322
- **kwargs
323
- ) -> dict[str, str]:
324
- """Analyze HTML content via bridge"""
325
- if not self.bridge_client:
326
- raise OperationError(
327
- message="Bridge client not available for HTML analysis",
328
- operation="analyze_html"
329
- )
330
-
331
- try:
332
- self.logging_manager.info("🤖 Analyzing HTML with LLM...")
333
-
334
- result = await self.bridge_client.parse_html(
335
- html_content=html,
336
- instructions=instructions,
337
- parse_type="general",
338
- timeout=kwargs.get("timeout", 60),
339
- metadata=kwargs.get("metadata", {})
340
- )
341
-
342
- return {
343
- "success": str(result.success),
344
- "parsed_data": str(result.parsed_data),
345
- "markdown": result.markdown or "",
346
- "error_message": result.error_message or ""
347
- }
348
-
349
- except Exception as e:
350
- self._stats.total_errors += 1
351
- raise OperationError(
352
- message=f"Failed to analyze HTML: {e}",
353
- operation="analyze_html"
354
- ) from e
355
-
356
- async def parse_url(
357
- self,
358
- url: str,
359
- instructions: Optional[str] = None,
360
- **kwargs
361
- ) -> dict[str, str]:
362
- """Complete parsing workflow: fetch → clean → analyze"""
268
+ raise OperationError(message=f"Failed to get HTML from {url}: {e}", operation="get_html", details={"url": url}) from e
269
+
270
+ async def parse_url(self, url: str, instructions: Optional[str] = None, **kwargs) -> HTMLParseResult:
271
+ """Complete parsing workflow: fetch → clean → analyze via HTML Analyzer"""
363
272
  operation = self.result_manager.start_operation()
364
-
273
+
365
274
  try:
366
- self.logging_manager.start_operation("parse_url")
367
-
275
+ self.logger.start_operation("parse_url")
276
+
368
277
  # Fetch HTML
369
278
  html = await self.get_html(url)
370
-
371
- # Clean HTML
372
- cleaned_html = await self.clean_html(html, **kwargs)
373
-
374
- # Analyze HTML
375
- analysis_result = await self.analyze_html(cleaned_html, instructions, **kwargs)
376
-
279
+
280
+ # Delegate complete HTML processing to HTML Analyzer
281
+ analysis_result = await self.html_analyzer.parse_html(html=html, url=url, instructions=instructions, session_id=self._session_id, **kwargs)
282
+
283
+ # Update stats from HTML Analyzer
284
+ html_stats = self.html_analyzer.get_stats()
285
+ self._stats.html_cleaned_count += html_stats.cleaned_count
286
+ self._stats.total_html_reduction += html_stats.total_reduction
287
+
377
288
  # Complete operation
378
- self.result_manager.complete_operation(
379
- data=[], # Analysis result is returned directly
380
- source_urls=[url],
381
- success=analysis_result.get("success", "false") == "true"
382
- )
383
-
384
- self._stats.operations_completed += 1
385
- self.logging_manager.end_operation("parse_url", operation.duration_seconds)
386
-
289
+ success = analysis_result.success == "true"
290
+ self.result_manager.complete_operation(data=[], source_urls=[url], success=success)
291
+
292
+ if success:
293
+ self._stats.operations_completed += 1
294
+ else:
295
+ self._stats.operations_failed += 1
296
+
297
+ self.logger.end_operation("parse_url", operation.duration_seconds)
298
+
387
299
  return analysis_result
388
-
300
+
389
301
  except Exception as e:
390
- self.result_manager.complete_operation(
391
- data=[],
392
- source_urls=[url],
393
- success=False,
394
- error_message=str(e)
395
- )
396
-
302
+ self.result_manager.complete_operation(data=[], source_urls=[url], success=False, error_message=str(e))
303
+
397
304
  self._stats.operations_failed += 1
398
- self.logging_manager.fail_operation("parse_url", str(e))
305
+ self.logger.error(f"❌ Failed parse_url: {str(e)}")
399
306
  raise
400
-
307
+
401
308
  # ==========================================
402
- # SESSION MANAGEMENT
309
+ # SESSION MANAGEMENT (Simplified - Local Only)
403
310
  # ==========================================
404
-
311
+
405
312
  async def start_session(self, session_type: str = "parsing") -> str:
406
- """Start a new parsing session"""
407
- if not self.bridge_client:
408
- raise OperationError(
409
- message="Bridge client not available for session management",
410
- operation="start_session"
411
- )
412
-
413
- try:
414
- session_id = await self.bridge_client.start_session(
415
- session_type=session_type,
416
- metadata={
417
- "parser_name": self.config.parser_config.parser_name,
418
- "parser_type": self.config.parser_config.parser_type
419
- }
420
- )
421
-
422
- self._session_id = session_id
423
- self._stats.session_id = session_id
424
- self.logging_manager.set_session(session_id)
425
-
426
- self.logging_manager.info(f"📋 Session started: {session_id}")
427
- return session_id
428
-
429
- except Exception as e:
430
- raise OperationError(
431
- message=f"Failed to start session: {e}",
432
- operation="start_session"
433
- ) from e
434
-
313
+ """Start a new parsing session (local only)"""
314
+ import uuid
315
+
316
+ session_id = f"{session_type}_{uuid.uuid4().hex[:8]}"
317
+ self._session_id = session_id
318
+ self._stats.session_id = session_id
319
+ self.logger.set_session(session_id)
320
+
321
+ self.logger.info(f"📋 Local session started: {session_id}")
322
+ return session_id
323
+
435
324
  async def end_session(self) -> None:
436
325
  """End current parsing session"""
437
- if not self._session_id or not self.bridge_client:
326
+ if not self._session_id:
438
327
  return
439
-
440
- try:
441
- await self.bridge_client.end_session()
442
- self.logging_manager.info(f"📋 Session ended: {self._session_id}")
443
- self._session_id = None
444
- self._stats.session_id = None
445
-
446
- except Exception as e:
447
- self.logging_manager.warning(f"Failed to end session: {e}")
448
-
328
+
329
+ self.logger.info(f"📋 Local session ended: {self._session_id}")
330
+ self._session_id = None
331
+ self._stats.session_id = None
332
+
449
333
  # ==========================================
450
334
  # STATISTICS AND MONITORING
451
335
  # ==========================================
452
-
336
+
453
337
  def get_stats(self) -> ParserStats:
454
338
  """Get comprehensive parser statistics"""
455
339
  self._update_session_stats()
456
340
  return ParserStats.model_validate(self._stats.model_dump())
457
-
341
+
458
342
  def get_manager_stats(self) -> dict[str, dict[str, str]]:
459
343
  """Get statistics from all managers"""
460
344
  return {
461
345
  "result_manager": self.result_manager.get_stats(),
462
346
  "error_manager": self.error_manager.get_error_stats(),
463
- "browser_manager": self.browser_manager.get_stats().model_dump(mode='json'),
464
- "logging_manager": self.logging_manager.get_log_stats()
347
+ "browser_manager": self.browser_manager.get_stats().model_dump(mode="json"),
348
+ # Logging stats removed - using UnifiedLogger now
465
349
  }
466
-
350
+
467
351
  async def health_check(self) -> dict[str, str]:
468
352
  """Comprehensive health check"""
469
- health = {
470
- "status": "healthy",
471
- "parser_id": self.config.parser_config.parser_id,
472
- "parser_name": self.config.parser_config.parser_name,
473
- "initialized": str(self._is_initialized),
474
- "session_active": str(self._session_id is not None)
475
- }
476
-
353
+ health = {"status": "healthy", "parser_id": self.config.parser_config.parser_id, "parser_name": self.config.parser_config.parser_name, "initialized": str(self._is_initialized), "session_active": str(self._session_id is not None)}
354
+
477
355
  # Check browser health
478
356
  try:
479
357
  browser_health = await self.browser_manager.health_check()
480
358
  health["browser_status"] = browser_health.get("status", "unknown")
481
359
  except Exception as e:
482
360
  health["browser_status"] = f"error: {e}"
483
-
484
- # Check bridge health
485
- if self.bridge_client:
486
- health["bridge_connected"] = "true"
487
- else:
488
- health["bridge_connected"] = "false"
489
-
361
+
362
+ # Check WebSocket connection health
363
+ health["websocket_connected"] = str(websocket_manager.connected)
364
+
490
365
  return health
491
-
366
+
492
367
  # ==========================================
493
368
  # INTERNAL METHODS
494
369
  # ==========================================
495
-
496
- async def _initialize_bridge(self) -> None:
497
- """Initialize bridge client"""
498
- self.bridge_client = ParserBridgeClient(
499
- websocket_url=self.config.parser_config.websocket_url,
500
- parser_type=self.config.parser_config.parser_type,
501
- api_key=self.config.parser_config.api_key
502
- )
503
-
504
- await self.bridge_client.bridge_client.connect()
505
- self._stats.bridge_connected = True
506
- self.logging_manager.info("🔗 Bridge client connected")
507
-
508
- async def _register_parser(self) -> None:
509
- """Register parser with bridge"""
510
- if not self.bridge_client:
511
- return
512
-
513
- parser_info = await self.bridge_client.register_parser(
514
- metadata={
515
- "driver_version": "4.0.0",
516
- "capabilities": "scraping,html_cleaning,llm_integration",
517
- "managers": "config,result,error,logging,html,browser"
518
- }
519
- )
520
-
521
- # Update parser ID
522
- self.config.parser_config.parser_id = parser_info.parser_id
523
- self._stats.parser_id = parser_info.parser_id
524
-
525
- self.logging_manager.info(f"📝 Parser registered: {parser_info.parser_id}")
526
-
370
+
527
371
  def _setup_retry_configs(self) -> None:
528
372
  """Setup retry configurations for different operations"""
529
373
  # Navigation retry config
530
- nav_config = RetryConfig(
531
- max_attempts=3,
532
- base_delay=2.0,
533
- retry_on_exceptions=["NavigationError", "TimeoutError", "ConnectionError"]
534
- )
374
+ nav_config = RetryConfig(max_attempts=3, base_delay=2.0, retry_on_exceptions=["NavigationError", "TimeoutError", "ConnectionError"])
535
375
  self.error_manager.register_retry_config("get_html", nav_config)
536
-
376
+
537
377
  # Bridge communication retry config
538
- bridge_config = RetryConfig(
539
- max_attempts=2,
540
- base_delay=1.0,
541
- retry_on_exceptions=["ConnectionError", "TimeoutError"]
542
- )
378
+ bridge_config = RetryConfig(max_attempts=2, base_delay=1.0, retry_on_exceptions=["ConnectionError", "TimeoutError"])
543
379
  self.error_manager.register_retry_config("analyze_html", bridge_config)
544
-
380
+
545
381
  def _update_session_stats(self) -> None:
546
382
  """Update session statistics"""
547
383
  self._stats.session_duration = (datetime.now(timezone.utc) - self._stats.session_start).total_seconds()
548
-
384
+
549
385
  total_operations = self._stats.operations_completed + self._stats.operations_failed
550
386
  if total_operations > 0:
551
387
  self._stats.success_rate = (self._stats.operations_completed / total_operations) * 100.0
552
-
388
+
553
389
  # ==========================================
554
390
  # CONTEXT MANAGER SUPPORT
555
391
  # ==========================================
556
-
392
+
557
393
  async def __aenter__(self):
558
394
  """Async context manager entry"""
559
395
  await self.initialize()
560
396
  return self
561
-
397
+
562
398
  async def __aexit__(self, exc_type, exc_val, exc_tb):
563
399
  """Async context manager exit"""
564
400
  await self.cleanup()
565
401
  return False
566
-
402
+
403
+ # ==========================================
404
+ # LOGGING CONVENIENCE
405
+ # ==========================================
406
+
407
+ def set_session_id(self, session_id: str):
408
+ """Set session ID for both internal tracking and logger"""
409
+ self._session_id = session_id
410
+ self.logger.set_session(session_id)
411
+
412
+ async def flush_logs(self):
413
+ """Force flush all accumulated logs"""
414
+ await self.logger.flush()
415
+
567
416
  def __repr__(self) -> str:
568
417
  return f"<ParserManager(id='{self.config.parser_config.parser_id}', name='{self.config.parser_config.parser_name}')>"
569
418
 
@@ -572,55 +421,36 @@ class ParserManager:
572
421
  # CONVENIENCE FUNCTIONS
573
422
  # ==========================================
574
423
 
575
- def get_parser_manager(
576
- parser_name: str,
577
- parser_type: str = "generic",
578
- **kwargs
579
- ) -> ParserManager:
424
+
425
+ def get_parser_manager(parser_name: str, parser_type: str = "generic", **kwargs) -> ParserManager:
580
426
  """
581
427
  Get a parser manager instance with minimal configuration
582
-
428
+
583
429
  Args:
584
430
  parser_name: Name of the parser
585
431
  parser_type: Type of parser (generic, ecommerce, news, etc.)
586
432
  **kwargs: Additional configuration options
587
-
433
+
588
434
  Returns:
589
435
  Configured ParserManager instance
590
436
  """
591
- parser_config = ParserConfig(
592
- parser_name=parser_name,
593
- parser_type=parser_type,
594
- **{k: v for k, v in kwargs.items() if k in ParserConfig.model_fields}
595
- )
596
-
597
- # Create logging config with parser name
598
- logging_config = LoggingConfig(parser_name=parser_name)
599
-
600
- config = ParserManagerConfig(
601
- parser_config=parser_config,
602
- logging_config=logging_config,
603
- **{k: v for k, v in kwargs.items() if k in ParserManagerConfig.model_fields and k not in ['parser_config', 'logging_config']}
604
- )
605
-
437
+ parser_config = ParserConfig(parser_name=parser_name, parser_type=parser_type, **{k: v for k, v in kwargs.items() if k in ParserConfig.model_fields})
438
+
439
+ config = ParserManagerConfig(parser_config=parser_config, **{k: v for k, v in kwargs.items() if k in ParserManagerConfig.model_fields and k not in ["parser_config"]})
440
+
606
441
  return ParserManager(config)
607
442
 
608
443
 
609
- async def quick_parse(
610
- url: str,
611
- parser_name: str = "QuickParser",
612
- instructions: Optional[str] = None,
613
- **kwargs
614
- ) -> dict[str, str]:
444
+ async def quick_parse(url: str, parser_name: str = "QuickParser", instructions: Optional[str] = None, **kwargs) -> HTMLParseResult:
615
445
  """
616
446
  Quick parsing convenience function
617
-
447
+
618
448
  Args:
619
449
  url: URL to parse
620
450
  parser_name: Name for the parser
621
451
  instructions: Optional parsing instructions
622
452
  **kwargs: Additional configuration
623
-
453
+
624
454
  Returns:
625
455
  Parsing result
626
456
  """