unrealon-1.1.1-py3-none-any.whl → unrealon-1.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. unrealon/__init__.py +16 -6
  2. unrealon-1.1.5.dist-info/METADATA +621 -0
  3. unrealon-1.1.5.dist-info/RECORD +54 -0
  4. {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/entry_points.txt +1 -1
  5. unrealon_browser/__init__.py +3 -6
  6. unrealon_browser/core/browser_manager.py +86 -84
  7. unrealon_browser/dto/models/config.py +2 -0
  8. unrealon_browser/managers/captcha.py +165 -185
  9. unrealon_browser/managers/cookies.py +57 -28
  10. unrealon_browser/managers/logger_bridge.py +94 -34
  11. unrealon_browser/managers/profile.py +186 -158
  12. unrealon_browser/managers/stealth.py +58 -47
  13. unrealon_driver/__init__.py +8 -21
  14. unrealon_driver/exceptions.py +5 -0
  15. unrealon_driver/html_analyzer/__init__.py +32 -0
  16. unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
  17. unrealon_driver/html_analyzer/config.py +64 -0
  18. unrealon_driver/html_analyzer/manager.py +247 -0
  19. unrealon_driver/html_analyzer/models.py +115 -0
  20. unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
  21. unrealon_driver/models/__init__.py +31 -0
  22. unrealon_driver/models/websocket.py +98 -0
  23. unrealon_driver/parser/__init__.py +4 -23
  24. unrealon_driver/parser/cli_manager.py +6 -5
  25. unrealon_driver/parser/daemon_manager.py +242 -66
  26. unrealon_driver/parser/managers/__init__.py +0 -21
  27. unrealon_driver/parser/managers/config.py +15 -3
  28. unrealon_driver/parser/parser_manager.py +225 -395
  29. unrealon_driver/smart_logging/__init__.py +24 -0
  30. unrealon_driver/smart_logging/models.py +44 -0
  31. unrealon_driver/smart_logging/smart_logger.py +406 -0
  32. unrealon_driver/smart_logging/unified_logger.py +525 -0
  33. unrealon_driver/websocket/__init__.py +31 -0
  34. unrealon_driver/websocket/client.py +249 -0
  35. unrealon_driver/websocket/config.py +188 -0
  36. unrealon_driver/websocket/manager.py +90 -0
  37. unrealon-1.1.1.dist-info/METADATA +0 -722
  38. unrealon-1.1.1.dist-info/RECORD +0 -82
  39. unrealon_bridge/__init__.py +0 -114
  40. unrealon_bridge/cli.py +0 -316
  41. unrealon_bridge/client/__init__.py +0 -93
  42. unrealon_bridge/client/base.py +0 -78
  43. unrealon_bridge/client/commands.py +0 -89
  44. unrealon_bridge/client/connection.py +0 -90
  45. unrealon_bridge/client/events.py +0 -65
  46. unrealon_bridge/client/health.py +0 -38
  47. unrealon_bridge/client/html_parser.py +0 -146
  48. unrealon_bridge/client/logging.py +0 -139
  49. unrealon_bridge/client/proxy.py +0 -70
  50. unrealon_bridge/client/scheduler.py +0 -450
  51. unrealon_bridge/client/session.py +0 -70
  52. unrealon_bridge/configs/__init__.py +0 -14
  53. unrealon_bridge/configs/bridge_config.py +0 -212
  54. unrealon_bridge/configs/bridge_config.yaml +0 -39
  55. unrealon_bridge/models/__init__.py +0 -138
  56. unrealon_bridge/models/base.py +0 -28
  57. unrealon_bridge/models/command.py +0 -41
  58. unrealon_bridge/models/events.py +0 -40
  59. unrealon_bridge/models/html_parser.py +0 -79
  60. unrealon_bridge/models/logging.py +0 -55
  61. unrealon_bridge/models/parser.py +0 -63
  62. unrealon_bridge/models/proxy.py +0 -41
  63. unrealon_bridge/models/requests.py +0 -95
  64. unrealon_bridge/models/responses.py +0 -88
  65. unrealon_bridge/models/scheduler.py +0 -592
  66. unrealon_bridge/models/session.py +0 -28
  67. unrealon_bridge/server/__init__.py +0 -91
  68. unrealon_bridge/server/base.py +0 -171
  69. unrealon_bridge/server/handlers/__init__.py +0 -23
  70. unrealon_bridge/server/handlers/command.py +0 -110
  71. unrealon_bridge/server/handlers/html_parser.py +0 -139
  72. unrealon_bridge/server/handlers/logging.py +0 -95
  73. unrealon_bridge/server/handlers/parser.py +0 -95
  74. unrealon_bridge/server/handlers/proxy.py +0 -75
  75. unrealon_bridge/server/handlers/scheduler.py +0 -545
  76. unrealon_bridge/server/handlers/session.py +0 -66
  77. unrealon_driver/browser/__init__.py +0 -8
  78. unrealon_driver/browser/config.py +0 -74
  79. unrealon_driver/browser/manager.py +0 -416
  80. unrealon_driver/parser/managers/browser.py +0 -51
  81. unrealon_driver/parser/managers/logging.py +0 -609
  82. {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/WHEEL +0 -0
  83. {unrealon-1.1.1.dist-info → unrealon-1.1.5.dist-info}/licenses/LICENSE +0 -0
unrealon_driver/html_analyzer/config.py
@@ -0,0 +1,64 @@
+ """
+ Configuration models for HTML Analyzer.
+ """
+
+ from typing import Optional, List
+ from pathlib import Path
+ from pydantic import BaseModel, Field, ConfigDict
+
+ from unrealon_driver.websocket import get_websocket_url
+
+
+ class HTMLCleaningConfig(BaseModel):
+     """HTML cleaning configuration with strict typing"""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     # Cleaning modes
+     aggressive_cleaning: bool = Field(default=True, description="Enable aggressive cleaning")
+     preserve_js_data: bool = Field(default=True, description="Preserve JavaScript data during cleaning")
+
+     # Content preservation
+     preserve_images: bool = Field(default=False, description="Preserve image tags")
+     preserve_links: bool = Field(default=True, description="Preserve link tags")
+     preserve_forms: bool = Field(default=False, description="Preserve form elements")
+
+     # Size limits
+     max_html_size: int = Field(default=1000000, ge=1000, le=10000000, description="Maximum HTML size in characters")
+     max_text_length: int = Field(default=300, ge=50, le=1000, description="Maximum text content length per element")
+     max_url_length: int = Field(default=500, ge=100, le=2000, description="Maximum URL length")
+
+     # Noise removal
+     remove_comments: bool = Field(default=True, description="Remove HTML comments")
+     remove_scripts: bool = Field(default=True, description="Remove script tags")
+     remove_styles: bool = Field(default=True, description="Remove style tags")
+     remove_tracking: bool = Field(default=True, description="Remove tracking URLs and attributes")
+
+     # Whitespace handling
+     normalize_whitespace: bool = Field(default=True, description="Normalize whitespace")
+     remove_empty_elements: bool = Field(default=True, description="Remove empty elements")
+
+     # Custom selectors
+     noise_selectors: List[str] = Field(
+         default_factory=lambda: ['[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]', '[class*="footer"]', '[class*="header"]', '[class*="ads"]', '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'], description="CSS selectors for noise elements to remove"
+     )
+
+
+ class HTMLAnalyzerConfig(BaseModel):
+     """Configuration for HTML Analyzer"""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     # Parser identity
+     parser_id: str = Field(..., min_length=1)
+
+     # Cleaning configuration
+     cleaning_config: HTMLCleaningConfig = Field(default_factory=HTMLCleaningConfig)
+
+     # WebSocket configuration (auto-detected)
+     websocket_url: Optional[str] = Field(default_factory=lambda: get_websocket_url(), description="WebSocket URL for analysis requests (auto-detected based on environment)")
+     api_key: Optional[str] = Field(default=None, description="API key for authentication")
+
+     # Analysis settings
+     default_timeout: float = Field(default=60.0, gt=0.0, description="Default analysis timeout")
+     enable_websocket_analysis: bool = Field(default=True, description="Enable WebSocket-based analysis")
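Reviewer note: based on the field declarations in this hunk, the new config objects would be built roughly as in the sketch below. This is not an official example from the package; the parser_id, URL, and field values are placeholders. Because `extra="forbid"` is set, unknown keyword arguments raise a validation error, and `websocket_url` falls back to `get_websocket_url()` when it is not passed explicitly.

```python
# Sketch against this diff only (not shipped with the package); values are placeholders.
from unrealon_driver.html_analyzer.config import HTMLAnalyzerConfig, HTMLCleaningConfig

cleaning = HTMLCleaningConfig(
    aggressive_cleaning=False,   # keep the conservative cleaning path
    preserve_links=True,
    max_text_length=200,
)

config = HTMLAnalyzerConfig(
    parser_id="example-parser",                # required, min_length=1
    cleaning_config=cleaning,
    websocket_url="wss://example.invalid/ws",  # overrides the auto-detected URL
    api_key=None,
)
```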
unrealon_driver/html_analyzer/manager.py
@@ -0,0 +1,247 @@
+ """
+ HTML Analyzer Manager - Main interface for HTML analysis operations.
+ """
+
+ from typing import Optional, Tuple
+ from unrealon_driver.smart_logging import create_smart_logger
+
+ from .config import HTMLAnalyzerConfig, HTMLCleaningConfig
+ from .cleaner import HTMLCleaner, HTMLCleaningStats
+ from .websocket_analyzer import WebSocketHTMLAnalyzer
+ from .models import HTMLAnalysisResult, HTMLParseResult, HTMLAnalyzerStats, HTMLAnalysisRequest, HTMLParseRequest, HTMLAnalyzerError, HTMLCleaningError, HTMLAnalysisError
+
+
+ class HTMLAnalyzer:
+     """
+     🧠 HTML Analyzer - Complete HTML processing and analysis system
+
+     Features:
+     - Smart HTML cleaning with noise removal
+     - JavaScript data extraction
+     - WebSocket-based LLM analysis
+     - Token-optimized output
+     - Configurable cleaning strategies
+     """
+
+     def __init__(self, config: HTMLAnalyzerConfig):
+         self.config = config
+         self.logger = create_smart_logger(parser_id=config.parser_id)
+
+         # Initialize components
+         self.cleaner = HTMLCleaner(parser_id=config.parser_id, config=config.cleaning_config)
+
+         # Initialize WebSocket analyzer if enabled
+         if config.enable_websocket_analysis and config.websocket_url:
+             self.websocket_analyzer = WebSocketHTMLAnalyzer(config)
+         else:
+             self.websocket_analyzer = None
+
+     async def analyze_html(self, html: str, instructions: Optional[str] = None, session_id: Optional[str] = None, clean_first: bool = True, preserve_js_data: bool = True, aggressive_cleaning: bool = False, **kwargs) -> HTMLAnalysisResult:
+         """
+         Complete HTML analysis workflow
+
+         Args:
+             html: Raw HTML content
+             instructions: Analysis instructions for LLM
+             session_id: Session identifier
+             clean_first: Whether to clean HTML before analysis
+             preserve_js_data: Whether to extract JavaScript data
+             aggressive_cleaning: Whether to apply aggressive cleaning
+             **kwargs: Additional parameters
+
+         Returns:
+             Analysis result with cleaned HTML and extracted data
+         """
+         try:
+             self.logger.info("🧠 Starting HTML analysis workflow")
+
+             # Initialize result with proper typing
+             result_data = {"success": True, "original_html_size": len(html), "cleaned_html": html, "extracted_data": {}, "analysis_result": {}, "cleaning_stats": {}, "error_message": ""}
+
+             # Step 1: Clean HTML if requested
+             if clean_first:
+                 cleaned_html, extracted_data = await self.cleaner.clean_html(html, preserve_js_data=preserve_js_data, aggressive_cleaning=aggressive_cleaning)
+
+                 result_data["cleaned_html"] = cleaned_html
+                 result_data["extracted_data"] = extracted_data
+                 result_data["cleaned_html_size"] = len(cleaned_html)
+
+                 # Get cleaning statistics
+                 stats = self.cleaner.get_cleaning_stats(html, cleaned_html)
+                 result_data["cleaning_stats"] = stats.model_dump()
+
+                 self.logger.info(f"✅ HTML cleaned: {len(html)} → {len(cleaned_html)} chars")
+             else:
+                 result_data["cleaned_html_size"] = len(html)
+
+             # Step 2: Perform LLM analysis via WebSocket if available
+             if self.websocket_analyzer and instructions:
+                 analysis_result = await self.websocket_analyzer.analyze_html(result_data["cleaned_html"], instructions=instructions, session_id=session_id, **kwargs)
+                 result_data["analysis_result"] = analysis_result
+
+                 if analysis_result.get("success") == "true":
+                     self.logger.info("✅ LLM analysis completed successfully")
+                 else:
+                     self.logger.warning(f"⚠️ LLM analysis failed: {analysis_result.get('error_message')}")
+             else:
+                 if not self.websocket_analyzer:
+                     self.logger.info("ℹ️ WebSocket analyzer not configured - skipping LLM analysis")
+                 else:
+                     self.logger.info("ℹ️ No instructions provided - skipping LLM analysis")
+
+             return HTMLAnalysisResult.model_validate(result_data)
+
+         except Exception as e:
+             self.logger.error(f"❌ HTML analysis failed: {str(e)}")
+             error_result = {"success": False, "original_html_size": len(html), "cleaned_html": "", "cleaned_html_size": 0, "extracted_data": {}, "analysis_result": {}, "cleaning_stats": {}, "error_message": str(e)}
+             return HTMLAnalysisResult.model_validate(error_result)
+
+     async def clean_html_only(self, html: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, dict[str, str]]:
+         """
+         Clean HTML without LLM analysis
+
+         Args:
+             html: Raw HTML content
+             preserve_js_data: Whether to extract JavaScript data
+             aggressive_cleaning: Whether to apply aggressive cleaning
+
+         Returns:
+             Tuple of (cleaned_html, extracted_data)
+         """
+         return await self.cleaner.clean_html(html, preserve_js_data=preserve_js_data, aggressive_cleaning=aggressive_cleaning)
+
+     async def analyze_with_llm_only(self, html: str, instructions: str, session_id: Optional[str] = None, **kwargs) -> dict[str, str]:
+         """
+         Perform LLM analysis without cleaning
+
+         Args:
+             html: HTML content (should be pre-cleaned)
+             instructions: Analysis instructions
+             session_id: Session identifier
+             **kwargs: Additional parameters
+
+         Returns:
+             LLM analysis result
+         """
+         if not self.websocket_analyzer:
+             return {"success": "false", "parsed_data": "", "markdown": "", "error_message": "WebSocket analyzer not configured"}
+
+         return await self.websocket_analyzer.analyze_html(html, instructions=instructions, session_id=session_id, **kwargs)
+
+     async def parse_html(self, html: str, url: Optional[str] = None, instructions: Optional[str] = None, session_id: Optional[str] = None, **kwargs) -> HTMLParseResult:
+         """
+         Complete HTML parsing workflow: clean → analyze.
+
+         This is the main method that should be used by ParserManager.
+         Returns standardized string-based result format.
+
+         Args:
+             html: Raw HTML content
+             url: Source URL (for logging)
+             instructions: Optional analysis instructions
+             session_id: Optional session ID
+             **kwargs: Additional parameters
+
+         Returns:
+             Standardized parsing result dictionary with string values
+         """
+         try:
+             if url:
+                 self.logger.info(f"🔄 Processing HTML from {url}: {len(html)} characters")
+             else:
+                 self.logger.info(f"🔄 Processing HTML: {len(html)} characters")
+
+             # Use existing analyze_html method
+             result = await self.analyze_html(html=html, instructions=instructions, session_id=session_id, **kwargs)
+
+             # Convert to standardized string format for ParserManager
+             if result.success:
+                 analysis_result = result.analysis_result
+                 return HTMLParseResult(success="true", parsed_data=str(analysis_result.get("parsed_data", "")), markdown=str(analysis_result.get("markdown", "")), error_message="")
+             else:
+                 return HTMLParseResult(success="false", parsed_data="", markdown="", error_message=result.error_message or "Analysis failed")
+
+         except Exception as e:
+             self.logger.error(f"❌ HTML parsing failed: {str(e)}")
+             return HTMLParseResult(success="false", parsed_data="", markdown="", error_message=str(e))
+
+     def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
+         """Get cleaning statistics"""
+         return self.cleaner.get_cleaning_stats(original_html, cleaned_html)
+
+     def get_stats(self) -> HTMLAnalyzerStats:
+         """Get HTML analyzer statistics"""
+         return HTMLAnalyzerStats(cleaned_count=getattr(self.cleaner, "_cleaned_count", 0), total_reduction=getattr(self.cleaner, "_total_reduction", 0.0), websocket_enabled=self.websocket_analyzer is not None)
+
+     async def close(self):
+         """Close all resources"""
+         if self.websocket_analyzer:
+             await self.websocket_analyzer.close()
+         self.logger.info("🔌 HTML Analyzer closed")
+
+
+ def create_html_analyzer(parser_id: str, websocket_url: Optional[str] = None, api_key: Optional[str] = None, cleaning_config: Optional[HTMLCleaningConfig] = None, **kwargs) -> HTMLAnalyzer:
+     """
+     Create HTML analyzer with configuration
+
+     Args:
+         parser_id: Parser identifier
+         websocket_url: WebSocket URL for LLM analysis (optional, auto-detected if not provided)
+         api_key: API key for authentication
+         cleaning_config: HTML cleaning configuration
+         **kwargs: Additional configuration options
+
+     Returns:
+         Configured HTMLAnalyzer instance
+     """
+     # Only pass websocket_url if explicitly provided, otherwise use auto-detection
+     config_kwargs = {"parser_id": parser_id, "api_key": api_key, "cleaning_config": cleaning_config or HTMLCleaningConfig(), **kwargs}
+     if websocket_url is not None:
+         config_kwargs["websocket_url"] = websocket_url
+
+     config = HTMLAnalyzerConfig(**config_kwargs)
+
+     return HTMLAnalyzer(config)
+
+
+ # Convenience functions
+ async def quick_analyze_html(html: str, parser_id: str, instructions: Optional[str] = None, websocket_url: Optional[str] = None, **kwargs) -> HTMLAnalysisResult:
+     """
+     Quick HTML analysis convenience function
+
+     Args:
+         html: Raw HTML content
+         instructions: Analysis instructions
+         parser_id: Parser identifier
+         websocket_url: WebSocket URL for analysis (optional, auto-detected if not provided)
+         **kwargs: Additional options
+
+     Returns:
+         Analysis result
+     """
+     analyzer = create_html_analyzer(parser_id=parser_id, websocket_url=websocket_url, **kwargs)
+
+     try:
+         return await analyzer.analyze_html(html, instructions=instructions, **kwargs)
+     finally:
+         await analyzer.close()
+
+
+ async def quick_clean_html(html: str, parser_id: str, **kwargs) -> Tuple[str, dict[str, str]]:
+     """
+     Quick HTML cleaning convenience function
+
+     Args:
+         html: Raw HTML content
+         parser_id: Parser identifier
+         **kwargs: Cleaning options
+
+     Returns:
+         Tuple of (cleaned_html, extracted_data)
+     """
+     analyzer = create_html_analyzer(parser_id=parser_id)
+
+     try:
+         return await analyzer.clean_html_only(html, **kwargs)
+     finally:
+         await analyzer.close()
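Reviewer note: the public surface introduced in this file is the `HTMLAnalyzer` class plus a `create_html_analyzer` factory and the `quick_*` convenience coroutines. A hedged usage sketch against the signatures above (import path and values are assumptions drawn from this diff, not from package documentation):

```python
# Sketch only; "example-parser" and the HTML snippets are placeholders.
import asyncio
from unrealon_driver.html_analyzer.manager import create_html_analyzer

async def main() -> None:
    # websocket_url is omitted, so the config falls back to auto-detection.
    analyzer = create_html_analyzer(parser_id="example-parser")
    try:
        # Cleaning only - no WebSocket/LLM round-trip.
        cleaned_html, js_data = await analyzer.clean_html_only("<html><body>...</body></html>")
        # Full workflow: clean first, then LLM analysis if a WebSocket analyzer is configured.
        result = await analyzer.analyze_html("<html><body>...</body></html>", instructions="Extract product data")
        print(result.success, result.cleaned_html_size)
    finally:
        await analyzer.close()

asyncio.run(main())
```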
unrealon_driver/html_analyzer/models.py
@@ -0,0 +1,115 @@
+ """
+ HTML Analyzer Models - Pydantic v2 models for HTML analysis operations.
+
+ Strict compliance with CRITICAL_REQUIREMENTS.md:
+ - No Dict[str, Any] usage
+ - Complete type annotations
+ - Pydantic v2 models everywhere
+ - Custom exception hierarchy
+ - No try blocks in imports
+ """
+
+ from typing import Optional
+ from pydantic import BaseModel, Field, ConfigDict
+
+
+ class HTMLAnalysisResult(BaseModel):
+     """Complete HTML analysis result with proper typing."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     success: bool = Field(..., description="Analysis success status")
+     original_html_size: int = Field(..., ge=0, description="Original HTML size in characters")
+     cleaned_html: str = Field(..., description="Cleaned HTML content")
+     cleaned_html_size: int = Field(..., ge=0, description="Cleaned HTML size in characters")
+     extracted_data: dict[str, str] = Field(default_factory=dict, description="Extracted JavaScript data")
+     analysis_result: dict[str, str] = Field(default_factory=dict, description="LLM analysis result")
+     cleaning_stats: dict[str, float] = Field(default_factory=dict, description="Cleaning statistics")
+     error_message: str = Field(default="", description="Error message if failed")
+
+
+ class HTMLParseResult(BaseModel):
+     """Standardized HTML parsing result for ParserManager."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     success: str = Field(..., description="Success status as string (true/false)")
+     parsed_data: str = Field(..., description="Parsed data as string")
+     markdown: str = Field(..., description="Markdown representation")
+     error_message: str = Field(..., description="Error message if failed")
+
+
+ class HTMLAnalyzerStats(BaseModel):
+     """HTML analyzer statistics."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     cleaned_count: int = Field(default=0, ge=0, description="Number of HTML documents cleaned")
+     total_reduction: float = Field(default=0.0, ge=0.0, description="Total size reduction percentage")
+     websocket_enabled: bool = Field(..., description="Whether WebSocket analyzer is enabled")
+
+
+ class HTMLCleaningRequest(BaseModel):
+     """Request model for HTML cleaning operations."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     html: str = Field(..., min_length=1, description="HTML content to clean")
+     preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
+     aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
+
+
+ class HTMLAnalysisRequest(BaseModel):
+     """Request model for HTML analysis operations."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     html: str = Field(..., min_length=1, description="HTML content to analyze")
+     instructions: Optional[str] = Field(default=None, description="Analysis instructions for LLM")
+     session_id: Optional[str] = Field(default=None, description="Session identifier")
+     url: Optional[str] = Field(default=None, description="Source URL for logging")
+     clean_first: bool = Field(default=True, description="Whether to clean HTML before analysis")
+     preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
+     aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
+
+
+ class HTMLParseRequest(BaseModel):
+     """Complete HTML parsing request model."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     html: str = Field(..., min_length=1, description="HTML content to parse")
+     url: Optional[str] = Field(default=None, description="Source URL for logging")
+     instructions: Optional[str] = Field(default=None, description="Analysis instructions")
+     session_id: Optional[str] = Field(default=None, description="Session identifier")
+     clean_first: bool = Field(default=True, description="Whether to clean HTML before analysis")
+     preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
+     aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
+
+
+ class HTMLAnalyzerError(Exception):
+     """Base exception for HTML analyzer operations."""
+
+     def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
+         self.message = message
+         self.operation = operation
+         self.details = details or {}
+         super().__init__(message)
+
+
+ class HTMLCleaningError(HTMLAnalyzerError):
+     """Raised when HTML cleaning fails."""
+
+     pass
+
+
+ class HTMLAnalysisError(HTMLAnalyzerError):
+     """Raised when HTML analysis fails."""
+
+     pass
+
+
+ class WebSocketAnalysisError(HTMLAnalyzerError):
+     """Raised when WebSocket analysis fails."""
+
+     pass
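Reviewer note: unlike `HTMLAnalysisResult`, the `HTMLParseResult` model above encodes `success` as the literal strings "true"/"false" (mirroring the string-typed WebSocket payloads elsewhere in this release), so downstream checks must be string comparisons rather than truthiness tests. A small illustration with placeholder values:

```python
# Sketch only; field values are placeholders.
from unrealon_driver.html_analyzer.models import HTMLParseResult

result = HTMLParseResult(success="true", parsed_data='{"title": "demo"}', markdown="# demo", error_message="")
if result.success == "true":   # string comparison, not `if result.success:`
    print(result.parsed_data)
```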
unrealon_driver/html_analyzer/websocket_analyzer.py
@@ -0,0 +1,157 @@
+ """
+ WebSocket HTML Analyzer - Handles HTML analysis via WebSocket communication.
+ """
+
+ from typing import Optional, Dict, Any
+ from unrealon_driver.smart_logging import create_smart_logger
+ from unrealon_driver.websocket import websocket_manager, WebSocketConfig
+
+ from .config import HTMLAnalyzerConfig
+
+
+ class WebSocketHTMLAnalyzer:
+     """
+     WebSocket-based HTML analyzer that sends HTML to server for LLM analysis.
+     """
+
+     def __init__(self, config: HTMLAnalyzerConfig):
+         self.config = config
+         self.logger = create_smart_logger(parser_id=config.parser_id)
+
+         # Initialize WebSocket if configured
+         if config.websocket_url and config.enable_websocket_analysis:
+             self._websocket_config = WebSocketConfig(
+                 url=config.websocket_url,
+                 api_key=config.api_key,
+                 parser_id=config.parser_id
+             )
+             self._websocket_initialized = False
+         else:
+             self._websocket_config = None
+             self._websocket_initialized = False
+
+     async def analyze_html(
+         self,
+         html: str,
+         instructions: Optional[str] = None,
+         session_id: Optional[str] = None,
+         **kwargs
+     ) -> Dict[str, str]:
+         """
+         Analyze HTML content via WebSocket
+
+         Args:
+             html: HTML content to analyze
+             instructions: Analysis instructions
+             session_id: Session identifier
+             **kwargs: Additional parameters
+
+         Returns:
+             Analysis result dictionary
+         """
+         if not self._websocket_config:
+             self.logger.warning("🔌 WebSocket not configured for HTML analysis")
+             return {
+                 "success": "false",
+                 "parsed_data": "",
+                 "markdown": "",
+                 "error_message": "WebSocket not configured"
+             }
+
+         try:
+             # Ensure WebSocket connection
+             if not self._websocket_initialized:
+                 await self._initialize_websocket()
+
+             self.logger.info("🤖 Analyzing HTML with LLM via WebSocket...")
+
+             # Prepare analysis request
+             analysis_request = {
+                 "type": "html_analysis_request",
+                 "parser_id": self.config.parser_id,
+                 "session_id": session_id,
+                 "html_content": html,
+                 "instructions": instructions or "Extract and structure the data from this HTML",
+                 "parse_type": "general",
+                 "timeout": kwargs.get("timeout", self.config.default_timeout),
+                 "metadata": kwargs.get("metadata", {})
+             }
+
+             # Send request via WebSocket
+             if websocket_manager.connected:
+                 response = await websocket_manager.send_request(
+                     analysis_request,
+                     timeout=kwargs.get("timeout", self.config.default_timeout)
+                 )
+
+                 if response and response.get("success"):
+                     self.logger.info("✅ HTML analysis completed successfully")
+                     return {
+                         "success": "true",
+                         "parsed_data": str(response.get("parsed_data", "")),
+                         "markdown": response.get("markdown", ""),
+                         "error_message": ""
+                     }
+                 else:
+                     error_msg = response.get("error_message", "Analysis failed") if response else "No response"
+                     self.logger.error(f"❌ HTML analysis failed: {error_msg}")
+                     return {
+                         "success": "false",
+                         "parsed_data": "",
+                         "markdown": "",
+                         "error_message": error_msg
+                     }
+             else:
+                 self.logger.warning("🔌 WebSocket not connected for HTML analysis")
+                 return {
+                     "success": "false",
+                     "parsed_data": "",
+                     "markdown": "",
+                     "error_message": "WebSocket not connected"
+                 }
+
+         except Exception as e:
+             self.logger.error(f"❌ HTML analysis failed: {str(e)}")
+             return {
+                 "success": "false",
+                 "parsed_data": "",
+                 "markdown": "",
+                 "error_message": str(e)
+             }
+
+     async def _initialize_websocket(self) -> bool:
+         """Initialize WebSocket connection"""
+         if not self._websocket_config:
+             return False
+
+         try:
+             success = await websocket_manager.initialize(self._websocket_config)
+             if success:
+                 self._websocket_initialized = True
+                 self.logger.info("🔌 WebSocket initialized for HTML analysis")
+             else:
+                 self.logger.warning("🔌 WebSocket initialization failed")
+             return success
+         except Exception as e:
+             self.logger.error(f"❌ WebSocket initialization error: {e}")
+             return False
+
+     async def close(self):
+         """Close WebSocket connection"""
+         if self._websocket_initialized:
+             await websocket_manager.disconnect()
+             self._websocket_initialized = False
+             self.logger.info("🔌 WebSocket connection closed")
+
+
+ def create_websocket_analyzer(config: HTMLAnalyzerConfig) -> WebSocketHTMLAnalyzer:
+     """
+     Create WebSocket HTML analyzer
+
+     Args:
+         config: HTML analyzer configuration
+
+     Returns:
+         Configured WebSocketHTMLAnalyzer instance
+     """
+     return WebSocketHTMLAnalyzer(config)
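Reviewer note: the wire format is only implied by the client code above; there is no server-side schema in this diff. Inferring from the `analyze_html()` hunk, the request it sends and the reply shape it expects look roughly like the sketch below (placeholder values, contract assumed from this client code only):

```python
# Outbound payload as built in analyze_html() above; values are placeholders.
analysis_request = {
    "type": "html_analysis_request",
    "parser_id": "example-parser",
    "session_id": "session-123",
    "html_content": "<html><body>...</body></html>",
    "instructions": "Extract and structure the data from this HTML",
    "parse_type": "general",
    "timeout": 60.0,   # falls back to HTMLAnalyzerConfig.default_timeout
    "metadata": {},
}
# Expected reply, per the success branch above:
# {"success": ..., "parsed_data": ..., "markdown": ..., "error_message": ...}
# which analyze_html() normalizes into an all-string result dictionary.
```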
unrealon_driver/models/__init__.py
@@ -0,0 +1,31 @@
+ """
+ Models for unrealon_driver.
+
+ Pydantic v2 models for type safety and validation.
+ """
+
+ from .websocket import (
+     MessageType,
+     BridgeMessageType,
+     RegistrationMessage,
+     CommandMessage,
+     CommandResponseMessage,
+     StatusMessage,
+     HeartbeatMessage,
+     BridgeRegistrationPayload,
+     BridgeMessage,
+     BridgeRegistrationMessage
+ )
+
+ __all__ = [
+     "MessageType",
+     "BridgeMessageType",
+     "RegistrationMessage",
+     "CommandMessage",
+     "CommandResponseMessage",
+     "StatusMessage",
+     "HeartbeatMessage",
+     "BridgeRegistrationPayload",
+     "BridgeMessage",
+     "BridgeRegistrationMessage"
+ ]