unrealon-1.1.6-py3-none-any.whl → unrealon-2.0.5-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- {unrealon-1.1.6.dist-info/licenses → unrealon-2.0.5.dist-info}/LICENSE +1 -1
- unrealon-2.0.5.dist-info/METADATA +491 -0
- unrealon-2.0.5.dist-info/RECORD +128 -0
- {unrealon-1.1.6.dist-info → unrealon-2.0.5.dist-info}/WHEEL +2 -1
- unrealon-2.0.5.dist-info/entry_points.txt +3 -0
- unrealon-2.0.5.dist-info/top_level.txt +3 -0
- unrealon_browser/__init__.py +5 -6
- unrealon_browser/cli/browser_cli.py +18 -9
- unrealon_browser/cli/interactive_mode.py +13 -4
- unrealon_browser/core/browser_manager.py +29 -16
- unrealon_browser/dto/__init__.py +21 -0
- unrealon_browser/dto/bot_detection.py +175 -0
- unrealon_browser/dto/models/config.py +9 -3
- unrealon_browser/managers/__init__.py +1 -1
- unrealon_browser/managers/logger_bridge.py +1 -4
- unrealon_browser/stealth/__init__.py +27 -0
- unrealon_browser/stealth/bypass_techniques.pyc +0 -0
- unrealon_browser/stealth/manager.pyc +0 -0
- unrealon_browser/stealth/nodriver_stealth.pyc +0 -0
- unrealon_browser/stealth/playwright_stealth.pyc +0 -0
- unrealon_browser/stealth/scanner_tester.pyc +0 -0
- unrealon_browser/stealth/undetected_chrome.pyc +0 -0
- unrealon_core/__init__.py +172 -0
- unrealon_core/config/__init__.py +16 -0
- unrealon_core/config/environment.py +151 -0
- unrealon_core/config/urls.py +94 -0
- unrealon_core/enums/__init__.py +24 -0
- unrealon_core/enums/status.py +216 -0
- unrealon_core/enums/types.py +240 -0
- unrealon_core/error_handling/__init__.py +45 -0
- unrealon_core/error_handling/circuit_breaker.py +292 -0
- unrealon_core/error_handling/error_context.py +324 -0
- unrealon_core/error_handling/recovery.py +371 -0
- unrealon_core/error_handling/retry.py +268 -0
- unrealon_core/exceptions/__init__.py +46 -0
- unrealon_core/exceptions/base.py +292 -0
- unrealon_core/exceptions/communication.py +22 -0
- unrealon_core/exceptions/driver.py +11 -0
- unrealon_core/exceptions/proxy.py +11 -0
- unrealon_core/exceptions/task.py +12 -0
- unrealon_core/exceptions/validation.py +17 -0
- unrealon_core/models/__init__.py +79 -0
- unrealon_core/models/arq_context.py +252 -0
- unrealon_core/models/arq_responses.py +125 -0
- unrealon_core/models/base.py +291 -0
- unrealon_core/models/bridge_stats.py +58 -0
- unrealon_core/models/communication.py +39 -0
- unrealon_core/models/connection_stats.py +47 -0
- unrealon_core/models/driver.py +30 -0
- unrealon_core/models/driver_details.py +98 -0
- unrealon_core/models/logging.py +28 -0
- unrealon_core/models/task.py +21 -0
- unrealon_core/models/typed_responses.py +210 -0
- unrealon_core/models/websocket/__init__.py +91 -0
- unrealon_core/models/websocket/base.py +49 -0
- unrealon_core/models/websocket/config.py +200 -0
- unrealon_core/models/websocket/driver.py +215 -0
- unrealon_core/models/websocket/errors.py +138 -0
- unrealon_core/models/websocket/heartbeat.py +100 -0
- unrealon_core/models/websocket/logging.py +261 -0
- unrealon_core/models/websocket/proxy.py +496 -0
- unrealon_core/models/websocket/tasks.py +275 -0
- unrealon_core/models/websocket/utils.py +153 -0
- unrealon_core/models/websocket_session.py +144 -0
- unrealon_core/monitoring/__init__.py +43 -0
- unrealon_core/monitoring/alerts.py +398 -0
- unrealon_core/monitoring/dashboard.py +307 -0
- unrealon_core/monitoring/health_check.py +354 -0
- unrealon_core/monitoring/metrics.py +352 -0
- unrealon_core/utils/__init__.py +11 -0
- unrealon_core/utils/time.py +61 -0
- unrealon_core/version.py +219 -0
- unrealon_driver/__init__.py +90 -51
- unrealon_driver/core_module/__init__.py +34 -0
- unrealon_driver/core_module/base.py +184 -0
- unrealon_driver/core_module/config.py +30 -0
- unrealon_driver/core_module/event_manager.py +127 -0
- unrealon_driver/core_module/protocols.py +98 -0
- unrealon_driver/core_module/registry.py +146 -0
- unrealon_driver/decorators/__init__.py +15 -0
- unrealon_driver/decorators/retry.py +117 -0
- unrealon_driver/decorators/schedule.py +137 -0
- unrealon_driver/decorators/task.py +61 -0
- unrealon_driver/decorators/timing.py +132 -0
- unrealon_driver/driver/__init__.py +20 -0
- unrealon_driver/driver/communication/__init__.py +10 -0
- unrealon_driver/driver/communication/session.py +203 -0
- unrealon_driver/driver/communication/websocket_client.py +205 -0
- unrealon_driver/driver/core/__init__.py +10 -0
- unrealon_driver/driver/core/config.py +175 -0
- unrealon_driver/driver/core/driver.py +221 -0
- unrealon_driver/driver/factory/__init__.py +9 -0
- unrealon_driver/driver/factory/manager_factory.py +130 -0
- unrealon_driver/driver/lifecycle/__init__.py +11 -0
- unrealon_driver/driver/lifecycle/daemon.py +76 -0
- unrealon_driver/driver/lifecycle/initialization.py +97 -0
- unrealon_driver/driver/lifecycle/shutdown.py +48 -0
- unrealon_driver/driver/monitoring/__init__.py +9 -0
- unrealon_driver/driver/monitoring/health.py +63 -0
- unrealon_driver/driver/utilities/__init__.py +10 -0
- unrealon_driver/driver/utilities/logging.py +51 -0
- unrealon_driver/driver/utilities/serialization.py +61 -0
- unrealon_driver/managers/__init__.py +32 -0
- unrealon_driver/managers/base.py +174 -0
- unrealon_driver/managers/browser.py +98 -0
- unrealon_driver/managers/cache.py +116 -0
- unrealon_driver/managers/http.py +107 -0
- unrealon_driver/managers/logger.py +286 -0
- unrealon_driver/managers/proxy.py +99 -0
- unrealon_driver/managers/registry.py +87 -0
- unrealon_driver/managers/threading.py +54 -0
- unrealon_driver/managers/update.py +107 -0
- unrealon_driver/utils/__init__.py +9 -0
- unrealon_driver/utils/time.py +10 -0
- unrealon-1.1.6.dist-info/METADATA +0 -625
- unrealon-1.1.6.dist-info/RECORD +0 -55
- unrealon-1.1.6.dist-info/entry_points.txt +0 -9
- unrealon_browser/managers/stealth.py +0 -388
- unrealon_driver/README.md +0 -0
- unrealon_driver/exceptions.py +0 -33
- unrealon_driver/html_analyzer/__init__.py +0 -32
- unrealon_driver/html_analyzer/cleaner.py +0 -657
- unrealon_driver/html_analyzer/config.py +0 -64
- unrealon_driver/html_analyzer/manager.py +0 -247
- unrealon_driver/html_analyzer/models.py +0 -115
- unrealon_driver/html_analyzer/websocket_analyzer.py +0 -157
- unrealon_driver/models/__init__.py +0 -31
- unrealon_driver/models/websocket.py +0 -98
- unrealon_driver/parser/__init__.py +0 -36
- unrealon_driver/parser/cli_manager.py +0 -142
- unrealon_driver/parser/daemon_manager.py +0 -403
- unrealon_driver/parser/managers/__init__.py +0 -25
- unrealon_driver/parser/managers/config.py +0 -293
- unrealon_driver/parser/managers/error.py +0 -412
- unrealon_driver/parser/managers/result.py +0 -321
- unrealon_driver/parser/parser_manager.py +0 -458
- unrealon_driver/smart_logging/__init__.py +0 -24
- unrealon_driver/smart_logging/models.py +0 -44
- unrealon_driver/smart_logging/smart_logger.py +0 -406
- unrealon_driver/smart_logging/unified_logger.py +0 -525
- unrealon_driver/websocket/__init__.py +0 -31
- unrealon_driver/websocket/client.py +0 -249
- unrealon_driver/websocket/config.py +0 -188
- unrealon_driver/websocket/manager.py +0 -90
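Version 2.0.5 is a restructuring rather than an incremental update. It introduces the new unrealon_core package (configuration, enums, error handling, exceptions, typed websocket/task models, monitoring, utilities) and reorganizes unrealon_driver around core_module, decorators, driver, and managers. The 1.1.6-era unrealon_driver modules (exceptions, html_analyzer, models, parser, smart_logging, websocket) are removed, and unrealon_browser gains a dto package while its stealth logic moves from managers/stealth.py into a dedicated stealth package whose implementation modules ship only as compiled .pyc files. The deleted sources reproduced below cover the html_analyzer package and unrealon_driver/models/__init__.py; each major file is followed by a short, hedged usage sketch (not part of the diff) illustrating the removed API.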
unrealon_driver/html_analyzer/config.py
@@ -1,64 +0,0 @@
-"""
-Configuration models for HTML Analyzer.
-"""
-
-from typing import Optional, List
-from pathlib import Path
-from pydantic import BaseModel, Field, ConfigDict
-
-from unrealon_driver.websocket import get_websocket_url
-
-
-class HTMLCleaningConfig(BaseModel):
-    """HTML cleaning configuration with strict typing"""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    # Cleaning modes
-    aggressive_cleaning: bool = Field(default=True, description="Enable aggressive cleaning")
-    preserve_js_data: bool = Field(default=True, description="Preserve JavaScript data during cleaning")
-
-    # Content preservation
-    preserve_images: bool = Field(default=False, description="Preserve image tags")
-    preserve_links: bool = Field(default=True, description="Preserve link tags")
-    preserve_forms: bool = Field(default=False, description="Preserve form elements")
-
-    # Size limits
-    max_html_size: int = Field(default=1000000, ge=1000, le=10000000, description="Maximum HTML size in characters")
-    max_text_length: int = Field(default=300, ge=50, le=1000, description="Maximum text content length per element")
-    max_url_length: int = Field(default=500, ge=100, le=2000, description="Maximum URL length")
-
-    # Noise removal
-    remove_comments: bool = Field(default=True, description="Remove HTML comments")
-    remove_scripts: bool = Field(default=True, description="Remove script tags")
-    remove_styles: bool = Field(default=True, description="Remove style tags")
-    remove_tracking: bool = Field(default=True, description="Remove tracking URLs and attributes")
-
-    # Whitespace handling
-    normalize_whitespace: bool = Field(default=True, description="Normalize whitespace")
-    remove_empty_elements: bool = Field(default=True, description="Remove empty elements")
-
-    # Custom selectors
-    noise_selectors: List[str] = Field(
-        default_factory=lambda: ['[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]', '[class*="footer"]', '[class*="header"]', '[class*="ads"]', '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'], description="CSS selectors for noise elements to remove"
-    )
-
-
-class HTMLAnalyzerConfig(BaseModel):
-    """Configuration for HTML Analyzer"""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    # Parser identity
-    parser_id: str = Field(..., min_length=1)
-
-    # Cleaning configuration
-    cleaning_config: HTMLCleaningConfig = Field(default_factory=HTMLCleaningConfig)
-
-    # WebSocket configuration (auto-detected)
-    websocket_url: Optional[str] = Field(default_factory=lambda: get_websocket_url(), description="WebSocket URL for analysis requests (auto-detected based on environment)")
-    api_key: Optional[str] = Field(default=None, description="API key for authentication")
-
-    # Analysis settings
-    default_timeout: float = Field(default=60.0, gt=0.0, description="Default analysis timeout")
-    enable_websocket_analysis: bool = Field(default=True, description="Enable WebSocket-based analysis")
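As a hedged illustration (not part of the diff) of the configuration API removed here, the snippet below constructs the two models using only the field names and constraints visible above; the parser id and override values are invented.

# Hypothetical usage sketch of the removed 1.1.6 config models (not from the diff).
from unrealon_driver.html_analyzer.config import HTMLAnalyzerConfig, HTMLCleaningConfig

# Override a few of the defaults shown above; everything else keeps its default.
cleaning = HTMLCleaningConfig(
    aggressive_cleaning=False,   # default was True
    preserve_images=True,        # default was False
    max_text_length=500,         # must stay within ge=50, le=1000
)

# parser_id is the only required field; websocket_url is auto-detected
# via get_websocket_url() unless passed explicitly.
config = HTMLAnalyzerConfig(
    parser_id="example-parser",  # invented identifier
    cleaning_config=cleaning,
    default_timeout=30.0,
)
print(config.cleaning_config.max_text_length)  # 500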
unrealon_driver/html_analyzer/manager.py
@@ -1,247 +0,0 @@
-"""
-HTML Analyzer Manager - Main interface for HTML analysis operations.
-"""
-
-from typing import Optional, Tuple
-from unrealon_driver.smart_logging import create_smart_logger
-
-from .config import HTMLAnalyzerConfig, HTMLCleaningConfig
-from .cleaner import HTMLCleaner, HTMLCleaningStats
-from .websocket_analyzer import WebSocketHTMLAnalyzer
-from .models import HTMLAnalysisResult, HTMLParseResult, HTMLAnalyzerStats, HTMLAnalysisRequest, HTMLParseRequest, HTMLAnalyzerError, HTMLCleaningError, HTMLAnalysisError
-
-
-class HTMLAnalyzer:
-    """
-    🧠 HTML Analyzer - Complete HTML processing and analysis system
-
-    Features:
-    - Smart HTML cleaning with noise removal
-    - JavaScript data extraction
-    - WebSocket-based LLM analysis
-    - Token-optimized output
-    - Configurable cleaning strategies
-    """
-
-    def __init__(self, config: HTMLAnalyzerConfig):
-        self.config = config
-        self.logger = create_smart_logger(parser_id=config.parser_id)
-
-        # Initialize components
-        self.cleaner = HTMLCleaner(parser_id=config.parser_id, config=config.cleaning_config)
-
-        # Initialize WebSocket analyzer if enabled
-        if config.enable_websocket_analysis and config.websocket_url:
-            self.websocket_analyzer = WebSocketHTMLAnalyzer(config)
-        else:
-            self.websocket_analyzer = None
-
-    async def analyze_html(self, html: str, instructions: Optional[str] = None, session_id: Optional[str] = None, clean_first: bool = True, preserve_js_data: bool = True, aggressive_cleaning: bool = False, **kwargs) -> HTMLAnalysisResult:
-        """
-        Complete HTML analysis workflow
-
-        Args:
-            html: Raw HTML content
-            instructions: Analysis instructions for LLM
-            session_id: Session identifier
-            clean_first: Whether to clean HTML before analysis
-            preserve_js_data: Whether to extract JavaScript data
-            aggressive_cleaning: Whether to apply aggressive cleaning
-            **kwargs: Additional parameters
-
-        Returns:
-            Analysis result with cleaned HTML and extracted data
-        """
-        try:
-            self.logger.info("🧠 Starting HTML analysis workflow")
-
-            # Initialize result with proper typing
-            result_data = {"success": True, "original_html_size": len(html), "cleaned_html": html, "extracted_data": {}, "analysis_result": {}, "cleaning_stats": {}, "error_message": ""}
-
-            # Step 1: Clean HTML if requested
-            if clean_first:
-                cleaned_html, extracted_data = await self.cleaner.clean_html(html, preserve_js_data=preserve_js_data, aggressive_cleaning=aggressive_cleaning)
-
-                result_data["cleaned_html"] = cleaned_html
-                result_data["extracted_data"] = extracted_data
-                result_data["cleaned_html_size"] = len(cleaned_html)
-
-                # Get cleaning statistics
-                stats = self.cleaner.get_cleaning_stats(html, cleaned_html)
-                result_data["cleaning_stats"] = stats.model_dump()
-
-                self.logger.info(f"✅ HTML cleaned: {len(html)} → {len(cleaned_html)} chars")
-            else:
-                result_data["cleaned_html_size"] = len(html)
-
-            # Step 2: Perform LLM analysis via WebSocket if available
-            if self.websocket_analyzer and instructions:
-                analysis_result = await self.websocket_analyzer.analyze_html(result_data["cleaned_html"], instructions=instructions, session_id=session_id, **kwargs)
-                result_data["analysis_result"] = analysis_result
-
-                if analysis_result.get("success") == "true":
-                    self.logger.info("✅ LLM analysis completed successfully")
-                else:
-                    self.logger.warning(f"⚠️ LLM analysis failed: {analysis_result.get('error_message')}")
-            else:
-                if not self.websocket_analyzer:
-                    self.logger.info("ℹ️ WebSocket analyzer not configured - skipping LLM analysis")
-                else:
-                    self.logger.info("ℹ️ No instructions provided - skipping LLM analysis")
-
-            return HTMLAnalysisResult.model_validate(result_data)
-
-        except Exception as e:
-            self.logger.error(f"❌ HTML analysis failed: {str(e)}")
-            error_result = {"success": False, "original_html_size": len(html), "cleaned_html": "", "cleaned_html_size": 0, "extracted_data": {}, "analysis_result": {}, "cleaning_stats": {}, "error_message": str(e)}
-            return HTMLAnalysisResult.model_validate(error_result)
-
-    async def clean_html_only(self, html: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, dict[str, str]]:
-        """
-        Clean HTML without LLM analysis
-
-        Args:
-            html: Raw HTML content
-            preserve_js_data: Whether to extract JavaScript data
-            aggressive_cleaning: Whether to apply aggressive cleaning
-
-        Returns:
-            Tuple of (cleaned_html, extracted_data)
-        """
-        return await self.cleaner.clean_html(html, preserve_js_data=preserve_js_data, aggressive_cleaning=aggressive_cleaning)
-
-    async def analyze_with_llm_only(self, html: str, instructions: str, session_id: Optional[str] = None, **kwargs) -> dict[str, str]:
-        """
-        Perform LLM analysis without cleaning
-
-        Args:
-            html: HTML content (should be pre-cleaned)
-            instructions: Analysis instructions
-            session_id: Session identifier
-            **kwargs: Additional parameters
-
-        Returns:
-            LLM analysis result
-        """
-        if not self.websocket_analyzer:
-            return {"success": "false", "parsed_data": "", "markdown": "", "error_message": "WebSocket analyzer not configured"}
-
-        return await self.websocket_analyzer.analyze_html(html, instructions=instructions, session_id=session_id, **kwargs)
-
-    async def parse_html(self, html: str, url: Optional[str] = None, instructions: Optional[str] = None, session_id: Optional[str] = None, **kwargs) -> HTMLParseResult:
-        """
-        Complete HTML parsing workflow: clean → analyze.
-
-        This is the main method that should be used by ParserManager.
-        Returns standardized string-based result format.
-
-        Args:
-            html: Raw HTML content
-            url: Source URL (for logging)
-            instructions: Optional analysis instructions
-            session_id: Optional session ID
-            **kwargs: Additional parameters
-
-        Returns:
-            Standardized parsing result dictionary with string values
-        """
-        try:
-            if url:
-                self.logger.info(f"🔄 Processing HTML from {url}: {len(html)} characters")
-            else:
-                self.logger.info(f"🔄 Processing HTML: {len(html)} characters")
-
-            # Use existing analyze_html method
-            result = await self.analyze_html(html=html, instructions=instructions, session_id=session_id, **kwargs)
-
-            # Convert to standardized string format for ParserManager
-            if result.success:
-                analysis_result = result.analysis_result
-                return HTMLParseResult(success="true", parsed_data=str(analysis_result.get("parsed_data", "")), markdown=str(analysis_result.get("markdown", "")), error_message="")
-            else:
-                return HTMLParseResult(success="false", parsed_data="", markdown="", error_message=result.error_message or "Analysis failed")
-
-        except Exception as e:
-            self.logger.error(f"❌ HTML parsing failed: {str(e)}")
-            return HTMLParseResult(success="false", parsed_data="", markdown="", error_message=str(e))
-
-    def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
-        """Get cleaning statistics"""
-        return self.cleaner.get_cleaning_stats(original_html, cleaned_html)
-
-    def get_stats(self) -> HTMLAnalyzerStats:
-        """Get HTML analyzer statistics"""
-        return HTMLAnalyzerStats(cleaned_count=getattr(self.cleaner, "_cleaned_count", 0), total_reduction=getattr(self.cleaner, "_total_reduction", 0.0), websocket_enabled=self.websocket_analyzer is not None)
-
-    async def close(self):
-        """Close all resources"""
-        if self.websocket_analyzer:
-            await self.websocket_analyzer.close()
-        self.logger.info("🔌 HTML Analyzer closed")
-
-
-def create_html_analyzer(parser_id: str, websocket_url: Optional[str] = None, api_key: Optional[str] = None, cleaning_config: Optional[HTMLCleaningConfig] = None, **kwargs) -> HTMLAnalyzer:
-    """
-    Create HTML analyzer with configuration
-
-    Args:
-        parser_id: Parser identifier
-        websocket_url: WebSocket URL for LLM analysis (optional, auto-detected if not provided)
-        api_key: API key for authentication
-        cleaning_config: HTML cleaning configuration
-        **kwargs: Additional configuration options
-
-    Returns:
-        Configured HTMLAnalyzer instance
-    """
-    # Only pass websocket_url if explicitly provided, otherwise use auto-detection
-    config_kwargs = {"parser_id": parser_id, "api_key": api_key, "cleaning_config": cleaning_config or HTMLCleaningConfig(), **kwargs}
-    if websocket_url is not None:
-        config_kwargs["websocket_url"] = websocket_url
-
-    config = HTMLAnalyzerConfig(**config_kwargs)
-
-    return HTMLAnalyzer(config)
-
-
-# Convenience functions
-async def quick_analyze_html(html: str, parser_id: str, instructions: Optional[str] = None, websocket_url: Optional[str] = None, **kwargs) -> HTMLAnalysisResult:
-    """
-    Quick HTML analysis convenience function
-
-    Args:
-        html: Raw HTML content
-        instructions: Analysis instructions
-        parser_id: Parser identifier
-        websocket_url: WebSocket URL for analysis (optional, auto-detected if not provided)
-        **kwargs: Additional options
-
-    Returns:
-        Analysis result
-    """
-    analyzer = create_html_analyzer(parser_id=parser_id, websocket_url=websocket_url, **kwargs)
-
-    try:
-        return await analyzer.analyze_html(html, instructions=instructions, **kwargs)
-    finally:
-        await analyzer.close()
-
-
-async def quick_clean_html(html: str, parser_id: str, **kwargs) -> Tuple[str, dict[str, str]]:
-    """
-    Quick HTML cleaning convenience function
-
-    Args:
-        html: Raw HTML content
-        parser_id: Parser identifier
-        **kwargs: Cleaning options
-
-    Returns:
-        Tuple of (cleaned_html, extracted_data)
-    """
-    analyzer = create_html_analyzer(parser_id=parser_id)
-
-    try:
-        return await analyzer.clean_html_only(html, **kwargs)
-    finally:
-        await analyzer.close()
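The following hedged sketch (not part of the diff, identifiers invented) shows how the removed manager and its convenience functions were typically driven, based only on the signatures and return types above.

# Hypothetical usage sketch of the removed HTMLAnalyzer API (not from the diff).
import asyncio
from unrealon_driver.html_analyzer.manager import create_html_analyzer, quick_analyze_html

async def main() -> None:
    html = "<html><body><h1>Product</h1><p>Price: $10</p></body></html>"

    # One-shot convenience path: creates an analyzer, runs the full
    # clean -> (optional) LLM-analysis workflow, then closes it.
    result = await quick_analyze_html(
        html,
        parser_id="example-parser",  # invented identifier
        instructions="Extract the product name and price",
    )
    print(result.success, result.cleaned_html_size, result.cleaning_stats)

    # Longer-lived path: reuse one analyzer for several documents.
    analyzer = create_html_analyzer(parser_id="example-parser")
    try:
        cleaned, js_data = await analyzer.clean_html_only(html)
        parse = await analyzer.parse_html(cleaned, url="https://example.com")
        print(parse.success, parse.error_message)  # string "true"/"false" fields
    finally:
        await analyzer.close()

asyncio.run(main())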
unrealon_driver/html_analyzer/models.py
@@ -1,115 +0,0 @@
-"""
-HTML Analyzer Models - Pydantic v2 models for HTML analysis operations.
-
-Strict compliance with CRITICAL_REQUIREMENTS.md:
-- No Dict[str, Any] usage
-- Complete type annotations
-- Pydantic v2 models everywhere
-- Custom exception hierarchy
-- No try blocks in imports
-"""
-
-from typing import Optional
-from pydantic import BaseModel, Field, ConfigDict
-
-
-class HTMLAnalysisResult(BaseModel):
-    """Complete HTML analysis result with proper typing."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    success: bool = Field(..., description="Analysis success status")
-    original_html_size: int = Field(..., ge=0, description="Original HTML size in characters")
-    cleaned_html: str = Field(..., description="Cleaned HTML content")
-    cleaned_html_size: int = Field(..., ge=0, description="Cleaned HTML size in characters")
-    extracted_data: dict[str, str] = Field(default_factory=dict, description="Extracted JavaScript data")
-    analysis_result: dict[str, str] = Field(default_factory=dict, description="LLM analysis result")
-    cleaning_stats: dict[str, float] = Field(default_factory=dict, description="Cleaning statistics")
-    error_message: str = Field(default="", description="Error message if failed")
-
-
-class HTMLParseResult(BaseModel):
-    """Standardized HTML parsing result for ParserManager."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    success: str = Field(..., description="Success status as string (true/false)")
-    parsed_data: str = Field(..., description="Parsed data as string")
-    markdown: str = Field(..., description="Markdown representation")
-    error_message: str = Field(..., description="Error message if failed")
-
-
-class HTMLAnalyzerStats(BaseModel):
-    """HTML analyzer statistics."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    cleaned_count: int = Field(default=0, ge=0, description="Number of HTML documents cleaned")
-    total_reduction: float = Field(default=0.0, ge=0.0, description="Total size reduction percentage")
-    websocket_enabled: bool = Field(..., description="Whether WebSocket analyzer is enabled")
-
-
-class HTMLCleaningRequest(BaseModel):
-    """Request model for HTML cleaning operations."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    html: str = Field(..., min_length=1, description="HTML content to clean")
-    preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
-    aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
-
-
-class HTMLAnalysisRequest(BaseModel):
-    """Request model for HTML analysis operations."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    html: str = Field(..., min_length=1, description="HTML content to analyze")
-    instructions: Optional[str] = Field(default=None, description="Analysis instructions for LLM")
-    session_id: Optional[str] = Field(default=None, description="Session identifier")
-    url: Optional[str] = Field(default=None, description="Source URL for logging")
-    clean_first: bool = Field(default=True, description="Whether to clean HTML before analysis")
-    preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
-    aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
-
-
-class HTMLParseRequest(BaseModel):
-    """Complete HTML parsing request model."""
-
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
-
-    html: str = Field(..., min_length=1, description="HTML content to parse")
-    url: Optional[str] = Field(default=None, description="Source URL for logging")
-    instructions: Optional[str] = Field(default=None, description="Analysis instructions")
-    session_id: Optional[str] = Field(default=None, description="Session identifier")
-    clean_first: bool = Field(default=True, description="Whether to clean HTML before analysis")
-    preserve_js_data: bool = Field(default=True, description="Whether to extract JavaScript data")
-    aggressive_cleaning: bool = Field(default=False, description="Whether to apply aggressive cleaning")
-
-
-class HTMLAnalyzerError(Exception):
-    """Base exception for HTML analyzer operations."""
-
-    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
-        self.message = message
-        self.operation = operation
-        self.details = details or {}
-        super().__init__(message)
-
-
-class HTMLCleaningError(HTMLAnalyzerError):
-    """Raised when HTML cleaning fails."""
-
-    pass
-
-
-class HTMLAnalysisError(HTMLAnalyzerError):
-    """Raised when HTML analysis fails."""
-
-    pass
-
-
-class WebSocketAnalysisError(HTMLAnalyzerError):
-    """Raised when WebSocket analysis fails."""
-
-    pass
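A hedged sketch (invented values, not part of the diff) of how these strict models behaved, based on the field definitions above:

# Hypothetical sketch of the removed result models (not from the diff).
from unrealon_driver.html_analyzer.models import HTMLAnalysisResult, HTMLParseResult

# HTMLParseResult uses string "true"/"false" flags so it can be handled as a
# flat dict[str, str] by the old ParserManager, per the docstring above.
parse = HTMLParseResult(success="true", parsed_data="{}", markdown="# Page", error_message="")
assert parse.success == "true"

# HTMLAnalysisResult is the richer, typed form returned by HTMLAnalyzer.analyze_html.
analysis = HTMLAnalysisResult(
    success=True,
    original_html_size=1024,
    cleaned_html="<p>hello</p>",
    cleaned_html_size=12,
)
# extra="forbid" plus validate_assignment mean unknown fields and invalid
# re-assignments raise pydantic.ValidationError.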
unrealon_driver/html_analyzer/websocket_analyzer.py
@@ -1,157 +0,0 @@
-"""
-WebSocket HTML Analyzer - Handles HTML analysis via WebSocket communication.
-"""
-
-from typing import Optional, Dict, Any
-from unrealon_driver.smart_logging import create_smart_logger
-from unrealon_driver.websocket import websocket_manager, WebSocketConfig
-
-from .config import HTMLAnalyzerConfig
-
-
-class WebSocketHTMLAnalyzer:
-    """
-    WebSocket-based HTML analyzer that sends HTML to server for LLM analysis.
-    """
-
-    def __init__(self, config: HTMLAnalyzerConfig):
-        self.config = config
-        self.logger = create_smart_logger(parser_id=config.parser_id)
-
-        # Initialize WebSocket if configured
-        if config.websocket_url and config.enable_websocket_analysis:
-            self._websocket_config = WebSocketConfig(
-                url=config.websocket_url,
-                api_key=config.api_key,
-                parser_id=config.parser_id
-            )
-            self._websocket_initialized = False
-        else:
-            self._websocket_config = None
-            self._websocket_initialized = False
-
-    async def analyze_html(
-        self,
-        html: str,
-        instructions: Optional[str] = None,
-        session_id: Optional[str] = None,
-        **kwargs
-    ) -> Dict[str, str]:
-        """
-        Analyze HTML content via WebSocket
-
-        Args:
-            html: HTML content to analyze
-            instructions: Analysis instructions
-            session_id: Session identifier
-            **kwargs: Additional parameters
-
-        Returns:
-            Analysis result dictionary
-        """
-        if not self._websocket_config:
-            self.logger.warning("🔌 WebSocket not configured for HTML analysis")
-            return {
-                "success": "false",
-                "parsed_data": "",
-                "markdown": "",
-                "error_message": "WebSocket not configured"
-            }
-
-        try:
-            # Ensure WebSocket connection
-            if not self._websocket_initialized:
-                await self._initialize_websocket()
-
-            self.logger.info("🤖 Analyzing HTML with LLM via WebSocket...")
-
-            # Prepare analysis request
-            analysis_request = {
-                "type": "html_analysis_request",
-                "parser_id": self.config.parser_id,
-                "session_id": session_id,
-                "html_content": html,
-                "instructions": instructions or "Extract and structure the data from this HTML",
-                "parse_type": "general",
-                "timeout": kwargs.get("timeout", self.config.default_timeout),
-                "metadata": kwargs.get("metadata", {})
-            }
-
-            # Send request via WebSocket
-            if websocket_manager.connected:
-                response = await websocket_manager.send_request(
-                    analysis_request,
-                    timeout=kwargs.get("timeout", self.config.default_timeout)
-                )
-
-                if response and response.get("success"):
-                    self.logger.info("✅ HTML analysis completed successfully")
-                    return {
-                        "success": "true",
-                        "parsed_data": str(response.get("parsed_data", "")),
-                        "markdown": response.get("markdown", ""),
-                        "error_message": ""
-                    }
-                else:
-                    error_msg = response.get("error_message", "Analysis failed") if response else "No response"
-                    self.logger.error(f"❌ HTML analysis failed: {error_msg}")
-                    return {
-                        "success": "false",
-                        "parsed_data": "",
-                        "markdown": "",
-                        "error_message": error_msg
-                    }
-            else:
-                self.logger.warning("🔌 WebSocket not connected for HTML analysis")
-                return {
-                    "success": "false",
-                    "parsed_data": "",
-                    "markdown": "",
-                    "error_message": "WebSocket not connected"
-                }
-
-        except Exception as e:
-            self.logger.error(f"❌ HTML analysis failed: {str(e)}")
-            return {
-                "success": "false",
-                "parsed_data": "",
-                "markdown": "",
-                "error_message": str(e)
-            }
-
-    async def _initialize_websocket(self) -> bool:
-        """Initialize WebSocket connection"""
-        if not self._websocket_config:
-            return False
-
-        try:
-            success = await websocket_manager.initialize(self._websocket_config)
-            if success:
-                self._websocket_initialized = True
-                self.logger.info("🔌 WebSocket initialized for HTML analysis")
-            else:
-                self.logger.warning("🔌 WebSocket initialization failed")
-            return success
-        except Exception as e:
-            self.logger.error(f"❌ WebSocket initialization error: {e}")
-            return False
-
-    async def close(self):
-        """Close WebSocket connection"""
-        if self._websocket_initialized:
-            await websocket_manager.disconnect()
-            self._websocket_initialized = False
-            self.logger.info("🔌 WebSocket connection closed")
-
-
-def create_websocket_analyzer(config: HTMLAnalyzerConfig) -> WebSocketHTMLAnalyzer:
-    """
-    Create WebSocket HTML analyzer
-
-    Args:
-        config: HTML analyzer configuration
-
-    Returns:
-        Configured WebSocketHTMLAnalyzer instance
-    """
-    return WebSocketHTMLAnalyzer(config)
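A hedged sketch (not part of the diff) of the removed WebSocket analysis path; it assumes the shared websocket_manager from the likewise-removed unrealon_driver.websocket package is importable and reachable, and every concrete value is invented.

# Hypothetical sketch of the removed WebSocketHTMLAnalyzer lifecycle (not from the diff).
import asyncio
from unrealon_driver.html_analyzer.config import HTMLAnalyzerConfig
from unrealon_driver.html_analyzer.websocket_analyzer import create_websocket_analyzer

async def main() -> None:
    config = HTMLAnalyzerConfig(
        parser_id="example-parser",                 # invented identifier
        websocket_url="wss://example.invalid/ws",   # placeholder; normally auto-detected
        api_key=None,
    )
    analyzer = create_websocket_analyzer(config)
    try:
        # Lazily initializes the shared websocket_manager on first use, then
        # sends an "html_analysis_request" and returns a dict[str, str]
        # with string "true"/"false" success flags.
        result = await analyzer.analyze_html(
            "<html><body>hi</body></html>",
            instructions="Summarize this page",
            timeout=30.0,
        )
        print(result["success"], result["error_message"])
    finally:
        await analyzer.close()

asyncio.run(main())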
unrealon_driver/models/__init__.py
@@ -1,31 +0,0 @@
-"""
-Models for unrealon_driver.
-
-Pydantic v2 models for type safety and validation.
-"""
-
-from .websocket import (
-    MessageType,
-    BridgeMessageType,
-    RegistrationMessage,
-    CommandMessage,
-    CommandResponseMessage,
-    StatusMessage,
-    HeartbeatMessage,
-    BridgeRegistrationPayload,
-    BridgeMessage,
-    BridgeRegistrationMessage
-)
-
-__all__ = [
-    "MessageType",
-    "BridgeMessageType",
-    "RegistrationMessage",
-    "CommandMessage",
-    "CommandResponseMessage",
-    "StatusMessage",
-    "HeartbeatMessage",
-    "BridgeRegistrationPayload",
-    "BridgeMessage",
-    "BridgeRegistrationMessage"
-]