unrealon 1.1.1__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +16 -6
- unrealon-1.1.4.dist-info/METADATA +658 -0
- unrealon-1.1.4.dist-info/RECORD +54 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/entry_points.txt +1 -1
- unrealon_browser/__init__.py +3 -6
- unrealon_browser/core/browser_manager.py +86 -84
- unrealon_browser/dto/models/config.py +2 -0
- unrealon_browser/managers/captcha.py +165 -185
- unrealon_browser/managers/cookies.py +57 -28
- unrealon_browser/managers/logger_bridge.py +94 -34
- unrealon_browser/managers/profile.py +186 -158
- unrealon_browser/managers/stealth.py +58 -47
- unrealon_driver/__init__.py +8 -21
- unrealon_driver/exceptions.py +5 -0
- unrealon_driver/html_analyzer/__init__.py +32 -0
- unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
- unrealon_driver/html_analyzer/config.py +64 -0
- unrealon_driver/html_analyzer/manager.py +247 -0
- unrealon_driver/html_analyzer/models.py +115 -0
- unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
- unrealon_driver/models/__init__.py +31 -0
- unrealon_driver/models/websocket.py +98 -0
- unrealon_driver/parser/__init__.py +4 -23
- unrealon_driver/parser/cli_manager.py +6 -5
- unrealon_driver/parser/daemon_manager.py +242 -66
- unrealon_driver/parser/managers/__init__.py +0 -21
- unrealon_driver/parser/managers/config.py +15 -3
- unrealon_driver/parser/parser_manager.py +225 -395
- unrealon_driver/smart_logging/__init__.py +24 -0
- unrealon_driver/smart_logging/models.py +44 -0
- unrealon_driver/smart_logging/smart_logger.py +406 -0
- unrealon_driver/smart_logging/unified_logger.py +525 -0
- unrealon_driver/websocket/__init__.py +31 -0
- unrealon_driver/websocket/client.py +249 -0
- unrealon_driver/websocket/config.py +188 -0
- unrealon_driver/websocket/manager.py +90 -0
- unrealon-1.1.1.dist-info/METADATA +0 -722
- unrealon-1.1.1.dist-info/RECORD +0 -82
- unrealon_bridge/__init__.py +0 -114
- unrealon_bridge/cli.py +0 -316
- unrealon_bridge/client/__init__.py +0 -93
- unrealon_bridge/client/base.py +0 -78
- unrealon_bridge/client/commands.py +0 -89
- unrealon_bridge/client/connection.py +0 -90
- unrealon_bridge/client/events.py +0 -65
- unrealon_bridge/client/health.py +0 -38
- unrealon_bridge/client/html_parser.py +0 -146
- unrealon_bridge/client/logging.py +0 -139
- unrealon_bridge/client/proxy.py +0 -70
- unrealon_bridge/client/scheduler.py +0 -450
- unrealon_bridge/client/session.py +0 -70
- unrealon_bridge/configs/__init__.py +0 -14
- unrealon_bridge/configs/bridge_config.py +0 -212
- unrealon_bridge/configs/bridge_config.yaml +0 -39
- unrealon_bridge/models/__init__.py +0 -138
- unrealon_bridge/models/base.py +0 -28
- unrealon_bridge/models/command.py +0 -41
- unrealon_bridge/models/events.py +0 -40
- unrealon_bridge/models/html_parser.py +0 -79
- unrealon_bridge/models/logging.py +0 -55
- unrealon_bridge/models/parser.py +0 -63
- unrealon_bridge/models/proxy.py +0 -41
- unrealon_bridge/models/requests.py +0 -95
- unrealon_bridge/models/responses.py +0 -88
- unrealon_bridge/models/scheduler.py +0 -592
- unrealon_bridge/models/session.py +0 -28
- unrealon_bridge/server/__init__.py +0 -91
- unrealon_bridge/server/base.py +0 -171
- unrealon_bridge/server/handlers/__init__.py +0 -23
- unrealon_bridge/server/handlers/command.py +0 -110
- unrealon_bridge/server/handlers/html_parser.py +0 -139
- unrealon_bridge/server/handlers/logging.py +0 -95
- unrealon_bridge/server/handlers/parser.py +0 -95
- unrealon_bridge/server/handlers/proxy.py +0 -75
- unrealon_bridge/server/handlers/scheduler.py +0 -545
- unrealon_bridge/server/handlers/session.py +0 -66
- unrealon_driver/browser/__init__.py +0 -8
- unrealon_driver/browser/config.py +0 -74
- unrealon_driver/browser/manager.py +0 -416
- unrealon_driver/parser/managers/browser.py +0 -51
- unrealon_driver/parser/managers/logging.py +0 -609
- {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/WHEEL +0 -0
- {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/licenses/LICENSE +0 -0
unrealon_bridge/server/base.py
DELETED
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Base Parser Bridge Server.
|
|
3
|
-
|
|
4
|
-
Core server functionality and state management.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import asyncio
|
|
8
|
-
from typing import Dict, Callable, Optional, List
|
|
9
|
-
from unrealon_rpc.bridge import WebSocketBridge
|
|
10
|
-
from unrealon_rpc.rpc import RPCServer
|
|
11
|
-
from unrealon_rpc.pubsub import PubSubSubscriber
|
|
12
|
-
from unrealon_rpc.logging import get_logger
|
|
13
|
-
|
|
14
|
-
from ..models import (
|
|
15
|
-
ParserInfo, ParserCommand, ParserSession, ParserEvent, ParserSystemStats
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
logger = get_logger(__name__)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class ParserBridgeServerBase:
    """
    Base parser bridge server with core functionality.

    Owns the WebSocket bridge, the RPC server and the PubSub subscriber, and
    keeps the in-memory registries (parsers, sessions, commands, proxies,
    parser-to-client mapping, custom command handlers) used by the
    specialized handlers.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379/0", rpc_channel: str = "parser_rpc", pubsub_prefix: str = "parser", **kwargs):
        """
        Initialize parser bridge server.

        Args:
            redis_url: Redis connection URL
            rpc_channel: RPC channel name
            pubsub_prefix: PubSub channel prefix
            **kwargs: Additional arguments for WebSocketBridge
        """
        self.redis_url = redis_url
        self.rpc_channel = rpc_channel
        self.pubsub_prefix = pubsub_prefix

        # Initialize bridge components
        self.bridge = WebSocketBridge(
            redis_url=redis_url,
            rpc_channel=rpc_channel,
            pubsub_prefix=pubsub_prefix,
            **kwargs
        )

        # Initialize RPC and PubSub
        self.parser_rpc = RPCServer(channel=rpc_channel, redis_url=redis_url)
        self.parser_pubsub = PubSubSubscriber(channel_prefix=pubsub_prefix, redis_url=redis_url)

        # Server state
        self.parsers: Dict[str, ParserInfo] = {}
        self.sessions: Dict[str, ParserSession] = {}
        self.commands: Dict[str, ParserCommand] = {}
        # `object` rather than the builtin function `any`, which is not a type.
        self.proxies: Dict[str, object] = {}  # Will be typed properly in proxy handler

        # Mapping between parser_id and client_id for WebSocket forwarding
        self.parser_to_client: Dict[str, str] = {}

        # Custom command handlers
        self.command_handlers: Dict[str, Callable] = {}

        # Background tasks
        self._tasks: List[asyncio.Task] = []
        self._running = False

    def get_client_by_parser_id(self, parser_id: str):
        """
        Get WebSocket client by parser_id.

        Args:
            parser_id: Parser whose mapped WebSocket client is wanted

        Returns:
            The entry of ``self.bridge.connections`` mapped to the parser, or
            None when the parser has no mapping or its client is no longer
            present in the bridge connections.
        """
        client_id = self.parser_to_client.get(parser_id)
        if client_id and client_id in self.bridge.connections:
            return self.bridge.connections[client_id]
        return None

    async def start(self) -> None:
        """Start the parser bridge server (no-op when already running)."""
        if self._running:
            return

        logger.info("Starting Parser Bridge Server...")

        # Start bridge components
        await self.bridge.start()
        await self.parser_rpc.start()
        await self.parser_pubsub.start()

        # Start background tasks
        self._tasks.append(asyncio.create_task(self._pubsub_listener()))

        self._running = True
        logger.info("Parser Bridge Server started")

    async def stop(self) -> None:
        """Stop the parser bridge server (no-op when not running)."""
        if not self._running:
            return

        logger.info("Stopping Parser Bridge Server...")

        # Cancel background tasks
        for task in self._tasks:
            task.cancel()

        # Wait for tasks to complete; return_exceptions=True keeps the
        # cancellation of a task from aborting the shutdown sequence.
        if self._tasks:
            await asyncio.gather(*self._tasks, return_exceptions=True)
            self._tasks.clear()

        # Stop bridge components in reverse start order
        await self.parser_pubsub.stop()
        await self.parser_rpc.stop()
        await self.bridge.stop()

        self._running = False
        logger.info("Parser Bridge Server stopped")

    async def _pubsub_listener(self) -> None:
        """
        Listen to parser events via PubSub.

        Registers a handler for the "parser_events" channel that validates each
        payload into a ParserEvent and passes it to ``_handle_parser_event``.
        """
        try:
            # Register handler for parser events
            @self.parser_pubsub.subscribe("parser_events")
            async def event_handler(payload: dict):
                try:
                    event = ParserEvent.model_validate(payload)
                    await self._handle_parser_event(event)
                except Exception as e:
                    # One malformed event must not stop the subscription.
                    logger.error(f"Error processing parser event: {e}")

            # Start the subscriber (this will run indefinitely)
            # NOTE(review): start() has already awaited parser_pubsub.start()
            # before this task is created, so the subscriber is started twice;
            # confirm that PubSubSubscriber.start() is idempotent.
            await self.parser_pubsub.start()

        except asyncio.CancelledError:
            logger.info("PubSub listener cancelled")
            # Propagate so the task is marked as cancelled; stop() gathers
            # with return_exceptions=True, so shutdown is unaffected.
            raise
        except Exception as e:
            logger.error(f"PubSub listener error: {e}")

    async def _handle_parser_event(self, event: "ParserEvent") -> None:
        """
        Handle parser event from PubSub.

        The base implementation only logs the event at debug level.

        Args:
            event: Parser event to handle
        """
        logger.debug(f"Parser event: {event.event_type} from {event.parser_id}")

    def register_command_handler(self, command_type: str, handler: Callable) -> None:
        """
        Register custom command handler.

        A later registration for the same command_type replaces the earlier one.

        Args:
            command_type: Type of command to handle
            handler: Async handler function
        """
        self.command_handlers[command_type] = handler
        logger.info(f"Registered command handler for: {command_type}")

    def get_parser_stats(self) -> "ParserSystemStats":
        """
        Get parser statistics.

        Returns:
            ParserSystemStats built from the current in-memory registries,
            including a per-parser_type count of registered parsers.
        """
        parser_types: Dict[str, int] = {}
        for parser in self.parsers.values():
            parser_types[parser.parser_type] = parser_types.get(parser.parser_type, 0) + 1

        # Count lazily instead of materialising a throwaway list.
        active_sessions = sum(1 for s in self.sessions.values() if s.status == "active")

        return ParserSystemStats(
            total_parsers=len(self.parsers),
            active_sessions=active_sessions,
            total_commands=len(self.commands),
            allocated_proxies=len(self.proxies),
            parser_types=parser_types
        )
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
RPC Handlers for Parser Bridge Server.
|
|
3
|
-
|
|
4
|
-
Modular handlers for different types of RPC operations.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from .parser import ParserHandlers
|
|
8
|
-
from .session import SessionHandlers
|
|
9
|
-
from .command import CommandHandlers
|
|
10
|
-
from .proxy import ProxyHandlers
|
|
11
|
-
from .html_parser import HTMLParserHandlers
|
|
12
|
-
from .logging import LoggingHandlers
|
|
13
|
-
from .scheduler import SchedulerHandlers
|
|
14
|
-
|
|
15
|
-
__all__ = [
|
|
16
|
-
"ParserHandlers",
|
|
17
|
-
"SessionHandlers",
|
|
18
|
-
"CommandHandlers",
|
|
19
|
-
"ProxyHandlers",
|
|
20
|
-
"HTMLParserHandlers",
|
|
21
|
-
"LoggingHandlers",
|
|
22
|
-
"SchedulerHandlers"
|
|
23
|
-
]
|
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Command-related RPC handlers.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import uuid
|
|
6
|
-
from unrealon_rpc.logging import get_logger
|
|
7
|
-
|
|
8
|
-
from ...models import (
|
|
9
|
-
ParserCommand, CommandResult,
|
|
10
|
-
CommandExecuteRequest, CommandExecuteResponse,
|
|
11
|
-
CommandCreateRequest, CommandCreateResponse,
|
|
12
|
-
CommandStatusRequest, CommandStatusResponse
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
logger = get_logger(__name__)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class CommandHandlers:
    """
    Handlers for command-related RPC operations.

    The methods read ``self.commands``, ``self.command_handlers`` and
    ``self.get_client_by_parser_id``, so this class must be combined with a
    class that provides them (presumably the bridge server base; verify
    against the composing class).
    """

    async def handle_command_execute(self, parser_id: str, command_type: str, parameters: dict, timeout: int = 30) -> dict:
        """
        Handle command execution.

        The command is recorded in ``self.commands`` and then either forwarded
        to the daemon connected for ``parser_id`` or, when no daemon is
        connected, passed to a locally registered handler.

        Args:
            parser_id: Target parser
            command_type: Type of command to execute
            parameters: Command parameters
            timeout: Command timeout stored on the command

        Returns:
            CommandExecuteResponse dumped as a JSON-compatible dict. The
            nested result has success=False when neither a daemon nor a local
            handler could take the command.
        """
        try:
            # Create request object for validation
            request = CommandExecuteRequest(
                parser_id=parser_id,
                command_type=command_type,
                parameters=parameters,
                timeout=timeout
            )

            command = ParserCommand(
                command_id=str(uuid.uuid4()),
                command_type=request.command_type,
                parser_id=request.parser_id,
                parameters=request.parameters,
                timeout=request.timeout
            )
            self.commands[command.command_id] = command

            command_succeeded = True

            # Forward command to daemon via WebSocket
            daemon_client = self.get_client_by_parser_id(parser_id)
            if daemon_client:
                logger.info(f"📤 Forwarding command {command.command_type} to daemon {parser_id}")
                # Send command via WebSocket
                command_message = {
                    "message_type": "command",
                    "command_id": command.command_id,
                    "command_type": command.command_type,
                    "parameters": command.parameters,
                    "parser_id": parser_id
                }
                await daemon_client.send_message(command_message)
                # For now, return mock response - daemon should respond via WebSocket later
                result_data = {
                    "command_type": command.command_type,
                    "status": "forwarded_to_daemon",
                    "parser_id": parser_id
                }
            else:
                logger.warning(f"⚠️ No daemon found for parser {parser_id}")
                # Fallback to local handlers
                handler = self.command_handlers.get(command.command_type)
                if handler:
                    logger.info(f"🔧 Using local handler for {command.command_type}")
                    result_data = await handler(command)
                else:
                    logger.warning(f"⚠️ No handler found for {command.command_type}")
                    result_data = {"error": f"No daemon connected for parser {parser_id}"}
                    # Nothing ran the command, so the result must not claim success.
                    command_succeeded = False

            result = CommandResult(
                command_id=command.command_id,
                success=command_succeeded,
                result_data=result_data,
                # NOTE(review): placeholder value, not a measured duration.
                execution_time=0.5
            )

            if command_succeeded:
                logger.info(f"Command executed: {command.command_id} ({command.command_type})")

            # The RPC call itself succeeded; the command outcome is carried
            # by result.success.
            response = CommandExecuteResponse(success=True, result=result)
            return response.model_dump(mode='json')

        except Exception as e:
            logger.error(f"Command execution failed: {e}")
            response = CommandExecuteResponse(success=False, error=str(e))
            return response.model_dump(mode='json')

    async def handle_command_create(self, request: CommandCreateRequest) -> CommandCreateResponse:
        """
        Handle command creation.

        Stores a new ParserCommand with a fresh UUID in ``self.commands``
        without executing it.

        Args:
            request: Validated command creation request

        Returns:
            CommandCreateResponse carrying the created command, or the error
            text when construction fails.
        """
        try:
            command = ParserCommand(
                command_id=str(uuid.uuid4()),
                command_type=request.command_type,
                parser_id=request.parser_id,
                parameters=request.parameters
            )
            self.commands[command.command_id] = command

            return CommandCreateResponse(success=True, command=command)
        except Exception as e:
            return CommandCreateResponse(success=False, error=str(e))

    async def handle_command_get_status(self, request: CommandStatusRequest) -> CommandStatusResponse:
        """
        Handle command status request.

        Args:
            request: Request naming the command_id to look up

        Returns:
            CommandStatusResponse with the stored command, or success=False
            when the command_id is unknown.
        """
        command = self.commands.get(request.command_id)

        if not command:
            return CommandStatusResponse(success=False, error=f"Command {request.command_id} not found")

        return CommandStatusResponse(success=True, command=command)
|
|
@@ -1,139 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
HTML Parser RPC handlers.
|
|
3
|
-
|
|
4
|
-
Clean implementation following CRITICAL_REQUIREMENTS.md:
|
|
5
|
-
- No inline imports
|
|
6
|
-
- Strict Pydantic v2 usage
|
|
7
|
-
- Complete type annotations
|
|
8
|
-
- No Dict[str, Any] usage
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import asyncio
|
|
12
|
-
import random
|
|
13
|
-
import uuid
|
|
14
|
-
from datetime import datetime
|
|
15
|
-
from typing import Optional
|
|
16
|
-
|
|
17
|
-
from unrealon_rpc.logging import get_logger
|
|
18
|
-
|
|
19
|
-
from ...models import HTMLParseRPCRequest, HTMLParseRPCResponse, HTMLParseResult
|
|
20
|
-
|
|
21
|
-
logger = get_logger(__name__)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class HTMLParserHandlers:
    """RPC handlers that accept HTML parse requests and return parse results."""

    def __init__(self) -> None:
        """Create the handler object; there is no state to set up."""

    async def handle_html_parse(self, html_content: str, parser_id: str, url: Optional[str] = None, parse_type: str = "general", instructions: Optional[str] = None, timeout: int = 60, metadata: Optional[dict] = None) -> dict:
        """
        Process one HTML parsing request.

        The arguments are validated through HTMLParseRPCRequest and the result
        currently comes from the local simulation rather than a backend call.

        Args:
            html_content: Raw HTML content to parse
            parser_id: ID of the parser making the request
            url: Source URL of the HTML (optional)
            parse_type: Type of parsing (product, listing, article, etc.)
            instructions: Additional parsing instructions (optional)
            timeout: Timeout in seconds
            metadata: Additional metadata (optional); None becomes an empty dict

        Returns:
            HTMLParseRPCResponse dumped as a JSON-compatible dict
        """
        try:
            # Building the model validates the incoming arguments.
            request = HTMLParseRPCRequest(
                html_content=html_content,
                parser_id=parser_id,
                url=url,
                parse_type=parse_type,
                instructions=instructions,
                timeout=timeout,
                metadata=metadata or {},
            )

            request_id = str(uuid.uuid4())

            logger.info(f"HTML parse request from parser {parser_id}: {len(html_content)} chars, type: {parse_type}")

            # TODO: In production, make RPC call to Django backend
            # Until then the outcome is simulated.
            result = await self._simulate_html_parsing(request)

            if result.success:
                outcome_message = "HTML parsed successfully"
            else:
                outcome_message = "HTML parsing failed"

            response = HTMLParseRPCResponse(
                success=True,
                result=result,
                request_id=request_id,
                message=outcome_message,
            )
        except Exception as e:
            logger.error(f"HTML parsing failed for parser {parser_id}: {e}")

            failure = HTMLParseRPCResponse(success=False, error=str(e), message="HTML parsing request failed")
            return failure.model_dump(mode="json")

        return response.model_dump(mode="json")

    async def _simulate_html_parsing(self, request: HTMLParseRPCRequest) -> HTMLParseResult:
        """
        Produce a simulated parsing outcome for demo purposes.

        Waits briefly, then fails at random for roughly 15% of calls and
        otherwise returns the canned sample result.

        Args:
            request: Validated HTML parse request

        Returns:
            HTMLParseResult describing success (sample data plus markdown) or
            failure (error message only)
        """
        # Short pause standing in for processing time.
        await asyncio.sleep(0.1)

        # 85% of the simulated calls succeed.
        success_rate = 0.85
        if random.random() >= success_rate:
            return HTMLParseResult(success=False, error_message="Failed to extract structured data from HTML")

        return self._create_success_result(request)

    def _create_success_result(self, request: HTMLParseRPCRequest) -> HTMLParseResult:
        """Build the canned successful result: sample data and a markdown report."""
        # Fixed sample payload; nothing here is derived from the HTML itself.
        specifications = {"year": "2020", "mileage": "45,000 km", "fuel": "Gasoline"}
        images = ["https://example.com/image1.jpg", "https://example.com/image2.jpg"]
        parsed_data = {
            "title": "Sample Product Title",
            "price": "29,900,000",
            "description": "Sample product description extracted from HTML",
            "specifications": specifications,
            "images": images,
        }

        title = parsed_data.get('title', 'N/A')
        price = parsed_data.get('price', 'N/A')
        content_size = len(request.html_content)

        # Markdown report returned alongside the data.
        markdown_instructions = f"""# HTML Parsing Results

## Extracted Data
Successfully parsed {request.parse_type} content from the provided HTML.

### Key Findings:
- **Title**: {title}
- **Price**: {price}
- **Content Size**: {content_size} characters

### Parsing Notes:
- Applied {request.parse_type} parsing rules
- Processed HTML structure successfully
- Extracted all required fields

### Recommendations:
- Data quality appears good
- Consider validating price format
- Check for additional product images

### Next Steps:
1. Validate extracted data against business rules
2. Store in appropriate database tables
3. Process for further analysis
"""

        return HTMLParseResult(success=True, parsed_data=parsed_data, markdown=markdown_instructions)
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Parser logging RPC handlers.
|
|
3
|
-
|
|
4
|
-
Handles parser log entries sent from parsers to Django.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from typing import Optional
|
|
8
|
-
from unrealon_rpc.logging import get_logger
|
|
9
|
-
|
|
10
|
-
from ...models import ParserLogEntry, ParserLogRequest, ParserLogResponse
|
|
11
|
-
|
|
12
|
-
logger = get_logger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class LoggingHandlers:
    """Handlers for parser logging RPC operations."""

    # Logger methods that a caller-supplied level is allowed to select.
    _LOCAL_LOG_LEVELS = frozenset({"debug", "info", "warning", "error", "critical"})

    def __init__(self) -> None:
        """Initialize logging handlers."""
        pass

    @staticmethod
    def _select_log_method(target_logger, level: str):
        """
        Pick the logger method matching a caller-supplied level name.

        Only the documented level names are honoured. Anything else falls
        back to ``info``, so remote input cannot select arbitrary logger
        attributes.

        Args:
            target_logger: Logger whose method is selected
            level: Level name in any letter case

        Returns:
            The bound logging method to call
        """
        method_name = level.lower()
        if method_name not in LoggingHandlers._LOCAL_LOG_LEVELS:
            return target_logger.info
        return getattr(target_logger, method_name, target_logger.info)

    async def handle_parser_log(
        self,
        parser_id: str,
        level: str,
        message: str,
        session_id: Optional[str] = None,
        command_id: Optional[str] = None,
        url: Optional[str] = None,
        operation: Optional[str] = None,
        data: Optional[dict] = None,
        error_details: Optional[str] = None
    ) -> dict:
        """
        Handle parser log entry.

        Validates the entry and echoes it to the local logger.

        Args:
            parser_id: ID of the parser sending the log
            level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            message: Log message
            session_id: Parser session ID (optional)
            command_id: Command ID if related to command (optional)
            url: URL being processed (optional)
            operation: Operation being performed (optional)
            data: Additional log data (optional)
            error_details: Error details if error log (optional)

        Returns:
            ParserLogResponse as dict with success status
        """
        try:
            # Constructed only for validation; a bad entry raises here.
            ParserLogEntry(
                parser_id=parser_id,
                level=level,
                message=message,
                session_id=session_id,
                command_id=command_id,
                url=url,
                operation=operation,
                data=data or {},
                error_details=error_details
            )

            # Log locally for debugging
            local_logger_method = self._select_log_method(logger, level)
            local_logger_method(
                f"Parser {parser_id} log: {message}",
                component="parser_log",
                operation=operation
            )

            # Django will receive this RPC call via Redis and handle the log
            # NOTE(review): nothing in this method forwards the entry itself.

            response = ParserLogResponse(
                success=True,
                message="Log entry received and forwarded to Django"
            )

            return response.model_dump(mode='json')

        except Exception as e:
            logger.error(f"Failed to handle parser log from {parser_id}: {e}")

            response = ParserLogResponse(
                success=False,
                error=str(e),
                message="Failed to process log entry"
            )

            return response.model_dump(mode='json')
|
|
94
|
-
|
|
95
|
-
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Parser-related RPC handlers.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from unrealon_rpc.logging import get_logger
|
|
6
|
-
from unrealon_bridge.configs import load_bridge_config
|
|
7
|
-
from unrealon_bridge.models import ParserInfo, ParserHealth, ParserSystemStats, ParserRegisterRequest, ParserRegisterResponse, ParserStatusRequest, ParserStatusResponse, ParserListRequest, ParserListResponse, ParserHealthRequest, ParserHealthResponse
|
|
8
|
-
|
|
9
|
-
logger = get_logger(__name__)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class ParserHandlers:
    """
    Handlers for parser-related RPC operations.

    The methods read ``self.parsers``, ``self.bridge`` and
    ``self.parser_to_client``, so this class must be combined with a class
    that provides them.
    """

    @staticmethod
    def _mask_api_key(api_key) -> str:
        """
        Return a log-safe rendering of an API key.

        Args:
            api_key: Key to render, or None

        Returns:
            "None" for a missing key, "***" for keys of 8 characters or
            fewer (a prefix would reveal the whole key), otherwise the first
            8 characters followed by "...".
        """
        if not api_key:
            return "None"
        if len(api_key) <= 8:
            return "***"
        return api_key[:8] + "..."

    async def handle_parser_register(self, parser_id: str, parser_type: str, version: str, capabilities: list, metadata: dict = None, api_key: str = None) -> dict:
        """
        Handle parser registration.

        Checks the API key when the bridge configuration requires one, stores
        the parser in ``self.parsers`` and maps it to a WebSocket client.

        Args:
            parser_id: ID of the registering parser
            parser_type: Parser type
            version: Parser version
            capabilities: Capabilities announced by the parser
            metadata: Additional metadata (optional)
            api_key: API key presented by the parser (optional)

        Returns:
            ParserRegisterResponse dumped as a JSON-compatible dict
        """
        try:
            # Load bridge configuration
            config = load_bridge_config()

            # Check if API key is required
            if config.security.require_api_key:
                if not api_key:
                    response = ParserRegisterResponse(success=False, error="API key is required")
                    return response.model_dump(mode="json")

                # Validate API key
                if not config.is_valid_api_key(api_key):
                    logger.warning(f"Invalid API key attempted: {self._mask_api_key(api_key)}")
                    response = ParserRegisterResponse(success=False, error="Invalid API key")
                    return response.model_dump(mode="json")

            # Create request object for validation
            request = ParserRegisterRequest(parser_id=parser_id, parser_type=parser_type, version=version, capabilities=capabilities, metadata=metadata)

            parser_info = ParserInfo(parser_id=request.parser_id, parser_type=request.parser_type, version=request.version, capabilities=request.capabilities, metadata=request.metadata or {})
            self.parsers[parser_info.parser_id] = parser_info

            # Find and map the most recent WebSocket client (daemon usually connects then registers immediately)
            # NOTE(review): if another client connects between this daemon's
            # connect and its register call, the parser is mapped to the
            # wrong client; confirm whether the caller's client id is available.
            if self.bridge.connections:
                # Get the most recently connected client
                latest_client_id = max(self.bridge.connections.keys(),
                                       key=lambda cid: self.bridge.connections[cid].client_info.connected_at)
                self.parser_to_client[parser_info.parser_id] = latest_client_id
                logger.info(f"🔗 Mapped parser {parser_info.parser_id} to client {latest_client_id}")
            else:
                logger.warning("⚠️ No WebSocket clients connected during parser registration")

            # Log successful registration
            api_key_display = self._mask_api_key(api_key)
            logger.info(f"Parser registered: {parser_info.parser_id} ({parser_info.parser_type}) with API key: {api_key_display}")

            # Log test key usage in development; the key is masked so the
            # secret never reaches the log output in full.
            if config.is_development() and api_key in config.security.test_api_keys:
                logger.info(f"🧪 Using test API key for development: {api_key_display}")

            response = ParserRegisterResponse(success=True, parser_id=parser_info.parser_id, message="Parser registered successfully")
            return response.model_dump(mode="json")
        except Exception as e:
            logger.error(f"Parser registration failed: {e}")
            response = ParserRegisterResponse(success=False, error=str(e))
            return response.model_dump(mode="json")

    async def handle_parser_get_status(self, parser_id: str) -> dict:
        """
        Handle parser status request.

        Args:
            parser_id: Parser to look up

        Returns:
            ParserStatusResponse as dict; success=False when the parser is
            not registered
        """
        parser_info = self.parsers.get(parser_id)

        if not parser_info:
            response = ParserStatusResponse(success=False, error=f"Parser {parser_id} not found")
            return response.model_dump(mode="json")

        response = ParserStatusResponse(success=True, parser=parser_info)
        return response.model_dump(mode="json")

    async def handle_parser_list(self, parser_type: str = None) -> dict:
        """
        Handle parser list request.

        Args:
            parser_type: When given, only parsers of this type are returned

        Returns:
            ParserListResponse as dict with the parsers and their count
        """
        parsers = list(self.parsers.values())

        if parser_type:
            parsers = [p for p in parsers if p.parser_type == parser_type]

        response = ParserListResponse(success=True, parsers=parsers, total=len(parsers))
        return response.model_dump(mode="json")

    async def handle_parser_get_health(self, parser_id: str) -> dict:
        """
        Handle parser health check.

        Args:
            parser_id: Parser to report on

        Returns:
            ParserHealthResponse as dict; success=False when the parser is
            not registered
        """
        if parser_id not in self.parsers:
            response = ParserHealthResponse(success=False, error="Parser not found")
            return response.model_dump(mode="json")

        # Health check implementation should be provided by external service
        # NOTE(review): the figures below are fixed placeholders, not measurements.
        health = ParserHealth(parser_id=parser_id, status="healthy", response_time=0.1, memory_usage=50.0, cpu_usage=25.0, active_connections=1, queue_size=0)

        response = ParserHealthResponse(success=True, health=health)
        return response.model_dump(mode="json")
|