unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.0.dist-info/METADATA +164 -0
- unrealon-1.1.0.dist-info/RECORD +82 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
- unrealon-1.1.0.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.9.dist-info/METADATA +0 -810
- unrealon-1.0.9.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
|
@@ -1,726 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Smart Browser Service for UnrealOn Driver v3.0
|
|
3
|
-
|
|
4
|
-
Zero-configuration browser automation with intelligent features.
|
|
5
|
-
Wraps unrealon_browser with enhanced capabilities and smart defaults.
|
|
6
|
-
|
|
7
|
-
CRITICAL REQUIREMENTS COMPLIANCE:
|
|
8
|
-
- ✅ Absolute imports only
|
|
9
|
-
- ✅ Pydantic v2 models everywhere
|
|
10
|
-
- ✅ No Dict[str, Any] usage
|
|
11
|
-
- ✅ Complete type annotations
|
|
12
|
-
- ✅ Auto-generated model usage
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
import asyncio
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
from typing import Any, List, Optional, Union, Callable
|
|
18
|
-
from datetime import datetime
|
|
19
|
-
|
|
20
|
-
from unrealon_browser.src.core.browser_manager import BrowserManager
|
|
21
|
-
from unrealon_browser.src.managers import ProfileManager, CookieManager, StealthManager
|
|
22
|
-
from unrealon_browser.src.dto.models.statistics import BrowserStatistics
|
|
23
|
-
from unrealon_browser.src.dto.models.core import PageResult
|
|
24
|
-
from unrealon_browser.src.dto.models.config import (
|
|
25
|
-
BrowserConfig as UnrealOnBrowserConfig,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
# CRITICAL REQUIREMENTS COMPLIANCE - NO INLINE IMPORTS!
|
|
29
|
-
from unrealon_browser.src.dto import (
|
|
30
|
-
BrowserConfig,
|
|
31
|
-
BrowserType,
|
|
32
|
-
BrowserMode,
|
|
33
|
-
)
|
|
34
|
-
from unrealon_sdk.src.provider import Utils
|
|
35
|
-
from unrealon_sdk.src.clients.python_http.models.SuccessResponse import SuccessResponse
|
|
36
|
-
from unrealon_sdk.src.clients.python_http.models.ErrorResponse import ErrorResponse
|
|
37
|
-
from unrealon_sdk.src.enterprise.logging.development import get_development_logger
|
|
38
|
-
from unrealon_sdk.src.dto.logging import SDKContext, SDKEventType
|
|
39
|
-
|
|
40
|
-
from unrealon_driver.src.core.exceptions import BrowserError, create_browser_error
|
|
41
|
-
from unrealon_driver.src.dto.services import (
|
|
42
|
-
DriverBrowserConfig,
|
|
43
|
-
ServiceHealthStatus,
|
|
44
|
-
ServiceOperationResult,
|
|
45
|
-
)
|
|
46
|
-
from unrealon_driver.src.dto.events import DriverEventType
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class BrowserService:
|
|
50
|
-
"""
|
|
51
|
-
🌐 Smart Browser Service
|
|
52
|
-
|
|
53
|
-
Zero-configuration browser automation with intelligent features:
|
|
54
|
-
- 🔥 STEALTH BY DEFAULT - all navigation uses stealth automatically
|
|
55
|
-
- Smart waiting and content detection
|
|
56
|
-
- Automatic anti-detection measures
|
|
57
|
-
- Error recovery and retries
|
|
58
|
-
- Resource management
|
|
59
|
-
- Performance optimization
|
|
60
|
-
|
|
61
|
-
🔥 NAVIGATION METHODS:
|
|
62
|
-
- navigate(url) - STEALTH navigation (recommended for all use)
|
|
63
|
-
- navigate_unsafe(url) - without stealth (use only when stealth not needed)
|
|
64
|
-
- get_html(url) - STEALTH + special Amazon handling
|
|
65
|
-
"""
|
|
66
|
-
|
|
67
|
-
def __init__(
|
|
68
|
-
self,
|
|
69
|
-
config: DriverBrowserConfig,
|
|
70
|
-
logger: Optional[Any] = None,
|
|
71
|
-
metrics: Optional[Any] = None,
|
|
72
|
-
):
|
|
73
|
-
"""Initialize browser service with auto-configuration."""
|
|
74
|
-
self.config = config
|
|
75
|
-
self.logger = logger
|
|
76
|
-
self.metrics = metrics
|
|
77
|
-
|
|
78
|
-
# ✅ DEVELOPMENT LOGGER INTEGRATION (CRITICAL REQUIREMENT)
|
|
79
|
-
self.dev_logger = get_development_logger()
|
|
80
|
-
|
|
81
|
-
# Browser management
|
|
82
|
-
self._browser_manager: Optional[BrowserManager] = None
|
|
83
|
-
self._current_page = None
|
|
84
|
-
self._is_initialized = False
|
|
85
|
-
|
|
86
|
-
# Performance tracking
|
|
87
|
-
self._operation_count = 0
|
|
88
|
-
self._total_duration = 0.0
|
|
89
|
-
|
|
90
|
-
# Log initialization with development logger
|
|
91
|
-
if self.dev_logger:
|
|
92
|
-
self.dev_logger.log_info(
|
|
93
|
-
SDKEventType.COMPONENT_CREATED,
|
|
94
|
-
"Browser service initialized",
|
|
95
|
-
context=SDKContext(
|
|
96
|
-
parser_id=self.config.parser_id,
|
|
97
|
-
component_name="Browser",
|
|
98
|
-
layer_name="UnrealOn_Driver",
|
|
99
|
-
metadata={
|
|
100
|
-
"headless": self.config.headless,
|
|
101
|
-
"stealth": True, # Always enabled
|
|
102
|
-
"timeout": self.config.timeout,
|
|
103
|
-
"debug_mode": self.config.debug_mode,
|
|
104
|
-
},
|
|
105
|
-
),
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
async def _log_driver_event(
|
|
109
|
-
self, event_type: DriverEventType, message: str, **metadata
|
|
110
|
-
) -> None:
|
|
111
|
-
"""Log ONLY driver-specific events (not browser module events)."""
|
|
112
|
-
if self.dev_logger and event_type in [
|
|
113
|
-
DriverEventType.SERVICE_INITIALIZED,
|
|
114
|
-
DriverEventType.SERVICE_ERROR,
|
|
115
|
-
DriverEventType.BROWSER_CONTENT_EXTRACTED,
|
|
116
|
-
DriverEventType.BROWSER_SCREENSHOT_TAKEN,
|
|
117
|
-
]:
|
|
118
|
-
self.dev_logger.log_info(
|
|
119
|
-
event_type.value,
|
|
120
|
-
message,
|
|
121
|
-
context=SDKContext(
|
|
122
|
-
parser_id=self.config.parser_id,
|
|
123
|
-
component_name="Browser",
|
|
124
|
-
layer_name="UnrealOn_Driver",
|
|
125
|
-
metadata=metadata,
|
|
126
|
-
),
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
async def _ensure_initialized(self):
|
|
130
|
-
"""Ensure browser is initialized."""
|
|
131
|
-
if not self._is_initialized:
|
|
132
|
-
await self._initialize_browser()
|
|
133
|
-
|
|
134
|
-
async def _initialize_browser(self):
|
|
135
|
-
"""Initialize browser with unrealon_browser integration."""
|
|
136
|
-
try:
|
|
137
|
-
|
|
138
|
-
browser_config = Utils.create_browser_config(
|
|
139
|
-
parser_name=self.config.parser_id,
|
|
140
|
-
browser_type=BrowserType.CHROMIUM,
|
|
141
|
-
# 🔥 STEALTH ALWAYS ON - NO CONFIG NEEDED!
|
|
142
|
-
headless=self.config.headless
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
# Create browser manager (logger_bridge auto-integrates with SDK)
|
|
146
|
-
self._browser_manager = BrowserManager(config=browser_config)
|
|
147
|
-
|
|
148
|
-
# Setup system paths if provided
|
|
149
|
-
system_dir = self.config.user_data_dir
|
|
150
|
-
if system_dir:
|
|
151
|
-
profiles_dir = Path(system_dir) / "browser_profiles"
|
|
152
|
-
cookies_dir = Path(system_dir) / "cookies"
|
|
153
|
-
|
|
154
|
-
# Ensure directories exist
|
|
155
|
-
profiles_dir.mkdir(parents=True, exist_ok=True)
|
|
156
|
-
cookies_dir.mkdir(parents=True, exist_ok=True)
|
|
157
|
-
|
|
158
|
-
# Override managers with custom paths
|
|
159
|
-
self._browser_manager.profile_manager = ProfileManager(
|
|
160
|
-
profiles_dir=str(profiles_dir)
|
|
161
|
-
)
|
|
162
|
-
self._browser_manager.cookie_manager = CookieManager(
|
|
163
|
-
cookies_dir=str(cookies_dir),
|
|
164
|
-
parser_name=self.config.parser_id,
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
# Initialize browser async
|
|
168
|
-
await self._browser_manager.initialize_async()
|
|
169
|
-
|
|
170
|
-
self._is_initialized = True
|
|
171
|
-
|
|
172
|
-
# Log browser initialized event
|
|
173
|
-
if self.logger:
|
|
174
|
-
self.logger.info(
|
|
175
|
-
f"Browser service initialized - headless: {self.config.headless}"
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
if self.logger:
|
|
179
|
-
self.logger.info("Browser service initialized successfully")
|
|
180
|
-
|
|
181
|
-
except Exception as e:
|
|
182
|
-
# Log browser launch failure
|
|
183
|
-
if self.logger:
|
|
184
|
-
self.logger.error(f"Browser initialization failed: {e}")
|
|
185
|
-
raise BrowserError(f"Failed to initialize browser: {e}")
|
|
186
|
-
|
|
187
|
-
def _convert_config_to_unrealon_browser(self) -> UnrealOnBrowserConfig:
|
|
188
|
-
"""Convert our config to unrealon_browser Pydantic model with type safety."""
|
|
189
|
-
return UnrealOnBrowserConfig(
|
|
190
|
-
parser_name=self.config.parser_id,
|
|
191
|
-
page_load_timeout_seconds=float(self.config.timeout),
|
|
192
|
-
navigation_timeout_seconds=float(self.config.timeout),
|
|
193
|
-
disable_images=not self.config.enable_images,
|
|
194
|
-
# Map our settings to unrealon_browser settings
|
|
195
|
-
use_proxy_rotation=False, # Default behavior
|
|
196
|
-
realistic_ports_only=False, # Default behavior
|
|
197
|
-
enable_stealth_check=self.config.debug_mode,
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
# ==========================================
|
|
201
|
-
# SMART EXTRACTION METHODS
|
|
202
|
-
# ==========================================
|
|
203
|
-
|
|
204
|
-
async def extract(
|
|
205
|
-
self,
|
|
206
|
-
url: str,
|
|
207
|
-
selector: str,
|
|
208
|
-
limit: Optional[int] = None,
|
|
209
|
-
timeout: Optional[int] = None,
|
|
210
|
-
attribute: Optional[str] = None,
|
|
211
|
-
**kwargs,
|
|
212
|
-
) -> List[str]:
|
|
213
|
-
"""
|
|
214
|
-
🎯 Smart extraction with automatic waiting and error handling.
|
|
215
|
-
|
|
216
|
-
Args:
|
|
217
|
-
url: Target URL
|
|
218
|
-
selector: CSS selector
|
|
219
|
-
limit: Maximum number of items to extract
|
|
220
|
-
timeout: Custom timeout (uses default if not specified)
|
|
221
|
-
attribute: Extract attribute instead of text
|
|
222
|
-
**kwargs: Additional options
|
|
223
|
-
|
|
224
|
-
Returns:
|
|
225
|
-
List of extracted text/attributes
|
|
226
|
-
|
|
227
|
-
Example:
|
|
228
|
-
headlines = await browser.extract(
|
|
229
|
-
"https://news.com",
|
|
230
|
-
".headline",
|
|
231
|
-
limit=10
|
|
232
|
-
)
|
|
233
|
-
"""
|
|
234
|
-
start_time = datetime.now()
|
|
235
|
-
|
|
236
|
-
try:
|
|
237
|
-
await self._ensure_initialized()
|
|
238
|
-
|
|
239
|
-
# Navigate to URL with smart waiting
|
|
240
|
-
page = await self._navigate_smart(url, timeout=timeout)
|
|
241
|
-
|
|
242
|
-
# Wait for content to be ready
|
|
243
|
-
await self._wait_for_content_ready(page, selector, timeout)
|
|
244
|
-
|
|
245
|
-
# Extract elements
|
|
246
|
-
if attribute:
|
|
247
|
-
elements = await page.query_selector_all(selector)
|
|
248
|
-
results = [
|
|
249
|
-
await element.get_attribute(attribute)
|
|
250
|
-
for element in elements[:limit]
|
|
251
|
-
if element
|
|
252
|
-
]
|
|
253
|
-
results = [r for r in results if r] # Filter None values
|
|
254
|
-
else:
|
|
255
|
-
elements = await page.query_selector_all(selector)
|
|
256
|
-
results = [
|
|
257
|
-
await element.text_content()
|
|
258
|
-
for element in elements[:limit]
|
|
259
|
-
if element
|
|
260
|
-
]
|
|
261
|
-
results = [r.strip() for r in results if r and r.strip()] # Clean text
|
|
262
|
-
|
|
263
|
-
# Apply limit if specified
|
|
264
|
-
if limit:
|
|
265
|
-
results = results[:limit]
|
|
266
|
-
|
|
267
|
-
# Record metrics
|
|
268
|
-
duration = (datetime.now() - start_time).total_seconds()
|
|
269
|
-
self._record_operation("extract", duration, len(results))
|
|
270
|
-
|
|
271
|
-
if self.logger:
|
|
272
|
-
self.logger.info(
|
|
273
|
-
f"Extracted {len(results)} items from {url} in {duration:.2f}s"
|
|
274
|
-
)
|
|
275
|
-
|
|
276
|
-
return results
|
|
277
|
-
|
|
278
|
-
except Exception as e:
|
|
279
|
-
duration = (datetime.now() - start_time).total_seconds()
|
|
280
|
-
self._record_operation("extract", duration, 0, error=str(e))
|
|
281
|
-
|
|
282
|
-
raise create_browser_error(
|
|
283
|
-
f"Failed to extract from {url}: {e}", url=url, selector=selector
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
async def extract_all(
|
|
287
|
-
self, url: str, selector: str, timeout: Optional[int] = None, **kwargs
|
|
288
|
-
) -> List[str]:
|
|
289
|
-
"""Extract all matching elements without limit."""
|
|
290
|
-
return await self.extract(url, selector, limit=None, timeout=timeout, **kwargs)
|
|
291
|
-
|
|
292
|
-
async def extract_attributes(
|
|
293
|
-
self,
|
|
294
|
-
url: str,
|
|
295
|
-
selector: str,
|
|
296
|
-
attribute: str,
|
|
297
|
-
limit: Optional[int] = None,
|
|
298
|
-
timeout: Optional[int] = None,
|
|
299
|
-
**kwargs,
|
|
300
|
-
) -> List[str]:
|
|
301
|
-
"""Extract specific attributes from elements."""
|
|
302
|
-
return await self.extract(
|
|
303
|
-
url, selector, limit=limit, timeout=timeout, attribute=attribute, **kwargs
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
async def extract_structured(
|
|
307
|
-
self, url: str, schema: dict, timeout: Optional[int] = None, **kwargs
|
|
308
|
-
) -> dict:
|
|
309
|
-
"""
|
|
310
|
-
🏗️ Extract structured data using schema definition.
|
|
311
|
-
|
|
312
|
-
Args:
|
|
313
|
-
url: Target URL
|
|
314
|
-
schema: Schema defining what to extract
|
|
315
|
-
timeout: Custom timeout
|
|
316
|
-
**kwargs: Additional options
|
|
317
|
-
|
|
318
|
-
Returns:
|
|
319
|
-
Structured data matching schema
|
|
320
|
-
|
|
321
|
-
Example:
|
|
322
|
-
products = await browser.extract_structured(
|
|
323
|
-
"https://shop.com",
|
|
324
|
-
schema={
|
|
325
|
-
"name": ".product-name",
|
|
326
|
-
"price": ".price",
|
|
327
|
-
"rating": ".rating"
|
|
328
|
-
}
|
|
329
|
-
)
|
|
330
|
-
"""
|
|
331
|
-
start_time = datetime.now()
|
|
332
|
-
|
|
333
|
-
try:
|
|
334
|
-
await self._ensure_initialized()
|
|
335
|
-
page = await self._navigate_smart(url, timeout=timeout)
|
|
336
|
-
|
|
337
|
-
result = {}
|
|
338
|
-
|
|
339
|
-
for field, selector in schema.items():
|
|
340
|
-
if isinstance(selector, dict):
|
|
341
|
-
# Nested schema
|
|
342
|
-
if "selector" in selector and "fields" in selector:
|
|
343
|
-
# Multiple items with fields
|
|
344
|
-
items = []
|
|
345
|
-
elements = await page.query_selector_all(selector["selector"])
|
|
346
|
-
|
|
347
|
-
for element in elements:
|
|
348
|
-
item = {}
|
|
349
|
-
for sub_field, sub_selector in selector["fields"].items():
|
|
350
|
-
sub_element = await element.query_selector(sub_selector)
|
|
351
|
-
if sub_element:
|
|
352
|
-
item[sub_field] = (
|
|
353
|
-
await sub_element.text_content()
|
|
354
|
-
).strip()
|
|
355
|
-
if item:
|
|
356
|
-
items.append(item)
|
|
357
|
-
|
|
358
|
-
result[field] = items
|
|
359
|
-
else:
|
|
360
|
-
# Single nested object
|
|
361
|
-
nested_result = {}
|
|
362
|
-
for sub_field, sub_selector in selector.items():
|
|
363
|
-
element = await page.query_selector(sub_selector)
|
|
364
|
-
if element:
|
|
365
|
-
nested_result[sub_field] = (
|
|
366
|
-
await element.text_content()
|
|
367
|
-
).strip()
|
|
368
|
-
result[field] = nested_result
|
|
369
|
-
else:
|
|
370
|
-
# Simple selector
|
|
371
|
-
element = await page.query_selector(selector)
|
|
372
|
-
if element:
|
|
373
|
-
result[field] = (await element.text_content()).strip()
|
|
374
|
-
|
|
375
|
-
duration = (datetime.now() - start_time).total_seconds()
|
|
376
|
-
self._record_operation("extract_structured", duration, len(result))
|
|
377
|
-
|
|
378
|
-
# Log content extraction success
|
|
379
|
-
await self._log_driver_event(
|
|
380
|
-
DriverEventType.BROWSER_CONTENT_EXTRACTED,
|
|
381
|
-
f"Content extracted successfully from {url}",
|
|
382
|
-
url=url,
|
|
383
|
-
extraction_time_ms=duration * 1000,
|
|
384
|
-
fields_extracted=len(result),
|
|
385
|
-
schema_fields=list(schema.keys()),
|
|
386
|
-
)
|
|
387
|
-
|
|
388
|
-
return result
|
|
389
|
-
|
|
390
|
-
except Exception as e:
|
|
391
|
-
duration = (datetime.now() - start_time).total_seconds()
|
|
392
|
-
self._record_operation("extract_structured", duration, 0, error=str(e))
|
|
393
|
-
|
|
394
|
-
raise create_browser_error(
|
|
395
|
-
f"Failed to extract structured data from {url}: {e}", url=url
|
|
396
|
-
)
|
|
397
|
-
|
|
398
|
-
# ==========================================
|
|
399
|
-
# NAVIGATION AND PAGE CONTROL
|
|
400
|
-
# ==========================================
|
|
401
|
-
|
|
402
|
-
async def navigate(self, url: str, timeout: Optional[int] = None):
    """Navigate to ``url`` using the stealth path, which is the default.

    Args:
        url: Address to open.
        timeout: Optional navigation timeout, forwarded unchanged.

    Returns:
        Whatever ``_navigate_stealth`` returns for the same arguments.
    """
    stealth_result = await self._navigate_stealth(url, timeout)
    return stealth_result
|
|
405
|
-
|
|
406
|
-
async def navigate_unsafe(self, url: str, timeout: Optional[int] = None):
    """Navigate straight to ``url`` without the stealth pre-navigation step.

    The service is initialized first (via ``_ensure_initialized``) and the
    request is then handed to ``_navigate_smart``.

    Args:
        url: Address to open.
        timeout: Optional navigation timeout, forwarded unchanged.

    Returns:
        Whatever ``_navigate_smart`` returns.
    """
    await self._ensure_initialized()
    direct_result = await self._navigate_smart(url, timeout)
    return direct_result
|
|
410
|
-
|
|
411
|
-
async def _navigate_stealth(self, url: str, timeout: Optional[int] = None):
    """Open a blank page first, pause briefly, then navigate to ``url``.

    Args:
        url: Final address to open.
        timeout: Optional navigation timeout, forwarded to
            ``_navigate_smart`` for the second hop only.

    Returns:
        Whatever ``_navigate_smart`` returns for the target URL.
    """
    await self._ensure_initialized()

    # Use the page the service already tracks; otherwise fall back to the
    # page exposed by the browser manager.
    active_page = self._current_page
    if not active_page:
        active_page = self._browser_manager.page

    if self.logger:
        self.logger.info(f"🕸️ Stealth navigation: blank → {url}")

    # Hop 1: blank page, followed by a fixed one-second pause.
    await active_page.goto("about:blank", wait_until="domcontentloaded")
    await asyncio.sleep(1.0)

    # Hop 2: the real destination.
    landed = await self._navigate_smart(url, timeout)
    return landed
|
|
427
|
-
|
|
428
|
-
async def get_html(self, url: str, timeout: Optional[int] = None) -> str:
    """Return the full HTML of ``url`` after a stealth-style navigation.

    URLs containing the substring ``"amazon.com"`` take a dedicated path:
    blank page, then the Amazon homepage, then the target URL, with fixed
    pauses in between and a short wait for search-result markup. That path
    uses fixed 15000 ms navigation timeouts and does not consult
    ``timeout``. Every other URL goes through ``_navigate_stealth``.

    Args:
        url: Address whose HTML is wanted.
        timeout: Optional navigation timeout; only forwarded on the
            non-Amazon path.

    Returns:
        The page content as returned by ``page.content()``.
    """
    if "amazon.com" not in url:
        # Regular stealth navigation for non-Amazon sites.
        page = await self._navigate_stealth(url, timeout)
        return await page.content()

    await self._ensure_initialized()
    page = self._current_page or self._browser_manager.page

    if self.logger:
        self.logger.info(f"🛒 Amazon navigation: homepage → {url}")

    # Step 1: blank page, then the Amazon homepage, letting it settle.
    await page.goto("about:blank", wait_until="domcontentloaded")
    await asyncio.sleep(1.0)
    await page.goto("https://www.amazon.com", wait_until="domcontentloaded", timeout=15000)
    await asyncio.sleep(2.0)

    # Step 2: the target URL, followed by a fixed wait for dynamic content.
    await page.goto(url, wait_until="domcontentloaded", timeout=15000)
    await asyncio.sleep(3.0)

    # Step 3: give search results a chance to appear.
    try:
        await page.wait_for_selector("[data-component-type='s-search-result']", timeout=5000)
    except Exception:
        # Only ordinary failures (e.g. the selector never showing up) fall
        # back to a plain pause. A bare ``except:`` here would also swallow
        # asyncio.CancelledError / KeyboardInterrupt and keep a cancelled
        # task running.
        await asyncio.sleep(2.0)

    return await page.content()
|
|
463
|
-
|
|
464
|
-
async def screenshot(
    self,
    url: Optional[str] = None,
    path: Optional[str] = None,
    full_page: bool = True,
) -> str:
    """Capture a screenshot and return the file path it was written to.

    Args:
        url: If given, the page is obtained by navigating there first;
            otherwise the service's current page is used.
        path: Output file path. When omitted, a name of the form
            ``screenshot_<YYYYmmdd_HHMMSS>.png`` is generated.
        full_page: Forwarded to the page's ``screenshot`` call.

    Returns:
        The path passed to the page's ``screenshot`` call.

    Raises:
        BrowserError: On any failure, including the absence of a current
            page when no ``url`` is supplied.
    """
    try:
        target_page = await self.navigate(url) if url else self._current_page
        if not url and not target_page:
            raise BrowserError("No active page for screenshot")

        output_path = path
        if not output_path:
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"screenshot_{stamp}.png"

        await target_page.screenshot(path=output_path, full_page=full_page)

        if self.logger:
            self.logger.info(f"Screenshot saved: {output_path}")

        return output_path

    except Exception as e:
        # Every failure, including the BrowserError raised above, is
        # re-wrapped with a common prefix.
        raise BrowserError(f"Failed to take screenshot: {e}")
|
|
492
|
-
|
|
493
|
-
# ==========================================
|
|
494
|
-
# SMART FEATURES
|
|
495
|
-
# ==========================================
|
|
496
|
-
|
|
497
|
-
async def extract_with_retry(
    self,
    url: str,
    selector: str,
    max_retries: int = 3,
    backoff_factor: float = 2.0,
    **kwargs,
) -> List[str]:
    """Call ``self.extract`` and retry on failure with exponential backoff.

    Args:
        url: Address to extract from.
        selector: Selector forwarded to ``extract``.
        max_retries: Number of retries after the first attempt. Negative
            values are treated as ``0`` so that at least one attempt is
            always made.
        backoff_factor: Base of the delay; the wait before retry ``k``
            (0-based) is ``backoff_factor ** k`` seconds.
        **kwargs: Forwarded to ``extract`` unchanged.

    Returns:
        The value returned by the first successful ``extract`` call.

    Raises:
        Exception: The error from the final attempt once all attempts fail.
    """
    # Without this clamp a negative budget skips the loop entirely and the
    # final ``raise`` is handed ``None``, which surfaces as a TypeError.
    retries = max(max_retries, 0)
    last_error = None

    for attempt in range(retries + 1):
        try:
            return await self.extract(url, selector, **kwargs)
        except Exception as e:
            last_error = e
            if attempt < retries:
                delay = backoff_factor**attempt
                if self.logger:
                    self.logger.warning(
                        f"Extraction attempt {attempt + 1} failed, retrying in {delay}s: {e}"
                    )
                await asyncio.sleep(delay)
            elif self.logger:
                self.logger.error(f"All {retries + 1} extraction attempts failed")

    # The loop runs at least once, so last_error is always set here.
    raise last_error
|
|
527
|
-
|
|
528
|
-
async def extract_with_scroll(
    self,
    url: str,
    selector: str,
    max_scrolls: int = 10,
    scroll_delay: float = 1.0,
    auto_detect_end: bool = True,
    **kwargs,
) -> List[str]:
    """Collect text for ``selector`` while scrolling an infinite-scroll page.

    After navigating to ``url`` the method repeatedly reads the stripped,
    non-empty text of every element matching ``selector``, scrolls to the
    bottom of the document and waits, up to ``max_scrolls`` times.

    Args:
        url: Address to open (via ``self.navigate``).
        selector: Selector passed to ``page.query_selector_all``.
        max_scrolls: Upper bound on scroll iterations; ``0`` or less
            yields an empty list without scrolling.
        scroll_delay: Seconds to sleep after each scroll, in addition to a
            fixed ``page.wait_for_timeout(1000)``.
        auto_detect_end: Stop early when an iteration finds the same
            number of items as the previous one.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The texts gathered on the last iteration that found a new count.

    Raises:
        Exception: Whatever ``create_browser_error`` builds, for any
            failure during navigation or extraction.
    """
    try:
        page = await self.navigate(url)
        all_results: List[str] = []
        last_count = 0
        scrolls_done = 0  # tracked separately so logging never reads an unbound loop variable

        for scroll in range(max_scrolls):
            # Extract current items.
            elements = await page.query_selector_all(selector)
            current_results = []
            for elem in elements:
                if not elem:
                    continue
                # text_content() may yield None for some nodes; treat that
                # as empty text instead of failing on None.strip().
                text = ((await elem.text_content()) or "").strip()
                if text:
                    current_results.append(text)

            # Same count as last time: assume nothing new was loaded.
            if auto_detect_end and len(current_results) == last_count:
                if self.logger:
                    self.logger.info(
                        f"No new items found, stopping scroll at {scroll}"
                    )
                break

            all_results = current_results
            last_count = len(current_results)

            # Scroll to the bottom, then give new content time to load.
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(scroll_delay)
            await page.wait_for_timeout(1000)
            scrolls_done += 1

        if self.logger:
            self.logger.info(
                f"Extracted {len(all_results)} items with {scrolls_done} scrolls"
            )

        return all_results

    except Exception as e:
        raise create_browser_error(
            f"Failed to extract with scroll from {url}: {e}",
            url=url,
            selector=selector,
        ) from e
|
|
582
|
-
|
|
583
|
-
# ==========================================
|
|
584
|
-
# PRIVATE METHODS
|
|
585
|
-
# ==========================================
|
|
586
|
-
|
|
587
|
-
async def _navigate_smart(self, url: str, timeout: Optional[int] = None):
    """Load ``url`` in the service's page and return that page.

    Args:
        url: Address to open.
        timeout: Navigation timeout; falls back to ``self.config.timeout``
            when falsy. The value is multiplied by 1000 before being given
            to ``page.goto`` (presumably seconds to milliseconds).

    Returns:
        The page object that performed the navigation.

    Raises:
        BrowserError: If anything in the navigation sequence fails.
    """
    effective_timeout = timeout if timeout else self.config.timeout
    start_time = datetime.now()  # captured on entry; not read again in this method

    try:
        # Adopt the manager's page the first time through.
        if not self._current_page:
            self._current_page = self._browser_manager.page
        active_page = self._current_page

        # Wait only for DOMContentLoaded, then pause one second for basic content.
        await active_page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=effective_timeout * 1000,
        )
        await asyncio.sleep(1.0)

        return active_page

    except Exception as e:
        raise BrowserError(f"Failed to navigate to {url}: {e}")
|
|
614
|
-
|
|
615
|
-
async def _wait_for_content_ready(
    self, page, selector: str, timeout: Optional[int] = None
):
    """Best-effort wait for ``selector`` to appear on ``page``.

    Args:
        page: Page object exposing ``wait_for_selector``.
        selector: Selector to wait for.
        timeout: Wait budget; falls back to ``self.config.timeout`` when
            falsy. Multiplied by 1000 before being passed on.

    Returns:
        None. Failures while waiting are deliberately ignored so the
        caller's extraction step can decide what to do.
    """
    wait_budget = timeout or self.config.timeout

    try:
        await page.wait_for_selector(selector, timeout=wait_budget * 1000)
        # Short extra pause for content that renders after the selector appears.
        await asyncio.sleep(0.5)
    except Exception:
        # Selector not found: acceptable, extraction handles the empty case.
        return
|
|
631
|
-
|
|
632
|
-
async def _wait_for_dynamic_content(self, page, max_wait: float = 3.0):
    """Wait up to ``max_wait`` seconds for the page's network to go idle.

    Args:
        page: Page object exposing ``wait_for_load_state``.
        max_wait: Wait budget; multiplied by 1000 before being passed on.

    Returns:
        None. A failed or timed-out wait is logged at debug level (when a
        logger is configured) and otherwise ignored.
    """
    idle_budget_ms = max_wait * 1000
    try:
        await page.wait_for_load_state("networkidle", timeout=idle_budget_ms)
    except Exception as e:
        # Not reaching network idle is tolerated: the page may be ready enough.
        if not self.logger:
            return
        self.logger.debug(f"Network idle wait timeout (acceptable): {e}")
|
|
642
|
-
|
|
643
|
-
def _record_operation(
    self,
    operation: str,
    duration: float,
    result_count: int,
    error: Optional[str] = None,
):
    """Update the service's running counters and forward the sample to metrics.

    Args:
        operation: Name of the operation being recorded.
        duration: Duration added to the running total.
        result_count: Number of results the operation produced.
        error: Optional error text for failed operations.
    """
    self._operation_count = self._operation_count + 1
    self._total_duration = self._total_duration + duration

    recorder = self.metrics
    if not recorder:
        return

    # All samples from this class are reported under the "browser" service.
    recorder.record_operation(
        service="browser",
        operation=operation,
        duration=duration,
        result_count=result_count,
        error=error,
    )
|
|
662
|
-
|
|
663
|
-
# ==========================================
|
|
664
|
-
# SERVICE MANAGEMENT
|
|
665
|
-
# ==========================================
|
|
666
|
-
|
|
667
|
-
async def health_check(self) -> dict:
    """Report the browser service's health as a plain dict.

    Returns:
        A dict with ``status``, ``service_name`` and ``last_check``, plus:

        * ``last_error`` / ``error_count`` when the service is not
          initialized or the probe raised (status ``"degraded"``);
        * ``response_time_ms``, ``error_rate``, ``uptime_seconds`` and
          ``error_count`` when the probe succeeded (status ``"healthy"``).
          ``uptime_seconds`` carries the operation count as a stand-in.

        The method never raises; failures are reported in the dict.
    """

    def degraded_report(checked_at: str, reason: str) -> dict:
        # Shared shape for both "not initialized" and "probe failed".
        return {
            "status": "degraded",
            "service_name": "browser",
            "last_check": checked_at,
            "last_error": reason,
            "error_count": 1,
        }

    try:
        checked_at = datetime.now().isoformat()

        if not self._is_initialized:
            return degraded_report(checked_at, "Service not initialized")

        # Probe: obtain a page from the manager and close it, timing the round trip.
        probe_started = datetime.now()
        probe_page = await self._browser_manager.get_page()
        await probe_page.close()
        elapsed_ms = (datetime.now() - probe_started).total_seconds() * 1000

        return {
            "status": "healthy",
            "service_name": "browser",
            "last_check": checked_at,
            "response_time_ms": elapsed_ms,
            "error_rate": 0.0,
            "uptime_seconds": self._operation_count,  # operation count used as a proxy
            "error_count": 0,
        }
    except Exception as e:
        return degraded_report(datetime.now().isoformat(), str(e))
|
|
704
|
-
|
|
705
|
-
async def cleanup(self):
    """Release the current page and the browser manager, then reset state.

    Each resource is closed independently: a failure while closing the
    page no longer prevents the browser manager from being closed, and the
    service is always left marked as uninitialized with both references
    cleared. Failures are logged (when a logger is configured) and never
    raised.
    """
    failures = []

    page = self._current_page
    if page:
        try:
            await page.close()
        except Exception as e:
            failures.append(e)
        finally:
            # Drop the reference even if close() failed; the page is unusable either way.
            self._current_page = None

    manager = self._browser_manager
    if manager:
        try:
            await manager.close_async()
        except Exception as e:
            failures.append(e)
        finally:
            self._browser_manager = None

    self._is_initialized = False

    if self.logger:
        for failure in failures:
            self.logger.error(f"Error during browser cleanup: {failure}")
        if not failures:
            self.logger.info("Browser service cleaned up")
|
|
724
|
-
|
|
725
|
-
def __repr__(self) -> str:
    """Return a short debug string showing initialization state and operation count."""
    ready = self._is_initialized
    ops = self._operation_count
    return f"<BrowserService(initialized={ready}, operations={ops})>"
|