unrealon 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.0.dist-info/METADATA +164 -0
- unrealon-1.1.0.dist-info/RECORD +82 -0
- {unrealon-1.0.8.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
- unrealon-1.1.0.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.8.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.8.dist-info/METADATA +0 -803
- unrealon-1.0.8.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
|
@@ -0,0 +1,628 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Parser Manager - Unified parser management system with Pydantic v2
|
|
3
|
+
|
|
4
|
+
Strict compliance with CRITICAL_REQUIREMENTS.md:
|
|
5
|
+
- No Dict[str, Any] usage
|
|
6
|
+
- Complete type annotations
|
|
7
|
+
- Pydantic v2 models everywhere
|
|
8
|
+
- Custom exception hierarchy
|
|
9
|
+
- No try blocks in imports
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Optional, List, Union, Any
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from pydantic import BaseModel, Field, ConfigDict, field_validator
|
|
17
|
+
|
|
18
|
+
from unrealon_bridge import ParserBridgeClient
|
|
19
|
+
from unrealon_rpc.logging import get_logger
|
|
20
|
+
|
|
21
|
+
from .managers import (
|
|
22
|
+
ConfigManager, ParserConfig,
|
|
23
|
+
ResultManager, ParseResult, ParseMetrics,
|
|
24
|
+
ErrorManager, RetryConfig, ErrorInfo,
|
|
25
|
+
LoggingManager, LoggingConfig, LogLevel,
|
|
26
|
+
HTMLManager, HTMLCleaningConfig,
|
|
27
|
+
BrowserManager, BrowserConfig
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ParserManagerConfig(BaseModel):
    """Aggregate configuration consumed by the parser manager.

    Bundles the core parser configuration with the logging, HTML cleaning,
    browser and retry configurations, plus the bridge switches. Unknown
    fields are rejected and assignments are validated.
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # Core configuration
    parser_config: ParserConfig = Field(default_factory=ParserConfig, description="Core parser configuration")

    # Manager configurations
    logging_config: LoggingConfig = Field(default_factory=LoggingConfig, description="Logging configuration")
    html_config: HTMLCleaningConfig = Field(default_factory=HTMLCleaningConfig, description="HTML cleaning configuration")
    browser_config: BrowserConfig = Field(default_factory=BrowserConfig, description="Browser configuration")
    retry_config: RetryConfig = Field(default_factory=RetryConfig, description="Retry configuration")

    # Bridge settings
    bridge_enabled: bool = Field(default=True, description="Enable bridge connection")
    auto_register: bool = Field(default=True, description="Auto-register parser with bridge")

    def model_post_init(self, __context) -> None:
        """Copy the parser name and system directory into the nested configs."""
        # The logging config only receives the parser name if it exposes that attribute.
        name = self.parser_config.parser_name
        if hasattr(self.logging_config, 'parser_name'):
            self.logging_config.parser_name = name

        # Without a system directory there is nothing to derive paths from.
        base_dir = self.parser_config.system_dir
        if not base_dir:
            return

        self.logging_config.log_dir = base_dir / "logs"
        self.browser_config.screenshots_dir = base_dir / "screenshots"
        self.browser_config.cookies_file = base_dir / "cookies.json"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ParserStats(BaseModel):
    """Counters and identifiers describing one parser's activity.

    Unknown fields are rejected and assignments are validated, so the
    numeric bounds declared below also apply to later updates.
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # Identity (parser_id and parser_name are required)
    parser_id: str
    parser_name: str
    session_id: Optional[str] = None

    # Timing: session_start defaults to the current UTC time at construction
    session_start: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    session_duration: float = Field(default=0.0, ge=0.0)

    # Operations: success_rate is bounded to the 0..100 range
    operations_completed: int = Field(default=0, ge=0)
    operations_failed: int = Field(default=0, ge=0)
    success_rate: float = Field(default=0.0, ge=0.0, le=100.0)

    # Content processing
    pages_processed: int = Field(default=0, ge=0)
    html_cleaned_count: int = Field(default=0, ge=0)
    total_html_reduction: float = Field(default=0.0, ge=0.0)

    # Errors
    total_errors: int = Field(default=0, ge=0)
    retries_attempted: int = Field(default=0, ge=0)

    # Bridge
    bridge_connected: bool = False
    bridge_messages_sent: int = Field(default=0, ge=0)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class ParserManagerError(Exception):
    """Base exception for parser manager.

    Attributes:
        message: Human-readable description of the failure.
        operation: Name of the operation that failed.
        details: Optional string-to-string context; an empty dict when omitted.
    """

    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
        self.message = message
        self.operation = operation
        self.details = details or {}
        # Only the message goes into ``args`` so ``str(exc)`` stays the plain message.
        super().__init__(message)

    def __reduce__(self):
        """Make the exception (and its subclasses) picklable and copyable.

        The default exception reduction re-creates the object as
        ``cls(*self.args)``; ``args`` holds only the message, so the required
        ``operation`` argument would be missing and unpickling would raise
        ``TypeError``. Supplying the full constructor arguments avoids that.
        """
        return (
            self.__class__,
            (self.message, self.operation, self.details),
            self.__dict__,
        )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class InitializationError(ParserManagerError):
    """Signals that the parser manager could not be initialized."""
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class OperationError(ParserManagerError):
    """Signals that a parser operation did not complete."""
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ParserManager:
|
|
141
|
+
"""
|
|
142
|
+
🚀 Parser Manager - Unified parser management system
|
|
143
|
+
|
|
144
|
+
Features:
|
|
145
|
+
- Unified Configuration: Single config for all managers
|
|
146
|
+
- Automatic Lifecycle: Handles initialization, execution, cleanup
|
|
147
|
+
- Error Recovery: Smart retry logic with exponential backoff
|
|
148
|
+
- Performance Monitoring: Comprehensive statistics and metrics
|
|
149
|
+
- Bridge Integration: Seamless communication with Django
|
|
150
|
+
- Type Safety: Full Pydantic v2 compliance
|
|
151
|
+
|
|
152
|
+
Usage:
|
|
153
|
+
config = ParserManagerConfig(
|
|
154
|
+
parser_config=ParserConfig(parser_name="MyParser"),
|
|
155
|
+
bridge_enabled=True
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
async with ParserManager(config) as parser:
|
|
159
|
+
# Navigate and extract
|
|
160
|
+
html = await parser.get_html("https://example.com")
|
|
161
|
+
cleaned_html = await parser.clean_html(html)
|
|
162
|
+
result = await parser.analyze_html(cleaned_html)
|
|
163
|
+
|
|
164
|
+
# Results are automatically tracked
|
|
165
|
+
stats = parser.get_stats()
|
|
166
|
+
"""
|
|
167
|
+
|
|
168
|
+
def __init__(self, config: ParserManagerConfig):
    """Build the sub-managers from ``config`` and set the initial state.

    The bridge client starts as ``None`` and the manager is marked as not
    initialized; retry configurations are registered last.
    """
    self.config = config
    self.internal_logger = get_logger()

    parser_cfg = self.config.parser_config

    # One manager per concern, each fed its own slice of the configuration.
    self.config_manager = ConfigManager(parser_cfg)
    self.result_manager = ResultManager(parser_cfg.parser_id)
    self.error_manager = ErrorManager(self.internal_logger)
    self.logging_manager = LoggingManager(self.config.logging_config)
    self.html_manager = HTMLManager(self.config.html_config)
    self.browser_manager = BrowserManager(self.config.browser_config)

    # No bridge client until one is assigned.
    self.bridge_client: Optional[ParserBridgeClient] = None

    # Runtime state
    self._is_initialized = False
    self._session_id: Optional[str] = None
    self._stats = ParserStats(
        parser_id=parser_cfg.parser_id,
        parser_name=parser_cfg.parser_name,
    )

    self._setup_retry_configs()
|
|
193
|
+
|
|
194
|
+
# ==========================================
|
|
195
|
+
# LIFECYCLE MANAGEMENT
|
|
196
|
+
# ==========================================
|
|
197
|
+
|
|
198
|
+
async def initialize(self) -> None:
    """Bring up the bridge (when enabled) and the browser, once.

    Does nothing if the manager is already initialized.

    Raises:
        InitializationError: If any step fails; the original exception is
            recorded with the error manager and chained as the cause.
    """
    if self._is_initialized:
        return

    try:
        self.logging_manager.info("🚀 Initializing parser manager...")

        if self.config.bridge_enabled:
            await self._initialize_bridge()

        await self.browser_manager.initialize()

        if self.bridge_client:
            # Hand the bridge client to the logging manager, then register if configured.
            self.logging_manager.update_bridge_client(self.bridge_client)
            if self.config.auto_register:
                await self._register_parser()

        self._is_initialized = True
        self.logging_manager.info("✅ Parser manager initialized successfully")

    except Exception as exc:
        self.error_manager.record_error(exc, "initialization")
        raise InitializationError(
            message=f"Failed to initialize parser manager: {exc}",
            operation="initialization",
        ) from exc
|
|
230
|
+
|
|
231
|
+
async def cleanup(self) -> None:
    """Clean up all resources on a best-effort basis.

    Ends the active session (if any), cleans up the browser, and disconnects
    the bridge. A failure in one step is collected and logged as a warning;
    it never prevents the remaining steps from running and is never raised.

    After cleanup the manager is marked as not initialized and the session
    id is cleared, so a later ``get_html()`` call re-initializes instead of
    using released resources, and a repeated ``cleanup()`` does not try to
    end the same session twice.
    """
    self.logging_manager.info("🧹 Cleaning up parser manager...")

    cleanup_errors = []

    # End session if active
    if self._session_id and self.bridge_client:
        try:
            await self.bridge_client.end_session()
        except Exception as e:
            cleanup_errors.append(f"end_session: {e}")

    # Cleanup browser
    try:
        await self.browser_manager.cleanup()
    except Exception as e:
        cleanup_errors.append(f"browser_cleanup: {e}")

    # Disconnect bridge
    if self.bridge_client:
        try:
            await self.bridge_client.disconnect()
        except Exception as e:
            cleanup_errors.append(f"bridge_disconnect: {e}")

    # Update final stats (done before the state reset so it sees the session state)
    self._update_session_stats()

    # Reset lifecycle state: resources are released, so the manager must not
    # keep reporting itself as initialized or as owning a live session.
    self._session_id = None
    self._is_initialized = False

    # Log cleanup errors but don't raise
    if cleanup_errors:
        self.logging_manager.warning(f"Cleanup errors: {'; '.join(cleanup_errors)}")

    self.logging_manager.info("✅ Parser manager cleanup completed")
|
|
265
|
+
|
|
266
|
+
# ==========================================
|
|
267
|
+
# CORE PARSING METHODS
|
|
268
|
+
# ==========================================
|
|
269
|
+
|
|
270
|
+
async def get_html(self, url: str) -> str:
    """Fetch the HTML of ``url`` through the browser manager, with retries.

    Initializes the manager first if that has not happened yet. Each
    successful fetch increments ``pages_processed``.

    Raises:
        OperationError: If the fetch ultimately fails; ``details`` carries
            the URL and the original exception is chained as the cause.
    """
    if not self._is_initialized:
        await self.initialize()

    async def _get_html_with_retry():
        self.logging_manager.url_access(url, "fetching")
        page_html = await self.browser_manager.get_html(url)
        self._stats.pages_processed += 1
        return page_html

    # Wrap the fetch with the error manager's retry policy for "get_html".
    _get_html_with_retry = self.error_manager.with_retry(
        "get_html", self.config.retry_config
    )(_get_html_with_retry)

    try:
        return await _get_html_with_retry()
    except Exception as exc:
        self._stats.total_errors += 1
        raise OperationError(
            message=f"Failed to get HTML from {url}: {exc}",
            operation="get_html",
            details={"url": url},
        ) from exc
|
|
291
|
+
|
|
292
|
+
async def clean_html(self, html: str, **kwargs) -> str:
    """Run ``html`` through the HTML manager and return the cleaned markup.

    Extra keyword arguments are forwarded to the HTML manager unchanged.
    Updates the cleaned-document counter and the accumulated size reduction.

    Raises:
        OperationError: If cleaning or statistics gathering fails; the
            original exception is chained as the cause.
    """
    try:
        self.logging_manager.info(f"🧹 Cleaning HTML: {len(html)} characters")

        cleaned_html = await self.html_manager.clean_html(html, **kwargs)

        # Count the document first, then add its measured reduction to the running total.
        self._stats.html_cleaned_count += 1
        reduction = self.html_manager.get_cleaning_stats(html, cleaned_html)
        self._stats.total_html_reduction += reduction.size_reduction_percent

        summary = (
            f"✅ HTML cleaned: {len(html)} → {len(cleaned_html)} chars "
            f"({reduction.size_reduction_percent:.1f}% reduction)"
        )
        self.logging_manager.info(summary)

        return cleaned_html

    except Exception as exc:
        self._stats.total_errors += 1
        raise OperationError(
            message=f"Failed to clean HTML: {exc}",
            operation="clean_html",
        ) from exc
|
|
317
|
+
|
|
318
|
+
async def analyze_html(
    self,
    html: str,
    instructions: Optional[str] = None,
    **kwargs
) -> dict[str, str]:
    """Send ``html`` to the bridge for parsing and return a string-only summary.

    Recognized keyword arguments: ``timeout`` (defaults to 60) and
    ``metadata`` (defaults to an empty dict).

    Returns:
        A dict with the keys ``success``, ``parsed_data``, ``markdown`` and
        ``error_message``. ``success`` and ``parsed_data`` are produced with
        ``str()``; missing ``markdown``/``error_message`` become ``""``.

    Raises:
        OperationError: If no bridge client is set, or if the bridge call
            fails (original exception chained as the cause).
    """
    if not self.bridge_client:
        raise OperationError(
            message="Bridge client not available for HTML analysis",
            operation="analyze_html",
        )

    try:
        self.logging_manager.info("🤖 Analyzing HTML with LLM...")

        response = await self.bridge_client.parse_html(
            html_content=html,
            instructions=instructions,
            parse_type="general",
            timeout=kwargs.get("timeout", 60),
            metadata=kwargs.get("metadata", {}),
        )

        summary = {
            "success": str(response.success),
            "parsed_data": str(response.parsed_data),
            "markdown": response.markdown or "",
            "error_message": response.error_message or "",
        }
        return summary

    except Exception as exc:
        self._stats.total_errors += 1
        raise OperationError(
            message=f"Failed to analyze HTML: {exc}",
            operation="analyze_html",
        ) from exc
|
|
355
|
+
|
|
356
|
+
async def parse_url(
    self,
    url: str,
    instructions: Optional[str] = None,
    **kwargs
) -> dict[str, str]:
    """Complete parsing workflow: fetch → clean → analyze.

    Args:
        url: Page to fetch.
        instructions: Optional instructions passed to ``analyze_html``.
        **kwargs: Forwarded to both ``clean_html`` and ``analyze_html``.

    Returns:
        The dict returned by ``analyze_html``.

    Raises:
        Exception: Whatever a step raises is re-raised unchanged after the
            operation has been recorded as failed.
    """
    operation = self.result_manager.start_operation()

    try:
        self.logging_manager.start_operation("parse_url")

        # Fetch HTML
        html = await self.get_html(url)

        # Clean HTML
        cleaned_html = await self.clean_html(html, **kwargs)

        # Analyze HTML
        analysis_result = await self.analyze_html(cleaned_html, instructions, **kwargs)

        # analyze_html() builds "success" with str(), which yields "True"/"False"
        # for a bool. Compare case-insensitively so a successful analysis is not
        # recorded as a failure.
        succeeded = analysis_result.get("success", "false").strip().lower() == "true"

        # Complete operation
        self.result_manager.complete_operation(
            data=[],  # Analysis result is returned directly
            source_urls=[url],
            success=succeeded
        )

        self._stats.operations_completed += 1
        self.logging_manager.end_operation("parse_url", operation.duration_seconds)

        return analysis_result

    except Exception as e:
        self.result_manager.complete_operation(
            data=[],
            source_urls=[url],
            success=False,
            error_message=str(e)
        )

        self._stats.operations_failed += 1
        self.logging_manager.fail_operation("parse_url", str(e))
        raise
|
|
400
|
+
|
|
401
|
+
# ==========================================
|
|
402
|
+
# SESSION MANAGEMENT
|
|
403
|
+
# ==========================================
|
|
404
|
+
|
|
405
|
+
async def start_session(self, session_type: str = "parsing") -> str:
    """Open a session through the bridge client and remember its id.

    Args:
        session_type: Label forwarded to the bridge client's
            ``start_session`` call.

    Returns:
        The session id returned by the bridge client.

    Raises:
        OperationError: If no bridge client is set, or if any step of
            starting the session fails (the original error is chained).
    """
    if not self.bridge_client:
        raise OperationError(
            message="Bridge client not available for session management",
            operation="start_session"
        )

    try:
        parser_cfg = self.config.parser_config
        session_metadata = {
            "parser_name": parser_cfg.parser_name,
            "parser_type": parser_cfg.parser_type,
        }
        new_id = await self.bridge_client.start_session(
            session_type=session_type,
            metadata=session_metadata,
        )

        # Keep the manager, its stats and the logger pointing at the same id.
        self._session_id = new_id
        self._stats.session_id = new_id
        self.logging_manager.set_session(new_id)

        self.logging_manager.info(f"📋 Session started: {new_id}")
        return new_id

    except Exception as exc:
        raise OperationError(
            message=f"Failed to start session: {exc}",
            operation="start_session"
        ) from exc
|
|
434
|
+
|
|
435
|
+
async def end_session(self) -> None:
    """Close the active session via the bridge client, if there is one.

    Does nothing when no session id is stored or no bridge client is set.
    A failure while ending the session is logged as a warning and not
    raised; in that case the stored session id is left as it was.
    """
    if not (self._session_id and self.bridge_client):
        return

    try:
        await self.bridge_client.end_session()
        closed_id = self._session_id
        self.logging_manager.info(f"📋 Session ended: {closed_id}")
        # Forget the id on the manager first, then on the stats object.
        self._session_id = self._stats.session_id = None

    except Exception as exc:
        self.logging_manager.warning(f"Failed to end session: {exc}")
|
|
448
|
+
|
|
449
|
+
# ==========================================
|
|
450
|
+
# STATISTICS AND MONITORING
|
|
451
|
+
# ==========================================
|
|
452
|
+
|
|
453
|
+
def get_stats(self) -> ParserStats:
    """Refresh the session figures and return a separate ParserStats object.

    The returned object is rebuilt from a dump of the internal stats rather
    than being the internal object itself.
    """
    self._update_session_stats()
    snapshot = self._stats.model_dump()
    return ParserStats.model_validate(snapshot)
|
|
457
|
+
|
|
458
|
+
def get_manager_stats(self) -> dict[str, dict[str, str]]:
    """Collect the statistics reported by each sub-manager.

    Returns:
        A dict keyed by manager name (``result_manager``, ``error_manager``,
        ``browser_manager``, ``logging_manager``). The browser entry is the
        JSON-mode dump of the object its ``get_stats`` returns.
    """
    collected = {}
    collected["result_manager"] = self.result_manager.get_stats()
    collected["error_manager"] = self.error_manager.get_error_stats()
    browser_stats = self.browser_manager.get_stats()
    collected["browser_manager"] = browser_stats.model_dump(mode='json')
    collected["logging_manager"] = self.logging_manager.get_log_stats()
    return collected
|
|
466
|
+
|
|
467
|
+
async def health_check(self) -> dict[str, str]:
    """Report the parser's health as a flat dict of strings.

    Returns:
        A dict with these keys:

        * ``status``: ``"healthy"``, or ``"degraded"`` when the browser
          health probe raised.
        * ``parser_id`` / ``parser_name``: taken from the parser config.
        * ``initialized`` / ``session_active``: ``str()`` of the
          corresponding booleans.
        * ``browser_status``: the ``"status"`` entry of the browser
          manager's health result (``"unknown"`` if absent), or
          ``"error: <exc>"`` if the probe raised.
        * ``bridge_connected``: ``"true"`` when a bridge client object is
          set, otherwise ``"false"``.
    """
    parser_cfg = self.config.parser_config
    health = {
        "status": "healthy",
        "parser_id": parser_cfg.parser_id,
        "parser_name": parser_cfg.parser_name,
        "initialized": str(self._is_initialized),
        "session_active": str(self._session_id is not None)
    }

    # Check browser health
    try:
        browser_health = await self.browser_manager.health_check()
        health["browser_status"] = browser_health.get("status", "unknown")
    except Exception as e:
        health["browser_status"] = f"error: {e}"
        # A probe that blows up must not leave the overall status reading
        # "healthy"; previously the top-level status was never revised.
        health["status"] = "degraded"

    # Check bridge health (presence of a client object only)
    health["bridge_connected"] = "true" if self.bridge_client else "false"

    return health
|
|
491
|
+
|
|
492
|
+
# ==========================================
|
|
493
|
+
# INTERNAL METHODS
|
|
494
|
+
# ==========================================
|
|
495
|
+
|
|
496
|
+
async def _initialize_bridge(self) -> None:
    """Build the bridge client from the parser config and connect it.

    The client is stored on ``self.bridge_client`` before the connection is
    attempted; the connected flag in the stats is set only after ``connect``
    returns.
    """
    settings = self.config.parser_config
    self.bridge_client = ParserBridgeClient(
        websocket_url=settings.websocket_url,
        parser_type=settings.parser_type,
        api_key=settings.api_key,
    )

    # connect() lives on the object the wrapper exposes as ``bridge_client``.
    inner_client = self.bridge_client.bridge_client
    await inner_client.connect()

    self._stats.bridge_connected = True
    self.logging_manager.info("🔗 Bridge client connected")
|
|
507
|
+
|
|
508
|
+
async def _register_parser(self) -> None:
|
|
509
|
+
"""Register parser with bridge"""
|
|
510
|
+
if not self.bridge_client:
|
|
511
|
+
return
|
|
512
|
+
|
|
513
|
+
parser_info = await self.bridge_client.register_parser(
|
|
514
|
+
metadata={
|
|
515
|
+
"driver_version": "4.0.0",
|
|
516
|
+
"capabilities": "scraping,html_cleaning,llm_integration",
|
|
517
|
+
"managers": "config,result,error,logging,html,browser"
|
|
518
|
+
}
|
|
519
|
+
)
|
|
520
|
+
|
|
521
|
+
# Update parser ID
|
|
522
|
+
self.config.parser_config.parser_id = parser_info.parser_id
|
|
523
|
+
self._stats.parser_id = parser_info.parser_id
|
|
524
|
+
|
|
525
|
+
self.logging_manager.info(f"📝 Parser registered: {parser_info.parser_id}")
|
|
526
|
+
|
|
527
|
+
def _setup_retry_configs(self) -> None:
    """Register the retry policies for ``get_html`` and ``analyze_html``.

    ``get_html`` (navigation) gets 3 attempts with a base delay of 2.0;
    ``analyze_html`` (bridge communication) gets 2 attempts with a base
    delay of 1.0. Each policy lists the exception names it retries on.
    """
    policies = (
        # (operation, max_attempts, base_delay, retry_on_exceptions)
        ("get_html", 3, 2.0, ["NavigationError", "TimeoutError", "ConnectionError"]),
        ("analyze_html", 2, 1.0, ["ConnectionError", "TimeoutError"]),
    )
    for operation, attempts, delay, exception_names in policies:
        policy = RetryConfig(
            max_attempts=attempts,
            base_delay=delay,
            retry_on_exceptions=exception_names,
        )
        self.error_manager.register_retry_config(operation, policy)
|
|
544
|
+
|
|
545
|
+
def _update_session_stats(self) -> None:
|
|
546
|
+
"""Update session statistics"""
|
|
547
|
+
self._stats.session_duration = (datetime.now(timezone.utc) - self._stats.session_start).total_seconds()
|
|
548
|
+
|
|
549
|
+
total_operations = self._stats.operations_completed + self._stats.operations_failed
|
|
550
|
+
if total_operations > 0:
|
|
551
|
+
self._stats.success_rate = (self._stats.operations_completed / total_operations) * 100.0
|
|
552
|
+
|
|
553
|
+
# ==========================================
|
|
554
|
+
# CONTEXT MANAGER SUPPORT
|
|
555
|
+
# ==========================================
|
|
556
|
+
|
|
557
|
+
async def __aenter__(self):
|
|
558
|
+
"""Async context manager entry"""
|
|
559
|
+
await self.initialize()
|
|
560
|
+
return self
|
|
561
|
+
|
|
562
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
563
|
+
"""Async context manager exit"""
|
|
564
|
+
await self.cleanup()
|
|
565
|
+
return False
|
|
566
|
+
|
|
567
|
+
def __repr__(self) -> str:
|
|
568
|
+
return f"<ParserManager(id='{self.config.parser_config.parser_id}', name='{self.config.parser_config.parser_name}')>"
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
# ==========================================
|
|
572
|
+
# CONVENIENCE FUNCTIONS
|
|
573
|
+
# ==========================================
|
|
574
|
+
|
|
575
|
+
def get_parser_manager(
    parser_name: str,
    parser_type: str = "generic",
    **kwargs
) -> ParserManager:
    """
    Build a ParserManager from a parser name and optional overrides

    Args:
        parser_name: Name of the parser; also used for the logging config
        parser_type: Type of parser (generic, ecommerce, news, etc.)
        **kwargs: Extra options. Keys matching ParserConfig fields go to the
            parser config; keys matching ParserManagerConfig fields (other
            than 'parser_config' and 'logging_config') go to the manager
            config. Keys matching neither are dropped.

    Returns:
        Configured ParserManager instance
    """
    parser_overrides = {
        key: value
        for key, value in kwargs.items()
        if key in ParserConfig.model_fields
    }
    parser_config = ParserConfig(
        parser_name=parser_name,
        parser_type=parser_type,
        **parser_overrides
    )

    # Create logging config with parser name
    logging_config = LoggingConfig(parser_name=parser_name)

    # These two are always supplied explicitly below, never from kwargs.
    reserved_keys = ['parser_config', 'logging_config']
    manager_overrides = {
        key: value
        for key, value in kwargs.items()
        if key in ParserManagerConfig.model_fields and key not in reserved_keys
    }
    manager_config = ParserManagerConfig(
        parser_config=parser_config,
        logging_config=logging_config,
        **manager_overrides
    )

    return ParserManager(manager_config)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
async def quick_parse(
    url: str,
    parser_name: str = "QuickParser",
    instructions: Optional[str] = None,
    **kwargs
) -> dict[str, str]:
    """
    One-shot helper: create a manager, parse a single URL, clean up

    Args:
        url: URL to parse
        parser_name: Name for the parser
        instructions: Optional parsing instructions
        **kwargs: Passed both to get_parser_manager and to parse_url

    Returns:
        Parsing result from parse_url
    """
    manager = get_parser_manager(parser_name, **kwargs)
    # The manager's async context handles initialize() and cleanup().
    async with manager as parser:
        result = await parser.parse_url(url, instructions, **kwargs)
    return result
|