unrealon 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.1.dist-info/METADATA +722 -0
- unrealon-1.1.1.dist-info/RECORD +82 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.1.dist-info}/WHEEL +1 -1
- unrealon-1.1.1.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.1.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.9.dist-info/METADATA +0 -810
- unrealon-1.0.9.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
|
@@ -1,810 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: unrealon
|
|
3
|
-
Version: 1.0.9
|
|
4
|
-
Summary: AI-powered web scraping platform with real-time orchestration
|
|
5
|
-
License: MIT
|
|
6
|
-
Author: Unrealon Team
|
|
7
|
-
Author-email: dev@unrealon.com
|
|
8
|
-
Requires-Python: >=3.9
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
-
Requires-Dist: aiohttp (>=3.8.0)
|
|
17
|
-
Requires-Dist: asyncio-extras (>=1.3.0)
|
|
18
|
-
Requires-Dist: beautifulsoup4 (>=4.12.0)
|
|
19
|
-
Requires-Dist: cachetools (>=5.3.0)
|
|
20
|
-
Requires-Dist: click (>=8.1.0)
|
|
21
|
-
Requires-Dist: fast-langdetect (>=0.3.2)
|
|
22
|
-
Requires-Dist: httpx (>=0.23.0,<0.24.0)
|
|
23
|
-
Requires-Dist: langdetect (>=1.0.9)
|
|
24
|
-
Requires-Dist: langid (>=1.1.6)
|
|
25
|
-
Requires-Dist: openai (>=1.0.0)
|
|
26
|
-
Requires-Dist: pathlib2 (>=2.3.0)
|
|
27
|
-
Requires-Dist: playwright (>=1.40.0)
|
|
28
|
-
Requires-Dist: playwright-stealth (>=1.0.5)
|
|
29
|
-
Requires-Dist: pydantic (>=2.0.0)
|
|
30
|
-
Requires-Dist: pydantic-settings (>=2.0.0)
|
|
31
|
-
Requires-Dist: python-dateutil (>=2.8)
|
|
32
|
-
Requires-Dist: python-dotenv (>=1.0.0)
|
|
33
|
-
Requires-Dist: python-socketio (>=5.0)
|
|
34
|
-
Requires-Dist: pyyaml (>=6.0)
|
|
35
|
-
Requires-Dist: questionary (>=2.0.0)
|
|
36
|
-
Requires-Dist: rich (>=13.0.0)
|
|
37
|
-
Requires-Dist: tiktoken (>=0.9.0)
|
|
38
|
-
Requires-Dist: typing-extensions (>=4.0)
|
|
39
|
-
Requires-Dist: websockets (>=10.0)
|
|
40
|
-
Description-Content-Type: text/markdown
|
|
41
|
-
|
|
42
|
-
# UnrealOn SDK
|
|
43
|
-
|
|
44
|
-
**Enterprise-grade web scraping platform with AI-powered automation and real-time orchestration capabilities.**
|
|
45
|
-
|
|
46
|
-
[PyPI version](https://badge.fury.io/py/unrealon)
|
|
47
|
-
[Python 3.9+](https://www.python.org/downloads/)
|
|
48
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
|
49
|
-
|
|
50
|
-
## 🚀 Ready-to-Use Amazon Parser
|
|
51
|
-
|
|
52
|
-
**Get started immediately with our pre-configured Amazon parser:**
|
|
53
|
-
- **[GitHub Repository](https://github.com/markolofsen/unrealon-parser-amazon)** - Complete Amazon parser with all configurations
|
|
54
|
-
- **Zero Setup**: Clone and run with minimal configuration
|
|
55
|
-
- **Production Ready**: Includes all enterprise features and optimizations
|
|
56
|
-
|
|
57
|
-
## Overview
|
|
58
|
-
|
|
59
|
-
UnrealOn SDK is a comprehensive Python framework for building production-ready web scrapers with integrated AI capabilities, real-time monitoring, and enterprise orchestration features. The platform combines traditional web scraping techniques with modern AI-powered extraction, providing developers with a unified solution for data collection at scale.
|
|
60
|
-
|
|
61
|
-
**This SDK is designed to work with the [UnrealOn Server](https://unrealon.com/) - a B2B Commercial Parsing Platform that provides enterprise-grade web scraping infrastructure with anti-bot protection, proxy management, and real-time orchestration capabilities.**
|
|
62
|
-
|
|
63
|
-
**No Vendor Lock-in**: The SDK works both with and without the UnrealOn Server. You can use it as a standalone library for local development and testing, or connect to the server for enterprise features like real-time orchestration, proxy management, and monitoring.
|
|
64
|
-
|
|
65
|
-
**Key Features:**
|
|
66
|
-
- **AI-Powered Extraction**: Automatic selector generation and content analysis
|
|
67
|
-
- **Real-Time Orchestration**: Active Connection-based parser management and monitoring
|
|
68
|
-
- **Enterprise Architecture**: Clean Architecture with modular design patterns
|
|
69
|
-
- **Zero Configuration**: Production-ready defaults with minimal setup
|
|
70
|
-
- **Anti-Detection**: Advanced browser automation with proxy rotation
|
|
71
|
-
|
|
72
|
-
## Why Choose UnrealOn?
|
|
73
|
-
|
|
74
|
-
### 🆚 **Competitive Comparison**
|
|
75
|
-
|
|
76
|
-
| Feature | UnrealOn | Scrapy Cloud | ScrapingBee | Apify | Custom Solutions |
|
|
77
|
-
|---------|-------------|--------------|-------------|-------|------------------|
|
|
78
|
-
| **Real-time Communication** | ✅ Active Connection | ❌ HTTP Polling | ❌ HTTP Polling | ❌ HTTP Polling | ❌ Manual Setup |
|
|
79
|
-
| **Self-hosting** | ✅ Full Control | ❌ Vendor Lock-in | ❌ Vendor Lock-in | ❌ Vendor Lock-in | ✅ Full Control |
|
|
80
|
-
| **Pricing Model** | ✅ Free SDK + LLM costs | ❌ Pay per request | ❌ Pay per request | ❌ Pay per request | ❌ High dev costs |
|
|
81
|
-
| **AI-Powered Extraction** | ✅ Built-in | ❌ Manual | ❌ Manual | ❌ Manual | ❌ Custom AI Dev |
|
|
82
|
-
| **Enterprise Orchestration** | ✅ Active Connection | ❌ Limited | ❌ Limited | ❌ Limited | ❌ Custom Dev |
|
|
83
|
-
| **Proxy Management** | ✅ Auto-rotation | ✅ Managed | ✅ Managed | ✅ Managed | ❌ Manual Setup |
|
|
84
|
-
| **Data Ownership** | ✅ Your Servers | ❌ Their Servers | ❌ Their Servers | ❌ Their Servers | ✅ Your Servers |
|
|
85
|
-
| **Development Time** | ✅ Days | ❌ Weeks | ❌ Weeks | ❌ Weeks | ❌ Months |
|
|
86
|
-
| **Maintenance Overhead** | ✅ Minimal | ❌ High | ❌ High | ❌ High | ❌ Very High |
|
|
87
|
-
|
|
88
|
-
### 💎 **Key Advantages**
|
|
89
|
-
|
|
90
|
-
#### **1. No Vendor Lock-in**
|
|
91
|
-
**Problem**: Scrapy Cloud, Apify lock you into their platforms
|
|
92
|
-
**Solution**: Same code runs locally, self-hosted, or managed
|
|
93
|
-
```python
|
|
94
|
-
# This exact code works everywhere:
|
|
95
|
-
class MyParser(BaseParser):
|
|
96
|
-
async def run_once(self, **kwargs):
|
|
97
|
-
return await self.parse_data()
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
#### **2. Predictable Economics**
|
|
101
|
-
**Problem**: Per-request pricing becomes expensive at scale
|
|
102
|
-
**Solution**: Flat monitoring costs with unlimited requests
|
|
103
|
-
```
|
|
104
|
-
Traditional: Pay per request (expensive at scale)
|
|
105
|
-
UnrealOn: Free + optional monitoring
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
**Note**: SDK is completely free. You only pay for LLM services (OpenRouter, OpenAI, etc.) using your own API keys.
|
|
109
|
-
|
|
110
|
-
#### **3. Developer Experience Excellence**
|
|
111
|
-
**Problem**: Manual CLI development, boilerplate code
|
|
112
|
-
**Solution**: Auto-generated everything with clean architecture
|
|
113
|
-
```python
|
|
114
|
-
# Write parser logic only - CLI generated automatically
|
|
115
|
-
# TypeScript clients generated from OpenAPI
|
|
116
|
-
# Active Connection handlers created from decorators
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
#### **4. Real-time Capabilities**
|
|
120
|
-
**Problem**: Batch processing with delayed results
|
|
121
|
-
**Solution**: Instant command execution with live monitoring
|
|
122
|
-
```javascript
|
|
123
|
-
// Active Connection provides real-time updates
|
|
124
|
-
socket.on('parser_status', (data) => {
|
|
125
|
-
console.log(`Parser ${data.parser_id}: ${data.status}`);
|
|
126
|
-
});
|
|
127
|
-
```
|
|
128
|
-
|
|
129
|
-
#### **5. Enterprise-Grade Architecture**
|
|
130
|
-
**Problem**: Simple tools don't scale to enterprise needs
|
|
131
|
-
**Solution**: Clean Architecture with production patterns
|
|
132
|
-
- Multi-tenant authentication
|
|
133
|
-
- Audit trails and compliance
|
|
134
|
-
- Horizontal auto-scaling
|
|
135
|
-
- Performance monitoring
|
|
136
|
-
- Graceful error handling
|
|
137
|
-
|
|
138
|
-
## Architecture
|
|
139
|
-
|
|
140
|
-
### Core Components
|
|
141
|
-
|
|
142
|
-
```
|
|
143
|
-
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
144
|
-
│ Parser SDK │◄──►│ UnrealOn │◄──►│ Target │
|
|
145
|
-
│ (Client) │ │ Server │ │ Websites │
|
|
146
|
-
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
147
|
-
│ │ │
|
|
148
|
-
▼ ▼ ▼
|
|
149
|
-
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
150
|
-
│ AI Services │ │ Database & │ │ Proxy & │
|
|
151
|
-
│ (LLM) │ │ Cache │ │ Browser │
|
|
152
|
-
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
### Module Structure
|
|
156
|
-
|
|
157
|
-
- **`unrealon_sdk`**: Core server communication and client SDK
|
|
158
|
-
- **`unrealon_browser`**: Browser automation with anti-detection capabilities
|
|
159
|
-
- **`unrealon_driver`**: Parser development tools
|
|
160
|
-
- **`unrealon_llm`**: AI-powered extraction and content analysis
|
|
161
|
-
|
|
162
|
-
**Note**: The SDK connects to the **UnrealOn Server** platform for real-time orchestration.
|
|
163
|
-
|
|
164
|
-
## Installation
|
|
165
|
-
|
|
166
|
-
### Prerequisites
|
|
167
|
-
|
|
168
|
-
- Python 3.9 or higher
|
|
169
|
-
- Poetry for dependency management
|
|
170
|
-
- Valid API keys for LLM services (OpenRouter, OpenAI, etc.)
|
|
171
|
-
|
|
172
|
-
### Installation
|
|
173
|
-
|
|
174
|
-
```bash
|
|
175
|
-
# Install with Poetry
|
|
176
|
-
poetry add unrealon
|
|
177
|
-
|
|
178
|
-
# Install development dependencies
|
|
179
|
-
poetry add --group dev unrealon[dev]
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
## Configuration
|
|
183
|
-
|
|
184
|
-
### Environment Variables
|
|
185
|
-
|
|
186
|
-
Create a `config.env` file in your project directory:
|
|
187
|
-
|
|
188
|
-
```bash
|
|
189
|
-
# System Paths
|
|
190
|
-
UNREALON_SYSTEM_DIR=system
|
|
191
|
-
UNREALON_BROWSER_PROFILE_DIR=system/browser_profiles
|
|
192
|
-
|
|
193
|
-
# API Keys
|
|
194
|
-
UNREALON_OPENROUTER_API_KEY=sk-or-v1-your-openrouter-key
|
|
195
|
-
UNREALON_SERVER_URL=wss://api.unrealon.com
|
|
196
|
-
UNREALON_API_KEY=up_dev_your-api-key
|
|
197
|
-
|
|
198
|
-
# Runtime Limits
|
|
199
|
-
UNREALON_LLM_DAILY_LIMIT=1.0
|
|
200
|
-
UNREALON_MAX_PAGES=2
|
|
201
|
-
|
|
202
|
-
# Browser Settings
|
|
203
|
-
UNREALON_BROWSER_HEADLESS=true
|
|
204
|
-
UNREALON_BROWSER_TIMEOUT=30
|
|
205
|
-
UNREALON_SAVE_SCREENSHOTS=false
|
|
206
|
-
|
|
207
|
-
# Logging Settings
|
|
208
|
-
UNREALON_LOG_LEVEL=INFO
|
|
209
|
-
UNREALON_LOG_TO_FILE=true
|
|
210
|
-
```
|
|
211
|
-
|
|
212
|
-
### Custom Configuration
|
|
213
|
-
|
|
214
|
-
Extend the base configuration for project-specific settings:
|
|
215
|
-
|
|
216
|
-
```python
|
|
217
|
-
import os
|
|
218
|
-
from pathlib import Path
|
|
219
|
-
from typing import Optional
|
|
220
|
-
from pydantic import Field
|
|
221
|
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
222
|
-
|
|
223
|
-
from unrealon_driver.src.config.auto_config import AutoConfig
|
|
224
|
-
from unrealon_driver.src.dto.services import DriverBrowserConfig
|
|
225
|
-
from unrealon_driver.src.dto.execution import DaemonModeConfig
|
|
226
|
-
from unrealon_driver.src.dto.cli import ParserInstanceConfig
|
|
227
|
-
|
|
228
|
-
# Paths
|
|
229
|
-
THIS_DIR = Path(__file__).resolve().parent
|
|
230
|
-
SYSTEM_DIR = THIS_DIR / "system"
|
|
231
|
-
|
|
232
|
-
# Parser instance configuration
|
|
233
|
-
parser_instance_config = ParserInstanceConfig(
|
|
234
|
-
parser_id="my_parser",
|
|
235
|
-
parser_name="My Custom Parser",
|
|
236
|
-
description="Custom parser with specific requirements"
|
|
237
|
-
)
|
|
238
|
-
|
|
239
|
-
class ParserSettings(BaseSettings):
|
|
240
|
-
"""Environment settings from config.env"""
|
|
241
|
-
|
|
242
|
-
model_config = SettingsConfigDict(
|
|
243
|
-
env_file=THIS_DIR / "config.env",
|
|
244
|
-
env_file_encoding="utf-8",
|
|
245
|
-
extra="ignore",
|
|
246
|
-
env_prefix="UNREALON_",
|
|
247
|
-
case_sensitive=False,
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
# System Paths
|
|
251
|
-
SYSTEM_DIR: str = Field(default="system")
|
|
252
|
-
BROWSER_PROFILE_DIR: str = Field(default="system/browser_profiles")
|
|
253
|
-
|
|
254
|
-
# API Keys
|
|
255
|
-
OPENROUTER_API_KEY: str
|
|
256
|
-
SERVER_URL: str
|
|
257
|
-
API_KEY: str
|
|
258
|
-
|
|
259
|
-
# Runtime Limits
|
|
260
|
-
LLM_DAILY_LIMIT: float = Field(default=1.0)
|
|
261
|
-
MAX_PAGES: int = Field(default=2)
|
|
262
|
-
|
|
263
|
-
# Browser Settings
|
|
264
|
-
BROWSER_HEADLESS: bool = Field(default=False)
|
|
265
|
-
BROWSER_TIMEOUT: int = Field(default=30)
|
|
266
|
-
SAVE_SCREENSHOTS: bool = Field(default=False)
|
|
267
|
-
|
|
268
|
-
# Logging Settings
|
|
269
|
-
LOG_LEVEL: str = Field(default="INFO")
|
|
270
|
-
LOG_TO_FILE: bool = Field(default=True)
|
|
271
|
-
|
|
272
|
-
# Load settings globally
|
|
273
|
-
parser_settings = ParserSettings()
|
|
274
|
-
|
|
275
|
-
class CustomAutoConfig(AutoConfig):
|
|
276
|
-
"""Custom AutoConfig that extends driver config."""
|
|
277
|
-
|
|
278
|
-
def __init__(self):
|
|
279
|
-
super().__init__()
|
|
280
|
-
self.parser_id = parser_instance_config.parser_id
|
|
281
|
-
|
|
282
|
-
# Force custom system directory
|
|
283
|
-
self.system_dir = SYSTEM_DIR
|
|
284
|
-
self.project_root = THIS_DIR.parent
|
|
285
|
-
|
|
286
|
-
# Ensure system directories exist
|
|
287
|
-
SYSTEM_DIR.mkdir(exist_ok=True)
|
|
288
|
-
for dir in ["logs", "results", "browser_profiles"]:
|
|
289
|
-
(SYSTEM_DIR / dir).mkdir(exist_ok=True)
|
|
290
|
-
|
|
291
|
-
# Reinitialize configs with custom paths
|
|
292
|
-
self._initialize_configs()
|
|
293
|
-
|
|
294
|
-
def _create_browser_config(self):
|
|
295
|
-
"""Override browser config with custom settings from config.env."""
|
|
296
|
-
return DriverBrowserConfig(
|
|
297
|
-
parser_id=self.parser_id,
|
|
298
|
-
headless=parser_settings.BROWSER_HEADLESS,
|
|
299
|
-
timeout=parser_settings.BROWSER_TIMEOUT,
|
|
300
|
-
user_data_dir=str(SYSTEM_DIR / "browser_profiles"),
|
|
301
|
-
page_load_strategy="normal",
|
|
302
|
-
wait_for_selector_timeout=10,
|
|
303
|
-
network_idle_timeout=3,
|
|
304
|
-
enable_javascript=True,
|
|
305
|
-
enable_images=True,
|
|
306
|
-
enable_css=True,
|
|
307
|
-
debug_mode=False,
|
|
308
|
-
save_screenshots=parser_settings.SAVE_SCREENSHOTS,
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
def _create_llm_config(self):
|
|
312
|
-
"""Override LLM config with custom settings from config.env."""
|
|
313
|
-
config = super()._create_llm_config()
|
|
314
|
-
|
|
315
|
-
# Force custom LLM settings from config.env
|
|
316
|
-
config.provider = "openrouter"
|
|
317
|
-
config.model = "anthropic/claude-3.5-sonnet"
|
|
318
|
-
config.api_key = parser_settings.OPENROUTER_API_KEY
|
|
319
|
-
config.enable_caching = True
|
|
320
|
-
|
|
321
|
-
return config
|
|
322
|
-
|
|
323
|
-
def _create_daemon_config(self):
|
|
324
|
-
"""Override daemon config with custom settings."""
|
|
325
|
-
return DaemonModeConfig(
|
|
326
|
-
server_url=parser_settings.SERVER_URL,
|
|
327
|
-
api_key=parser_settings.API_KEY,
|
|
328
|
-
auto_reconnect=True,
|
|
329
|
-
connection_timeout=30,
|
|
330
|
-
heartbeat_interval=30,
|
|
331
|
-
max_reconnect_attempts=3,
|
|
332
|
-
health_check_interval=60,
|
|
333
|
-
enable_metrics=True,
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
# Global custom config instance
|
|
337
|
-
custom_config = CustomAutoConfig()
|
|
338
|
-
```
|
|
339
|
-
|
|
340
|
-
### AI-Powered Parser Implementation
|
|
341
|
-
|
|
342
|
-
```python
|
|
343
|
-
from unrealon_driver.src.core.parser import Parser
|
|
344
|
-
from custom_config import custom_config, parser_instance_config
|
|
345
|
-
|
|
346
|
-
class ProductExtractor(Parser):
|
|
347
|
-
"""AI-powered product data extractor."""
|
|
348
|
-
|
|
349
|
-
def __init__(self):
|
|
350
|
-
# Pass the extended config directly to Parser
|
|
351
|
-
super().__init__(
|
|
352
|
-
parser_id=parser_instance_config.parser_id,
|
|
353
|
-
parser_name="Product Extractor",
|
|
354
|
-
config=custom_config, # Pass the whole extended config!
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
async def extract_products(self, url: str) -> dict:
|
|
358
|
-
"""Extract product information from a listing page."""
|
|
359
|
-
try:
|
|
360
|
-
# AI-powered extraction with automatic selector generation
|
|
361
|
-
result = await self.browser_llm.extract_listing(url)
|
|
362
|
-
|
|
363
|
-
return {
|
|
364
|
-
"products": result.data,
|
|
365
|
-
"cost_usd": result.cost_usd,
|
|
366
|
-
"processing_time": result.total_duration_seconds,
|
|
367
|
-
"success": True
|
|
368
|
-
}
|
|
369
|
-
except Exception as e:
|
|
370
|
-
self.logger.error(f"Extraction failed: {e}")
|
|
371
|
-
return {"success": False, "error": str(e)}
|
|
372
|
-
|
|
373
|
-
async def extract_complete_data(self, listing_url: str) -> dict:
|
|
374
|
-
"""Extract complete product data including details."""
|
|
375
|
-
try:
|
|
376
|
-
# Extract listing page data
|
|
377
|
-
listing_result = await self.browser_llm.extract_listing(listing_url)
|
|
378
|
-
|
|
379
|
-
if not listing_result.data or not listing_result.data.get("products"):
|
|
380
|
-
return {"success": False, "error": "No products found"}
|
|
381
|
-
|
|
382
|
-
# Extract details for first product
|
|
383
|
-
first_product = listing_result.data["products"][0]
|
|
384
|
-
details_result = await self.browser_llm.extract_details(
|
|
385
|
-
first_product["url"]
|
|
386
|
-
)
|
|
387
|
-
|
|
388
|
-
return {
|
|
389
|
-
"listing_data": listing_result.data,
|
|
390
|
-
"details_data": details_result.data,
|
|
391
|
-
"total_cost": listing_result.cost_usd + details_result.cost_usd,
|
|
392
|
-
"processing_time": listing_result.total_duration_seconds,
|
|
393
|
-
"success": True
|
|
394
|
-
}
|
|
395
|
-
except Exception as e:
|
|
396
|
-
self.logger.error(f"Advanced extraction failed: {e}")
|
|
397
|
-
return {"success": False, "error": str(e)}
|
|
398
|
-
|
|
399
|
-
# Usage
|
|
400
|
-
async def main():
|
|
401
|
-
extractor = ProductExtractor()
|
|
402
|
-
await extractor.setup()
|
|
403
|
-
|
|
404
|
-
# Simple extraction
|
|
405
|
-
result = await extractor.extract_products("https://example.com/products")
|
|
406
|
-
print(f"Extracted {len(result.get('products', []))} products")
|
|
407
|
-
print(f"Cost: ${result.get('cost_usd', 0):.4f}")
|
|
408
|
-
|
|
409
|
-
# Complete extraction with details
|
|
410
|
-
complete_result = await extractor.extract_complete_data("https://example.com/products")
|
|
411
|
-
print(f"Complete extraction cost: ${complete_result.get('total_cost', 0):.4f}")
|
|
412
|
-
|
|
413
|
-
await extractor.cleanup()
|
|
414
|
-
|
|
415
|
-
if __name__ == "__main__":
|
|
416
|
-
import asyncio
|
|
417
|
-
asyncio.run(main())
|
|
418
|
-
```
|
|
419
|
-
|
|
420
|
-
### AI Extraction Results
|
|
421
|
-
|
|
422
|
-
The AI-powered extraction generates comprehensive results including selectors, documentation, and processing metrics:
|
|
423
|
-
|
|
424
|
-
#### **Extraction Result Example**
|
|
425
|
-
|
|
426
|
-
```json
|
|
427
|
-
{
|
|
428
|
-
"data": {
|
|
429
|
-
"extraction_result": {
|
|
430
|
-
"detected_item_type": "product_listing",
|
|
431
|
-
"confidence": 0.95,
|
|
432
|
-
"selectors": {
|
|
433
|
-
"items_container": [
|
|
434
|
-
"div.s-main-slot.s-result-list",
|
|
435
|
-
"div.s-result-item",
|
|
436
|
-
"div.sg-col-20-of-24"
|
|
437
|
-
],
|
|
438
|
-
"product_title": [
|
|
439
|
-
"h2.a-size-medium",
|
|
440
|
-
"span.a-size-base-plus",
|
|
441
|
-
"a.a-link-normal.s-link-style h2"
|
|
442
|
-
],
|
|
443
|
-
"product_price": [
|
|
444
|
-
"span.a-price",
|
|
445
|
-
"span.a-offscreen",
|
|
446
|
-
"span.a-price-whole"
|
|
447
|
-
],
|
|
448
|
-
"product_image": [
|
|
449
|
-
"img.s-image",
|
|
450
|
-
"div.a-section.aok-relative.s-image-fixed-height img"
|
|
451
|
-
],
|
|
452
|
-
"product_url": [
|
|
453
|
-
"a.a-link-normal.s-no-outline",
|
|
454
|
-
"a.a-link-normal.s-underline-text"
|
|
455
|
-
],
|
|
456
|
-
"ratings": [
|
|
457
|
-
"i.a-icon.a-icon-star-mini",
|
|
458
|
-
"span.a-icon-alt"
|
|
459
|
-
],
|
|
460
|
-
"reviews_count": [
|
|
461
|
-
"span.a-size-base.s-underline-text"
|
|
462
|
-
],
|
|
463
|
-
"pagination": [
|
|
464
|
-
"ul.a-pagination",
|
|
465
|
-
"span.s-pagination-strip"
|
|
466
|
-
]
|
|
467
|
-
}
|
|
468
|
-
}
|
|
469
|
-
},
|
|
470
|
-
"url": "https://example.com/search?q=products",
|
|
471
|
-
"success": true,
|
|
472
|
-
"cost_usd": 0.080241,
|
|
473
|
-
"processing_time": 50.79724
|
|
474
|
-
}
|
|
475
|
-
```
|
|
476
|
-
|
|
477
|
-
#### **Auto-Generated Documentation**
|
|
478
|
-
|
|
479
|
-
The AI generates comprehensive documentation for the extraction pattern:
|
|
480
|
-
|
|
481
|
-
```markdown
|
|
482
|
-
# Product Listing Extraction
|
|
483
|
-
|
|
484
|
-
## Overview
|
|
485
|
-
This pattern extracts product listings from search results pages.
|
|
486
|
-
|
|
487
|
-
## Key Elements
|
|
488
|
-
- Product containers use s-result-item class
|
|
489
|
-
- Titles are in h2 tags with a-size-medium class
|
|
490
|
-
- Prices use span.a-price structure
|
|
491
|
-
- Images in s-image class
|
|
492
|
-
- Ratings use star icon classes
|
|
493
|
-
|
|
494
|
-
## Extraction Tips
|
|
495
|
-
1. Handle sponsored and organic listings
|
|
496
|
-
2. Extract both main and sale prices
|
|
497
|
-
3. Get review counts when available
|
|
498
|
-
4. Follow pagination for complete results
|
|
499
|
-
5. Handle variable layouts and responsive design
|
|
500
|
-
|
|
501
|
-
## Common Patterns
|
|
502
|
-
- Product grid layout
|
|
503
|
-
- Sponsored product placement
|
|
504
|
-
- Price range variations
|
|
505
|
-
- Rating and review counts
|
|
506
|
-
- Multiple image sizes
|
|
507
|
-
```
|
|
508
|
-
|
|
509
|
-
#### **Performance Metrics**
|
|
510
|
-
|
|
511
|
-
- **Confidence**: 95% model-reported confidence in the generated selectors
|
|
512
|
-
- **HTML Optimization**: 65% size reduction (1.6MB → 554KB)
|
|
513
|
-
- **Token Savings**: 66% reduction in LLM tokens
|
|
514
|
-
- **Processing Time**: ~50 seconds total (of which 18s browser + 25s LLM)
|
|
515
|
-
- **LLM Cost**: $0.08 per extraction (paid by the developer directly to the LLM provider)
|
|
516
|
-
|
|
517
|
-
**Note**: The SDK is free to use. LLM costs are paid directly to providers (OpenRouter, OpenAI, etc.) using your own API keys.
|
|
518
|
-
|
|
519
|
-
### Traditional BeautifulSoup Parsing
|
|
520
|
-
|
|
521
|
-
```python
|
|
522
|
-
from bs4 import BeautifulSoup
|
|
523
|
-
from unrealon_driver.src.core.parser import Parser
|
|
524
|
-
from custom_config import custom_config, parser_instance_config
|
|
525
|
-
|
|
526
|
-
class TraditionalParser(Parser):
|
|
527
|
-
"""Traditional parser using BeautifulSoup with browser automation."""
|
|
528
|
-
|
|
529
|
-
def __init__(self, **kwargs):
|
|
530
|
-
super().__init__(
|
|
531
|
-
parser_id=parser_instance_config.parser_id,
|
|
532
|
-
parser_name=parser_instance_config.parser_name,
|
|
533
|
-
**kwargs,
|
|
534
|
-
)
|
|
535
|
-
|
|
536
|
-
async def parse_listing_page(self, url: str) -> dict:
|
|
537
|
-
"""Parse listing page using BeautifulSoup and CSS selectors."""
|
|
538
|
-
try:
|
|
539
|
-
# Get HTML content using browser service (includes proxy rotation)
|
|
540
|
-
html_content = await self.browser_llm.browser_service.get_html(url)
|
|
541
|
-
|
|
542
|
-
# Parse with BeautifulSoup
|
|
543
|
-
soup = BeautifulSoup(html_content, "html.parser")
|
|
544
|
-
|
|
545
|
-
# Extract products using CSS selectors
|
|
546
|
-
products = []
|
|
547
|
-
for item in soup.select("div.product-item"):
|
|
548
|
-
product = {
|
|
549
|
-
"title": self._extract_text(item, "h2.product-title"),
|
|
550
|
-
"price": self._extract_text(item, "span.product-price"),
|
|
551
|
-
"image_url": self._extract_attribute(item, "img.product-image", "src"),
|
|
552
|
-
"product_url": self._extract_attribute(item, "a.product-link", "href"),
|
|
553
|
-
"rating": self._extract_text(item, "span.product-rating"),
|
|
554
|
-
"review_count": self._extract_text(item, "span.review-count")
|
|
555
|
-
}
|
|
556
|
-
products.append(product)
|
|
557
|
-
|
|
558
|
-
return {
|
|
559
|
-
"products": products,
|
|
560
|
-
"total_count": len(products),
|
|
561
|
-
"success": True
|
|
562
|
-
}
|
|
563
|
-
except Exception as e:
|
|
564
|
-
self.logger.error(f"Parsing failed: {e}")
|
|
565
|
-
return {"success": False, "error": str(e)}
|
|
566
|
-
|
|
567
|
-
def _extract_text(self, element, selector: str) -> str:
|
|
568
|
-
"""Extract text content from element using CSS selector."""
|
|
569
|
-
found = element.select_one(selector)
|
|
570
|
-
return found.get_text(strip=True) if found else None
|
|
571
|
-
|
|
572
|
-
def _extract_attribute(self, element, selector: str, attribute: str) -> str:
|
|
573
|
-
"""Extract attribute value from element using CSS selector."""
|
|
574
|
-
found = element.select_one(selector)
|
|
575
|
-
return found.get(attribute) if found else None
|
|
576
|
-
```
|
|
577
|
-
|
|
578
|
-
### Scheduled Parser with CLI
|
|
579
|
-
|
|
580
|
-
```python
|
|
581
|
-
from unrealon_driver.src.cli.simple import SimpleParser
|
|
582
|
-
from unrealon_driver.src.core.parser import Parser
|
|
583
|
-
from custom_config import parser_instance_config
|
|
584
|
-
|
|
585
|
-
class ScheduledProductParser(Parser):
|
|
586
|
-
"""Parser with scheduling capabilities."""
|
|
587
|
-
|
|
588
|
-
def __init__(self, **kwargs):
|
|
589
|
-
super().__init__(
|
|
590
|
-
parser_id=parser_instance_config.parser_id,
|
|
591
|
-
parser_name=parser_instance_config.parser_name,
|
|
592
|
-
**kwargs,
|
|
593
|
-
)
|
|
594
|
-
|
|
595
|
-
async def parse(self) -> dict:
|
|
596
|
-
"""Main parsing method for scheduled execution."""
|
|
597
|
-
try:
|
|
598
|
-
# Your parsing logic here
|
|
599
|
-
result = await self.extract_products("https://example.com/products")
|
|
600
|
-
return {"success": True, "data": result}
|
|
601
|
-
except Exception as e:
|
|
602
|
-
return {"success": False, "error": str(e)}
|
|
603
|
-
|
|
604
|
-
class SchedulerWrapper(SimpleParser):
|
|
605
|
-
"""SimpleParser wrapper for ScheduledProductParser."""
|
|
606
|
-
|
|
607
|
-
def __init__(self):
|
|
608
|
-
super().__init__(parser_instance_config)
|
|
609
|
-
self.parser = ScheduledProductParser()
|
|
610
|
-
|
|
611
|
-
async def setup(self) -> None:
|
|
612
|
-
"""Setup - delegate to actual parser."""
|
|
613
|
-
await self.parser.setup()
|
|
614
|
-
|
|
615
|
-
async def cleanup(self) -> None:
|
|
616
|
-
"""Cleanup - delegate to actual parser."""
|
|
617
|
-
await self.parser.cleanup()
|
|
618
|
-
await super().cleanup()
|
|
619
|
-
|
|
620
|
-
async def parse_data(self) -> dict:
|
|
621
|
-
"""Parse data - delegate to actual parser."""
|
|
622
|
-
result = await self.parser.parse()
|
|
623
|
-
return {"success": result.get("success", False), "data": result}
|
|
624
|
-
|
|
625
|
-
# Run with scheduling capabilities
|
|
626
|
-
if __name__ == "__main__":
|
|
627
|
-
scheduler = SchedulerWrapper()
|
|
628
|
-
cli = scheduler.create_click_cli()
|
|
629
|
-
cli() # Provides: scheduled, test, daemon, interactive modes
|
|
630
|
-
```
|
|
631
|
-
|
|
632
|
-
### Daemon Mode for Real-Time Processing
|
|
633
|
-
|
|
634
|
-
```python
|
|
635
|
-
from unrealon_driver.src.core.parser import Parser
|
|
636
|
-
from custom_config import custom_config, parser_instance_config
|
|
637
|
-
|
|
638
|
-
class DaemonParser(Parser):
|
|
639
|
-
"""Parser running in daemon mode for real-time command processing."""
|
|
640
|
-
|
|
641
|
-
def __init__(self, **kwargs):
|
|
642
|
-
super().__init__(
|
|
643
|
-
parser_id=parser_instance_config.parser_id,
|
|
644
|
-
parser_name=parser_instance_config.parser_name,
|
|
645
|
-
**kwargs,
|
|
646
|
-
)
|
|
647
|
-
|
|
648
|
-
async def parse(self) -> dict:
|
|
649
|
-
"""Main parsing method for daemon mode."""
|
|
650
|
-
return {"success": True, "status": "daemon_running"}
|
|
651
|
-
|
|
652
|
-
class DaemonService:
|
|
653
|
-
"""Daemon service with Active Connection connectivity."""
|
|
654
|
-
|
|
655
|
-
def __init__(self):
|
|
656
|
-
self.parser = None
|
|
657
|
-
|
|
658
|
-
async def start_daemon(self):
|
|
659
|
-
"""Start daemon mode with Active Connection connectivity."""
|
|
660
|
-
try:
|
|
661
|
-
# Initialize parser with custom config
|
|
662
|
-
self.parser = DaemonParser()
|
|
663
|
-
await self.parser.setup()
|
|
664
|
-
|
|
665
|
-
# Daemon configuration from custom_config
|
|
666
|
-
daemon_config = {}
|
|
667
|
-
if hasattr(custom_config, 'daemon_config') and custom_config.daemon_config:
|
|
668
|
-
daemon_config = {
|
|
669
|
-
"server": custom_config.daemon_config.server_url,
|
|
670
|
-
"api_key": custom_config.daemon_config.api_key,
|
|
671
|
-
"heartbeat_interval": custom_config.daemon_config.heartbeat_interval,
|
|
672
|
-
"reconnect_attempts": custom_config.daemon_config.max_reconnect_attempts,
|
|
673
|
-
}
|
|
674
|
-
|
|
675
|
-
# Start daemon mode
|
|
676
|
-
await self.parser.daemon(**daemon_config)
|
|
677
|
-
|
|
678
|
-
except Exception as e:
|
|
679
|
-
print(f"Daemon error: {e}")
|
|
680
|
-
return False
|
|
681
|
-
finally:
|
|
682
|
-
if self.parser:
|
|
683
|
-
await self.parser.cleanup()
|
|
684
|
-
|
|
685
|
-
return True
|
|
686
|
-
|
|
687
|
-
# Run in daemon mode
|
|
688
|
-
async def run_daemon():
|
|
689
|
-
daemon = DaemonService()
|
|
690
|
-
await daemon.start_daemon()
|
|
691
|
-
|
|
692
|
-
if __name__ == "__main__":
|
|
693
|
-
import asyncio
|
|
694
|
-
asyncio.run(run_daemon())
|
|
695
|
-
```
|
|
696
|
-
|
|
697
|
-
## Real-Time Orchestration
|
|
698
|
-
|
|
699
|
-
### Active Connection Communication
|
|
700
|
-
|
|
701
|
-
The SDK establishes a persistent Active Connection with the UnrealOn Server for real-time communication:
|
|
702
|
-
|
|
703
|
-
```python
|
|
704
|
-
class OrchestratedParser(Parser):
|
|
705
|
-
"""Parser with real-time command processing capabilities."""
|
|
706
|
-
|
|
707
|
-
@self.client.on_command("parse_products")
|
|
708
|
-
async def handle_parse_command(self, command):
|
|
709
|
-
"""Handle remote parse commands from server."""
|
|
710
|
-
query = command.data.get("query")
|
|
711
|
-
pages = command.data.get("pages", 1)
|
|
712
|
-
|
|
713
|
-
results = []
|
|
714
|
-
for page in range(pages):
|
|
715
|
-
url = f"https://example.com/search?q={query}&page={page}"
|
|
716
|
-
result = await self.parse_products_page(url)
|
|
717
|
-
results.append(result)
|
|
718
|
-
|
|
719
|
-
return {"success": True, "products": results}
|
|
720
|
-
```
|
|
721
|
-
|
|
722
|
-
### Monitoring and Management
|
|
723
|
-
|
|
724
|
-
- **Live Status Monitoring**: Real-time parser status and health checks
|
|
725
|
-
- **Performance Metrics**: Response times, success rates, error tracking
|
|
726
|
-
- **Cost Analytics**: LLM usage monitoring and cost optimization
|
|
727
|
-
- **Proxy Management**: Automatic rotation and health monitoring
|
|
728
|
-
- **Log Streaming**: Real-time log analysis and debugging
|
|
729
|
-
|
|
730
|
-
## Enterprise Features
|
|
731
|
-
|
|
732
|
-
### Security and Compliance
|
|
733
|
-
|
|
734
|
-
- **API Key Authentication**: Secure authentication with role-based access
|
|
735
|
-
- **Data Encryption**: End-to-end encryption for sensitive data
|
|
736
|
-
- **Audit Logging**: Comprehensive audit trails for compliance
|
|
737
|
-
- **Rate Limiting**: Built-in rate limiting and abuse prevention
|
|
738
|
-
|
|
739
|
-
### Scalability and Performance
|
|
740
|
-
|
|
741
|
-
- **Horizontal Scaling**: Add parser instances without code changes
|
|
742
|
-
- **Load Balancing**: Automatic distribution of parsing tasks
|
|
743
|
-
- **Caching**: Intelligent caching for improved performance
|
|
744
|
-
- **Failover**: Automatic failover and recovery mechanisms
|
|
745
|
-
|
|
746
|
-
### Development Tools
|
|
747
|
-
|
|
748
|
-
- **Testing Framework**: Built-in testing utilities and fixtures
|
|
749
|
-
- **Documentation**: Auto-generated API documentation
|
|
750
|
-
|
|
751
|
-
## Examples and Use Cases
|
|
752
|
-
|
|
753
|
-
### E-commerce Data Extraction
|
|
754
|
-
|
|
755
|
-
- **Product Listings**: Extract product information from search results
|
|
756
|
-
- **Price Monitoring**: Track price changes and availability
|
|
757
|
-
- **Review Analysis**: Collect and analyze customer reviews
|
|
758
|
-
- **Inventory Tracking**: Monitor stock levels and product availability
|
|
759
|
-
|
|
760
|
-
### Financial Data Collection
|
|
761
|
-
|
|
762
|
-
- **Market Data**: Extract stock prices and market information
|
|
763
|
-
- **News Analysis**: Collect and analyze financial news
|
|
764
|
-
- **Economic Indicators**: Monitor economic data and trends
|
|
765
|
-
|
|
766
|
-
### Research and Analytics
|
|
767
|
-
|
|
768
|
-
- **Competitive Intelligence**: Monitor competitor activities
|
|
769
|
-
- **Market Research**: Collect market data and insights
|
|
770
|
-
- **Academic Research**: Support research data collection
|
|
771
|
-
|
|
772
|
-
## Real Projects Built on UnrealOn
|
|
773
|
-
|
|
774
|
-
### 🚗 **CarAPIs** - Automotive Data Platform
|
|
775
|
-
**Platform**: [carapis.com](https://carapis.com)
|
|
776
|
-
**Use Case**: Vehicle information extraction from dealerships and marketplaces
|
|
777
|
-
**Features**: Real-time car listings, pricing analysis, market trends
|
|
778
|
-
**Technology**: AI-powered vehicle data extraction with 95% accuracy
|
|
779
|
-
|
|
780
|
-
### 🛒 **ShopAPIs** - E-commerce Intelligence
|
|
781
|
-
**Platform**: [shopapis.com](https://shopapis.com)
|
|
782
|
-
**Use Case**: Product monitoring and competitive analysis
|
|
783
|
-
**Features**: Price tracking, inventory monitoring, competitor analysis
|
|
784
|
-
**Technology**: Multi-platform e-commerce data collection
|
|
785
|
-
|
|
786
|
-
### 📊 **StockAPIs** - Financial Data Platform
|
|
787
|
-
**Platform**: [stockapis.com](https://stockapis.com)
|
|
788
|
-
**Use Case**: Market data and financial information extraction
|
|
789
|
-
**Features**: Real-time stock data, financial news analysis
|
|
790
|
-
**Technology**: High-frequency financial data collection
|
|
791
|
-
|
|
792
|
-
### 🏠 **PropAPIs** - Real Estate Data Platform
|
|
793
|
-
**Platform**: [propapis.com](https://propapis.com)
|
|
794
|
-
**Use Case**: Property listings and market analysis
|
|
795
|
-
**Features**: Real estate listings, price monitoring, market trends
|
|
796
|
-
**Technology**: Multi-source property data extraction
|
|
797
|
-
|
|
798
|
-
**All of these platforms are built with UnrealOn for reliable, scalable data extraction.**
|
|
799
|
-
|
|
800
|
-
## License
|
|
801
|
-
|
|
802
|
-
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
803
|
-
|
|
804
|
-
## Commercial Platform
|
|
805
|
-
|
|
806
|
-
For enterprise features, managed hosting, and professional support, visit [unrealon.com](https://unrealon.com/).
|
|
807
|
-
|
|
808
|
-
---
|
|
809
|
-
|
|
810
|
-
**UnrealOn** - Enterprise-grade web scraping with AI-powered automation.
|