unrealon 1.0.5__tar.gz → 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unrealon-1.0.5 → unrealon-1.0.6}/PKG-INFO +1 -1
- {unrealon-1.0.5 → unrealon-1.0.6}/pyproject.toml +1 -1
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon/sdk_config.py +1 -1
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/base_processor.py +134 -13
- unrealon-1.0.6/unrealon_llm/src/modules/html_processor/details_processor.py +85 -0
- unrealon-1.0.6/unrealon_llm/src/modules/html_processor/listing_processor.py +91 -0
- unrealon-1.0.5/unrealon_llm/src/modules/html_processor/details_processor.py +0 -61
- unrealon-1.0.5/unrealon_llm/src/modules/html_processor/listing_processor.py +0 -67
- {unrealon-1.0.5 → unrealon-1.0.6}/LICENSE +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/README.md +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/requirements-dev.txt +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/requirements.txt +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/README.md +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/pyproject.toml +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/browser_cli.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/cookies_cli.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/interactive_mode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/cli/main.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/core/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/core/browser_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/core.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/dataclasses.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/detection.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/enums.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/dto/models/statistics.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/captcha.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/cookies.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/logger_bridge.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/profile.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_browser/src/managers/stealth.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/README.md +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/pyproject.toml +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/main.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/cli/simple.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/config/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/config/auto_config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/exceptions.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/core/parser.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/cli.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/events.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/execution.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/dto/services.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/daemon_mode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/interactive_mode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/modes.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/scheduled_mode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/execution/test_mode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/logging/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/logging/driver_logger.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/browser_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/browser_llm_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/llm/llm.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/logger_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/metrics_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/scheduler_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/services/websocket_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/service_factory.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_driver/src/utils/time_formatter.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/README.md +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/pyproject.toml +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/cli/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/core/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/core/smart_client.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/core.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/enums.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/html_analysis.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/statistics.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/translation.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/models/type_conversion.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/dto/schemas/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/exceptions.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_config/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_config/logging_config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/llm_events.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/llm_logging/llm_logger.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/cache_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/cost_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/managers/request_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/html_processor/processor.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/llm/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/modules/translator/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/provider.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/common.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/data_extractor.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/html_cleaner.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/language_detector.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/models_cache.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/smart_counter.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_llm/src/utils/token_counter.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/README.md +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/pyproject.toml +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/benchmark.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/diagnostics.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/health.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/integration.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/reports.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/security.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/server.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/servers.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/commands/tests.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/main.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/cli/utils/reporter.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/openapi.yaml +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/api_config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/models/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/None_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/client.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/events.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/example.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/clients/python_websocket/types.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/client.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/config.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/exceptions.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/metadata.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/models.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/core/types.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/authentication.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/cache.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/common.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/concurrency.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/events.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/health.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/load_balancing.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/logging.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/performance.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/rate_limiting.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/resource_pooling.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/structured_logging.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/task_scheduling.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/dto/websocket.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/authentication.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/cache_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/error_recovery.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/event_system.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/health_monitor.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/load_balancer.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/cleanup.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/development.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/logging/service.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/multithreading_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/performance_monitor.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/proxy_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/rate_limiter.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/resource_pool.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/enterprise/task_scheduler.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/__init__.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/command_router.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/connection_manager.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/http_client.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/internal/websocket_client.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/provider.py +0 -0
- {unrealon-1.0.5 → unrealon-1.0.6}/unrealon_sdk/src/utils.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "unrealon"
|
|
7
|
-
version = "1.0.5"
|
|
7
|
+
version = "1.0.6"
|
|
8
8
|
description = "AI-powered web scraping platform with real-time orchestration"
|
|
9
9
|
authors = ["Unrealon Team <dev@unrealon.com>"]
|
|
10
10
|
readme = "README.md"
|
|
@@ -10,6 +10,7 @@ import json
|
|
|
10
10
|
import random
|
|
11
11
|
from typing import Type
|
|
12
12
|
import traceback
|
|
13
|
+
import re
|
|
13
14
|
|
|
14
15
|
from unrealon_llm.src.core import SmartLLMClient
|
|
15
16
|
from unrealon_llm.src.dto import ChatMessage, MessageRole
|
|
@@ -77,6 +78,10 @@ class BaseHTMLProcessor(ABC):
|
|
|
77
78
|
"""Return extraction prompt template for this processor type"""
|
|
78
79
|
pass
|
|
79
80
|
|
|
81
|
+
def _trim_system_prompt(self, system_prompt: str) -> str:
|
|
82
|
+
"""Trim system prompt to remove empty lines"""
|
|
83
|
+
return "\n".join(system_prompt.split("\n")[1:])
|
|
84
|
+
|
|
80
85
|
async def extract_patterns(self, html_content: str) -> ExtractionResult:
|
|
81
86
|
"""
|
|
82
87
|
Extract patterns from HTML using LLM intelligence
|
|
@@ -116,15 +121,27 @@ class BaseHTMLProcessor(ABC):
|
|
|
116
121
|
prompt_tokens=0,
|
|
117
122
|
details={
|
|
118
123
|
"full_prompt": prompt[:2000] + "..." if len(prompt) > 2000 else prompt,
|
|
119
|
-
"schema_json": json.dumps(
|
|
120
|
-
|
|
124
|
+
"schema_json": json.dumps(
|
|
125
|
+
self.schema_class.model_json_schema(), indent=2
|
|
126
|
+
),
|
|
127
|
+
},
|
|
121
128
|
)
|
|
122
129
|
|
|
130
|
+
# Add critical format requirements to the prompt
|
|
131
|
+
SYSTEM_PROMPT = f"""
|
|
132
|
+
You are an HTML-to-JSON expert at analyzing {self.processor_type} pages.
|
|
133
|
+
You MUST return JSON that EXACTLY matches the Pydantic schema provided.
|
|
134
|
+
RESPOND ONLY WITH VALID JSON.
|
|
135
|
+
NO EXPLANATIONS, NO TEXT, ONLY JSON!
|
|
136
|
+
Include ALL required fields from the schema!
|
|
137
|
+
CRITICAL: The 'selectors' field must be a DICTIONARY/OBJECT, not a list!
|
|
138
|
+
"""
|
|
139
|
+
|
|
123
140
|
# Prepare LLM messages
|
|
124
141
|
messages = [
|
|
125
142
|
ChatMessage(
|
|
126
143
|
role=MessageRole.SYSTEM,
|
|
127
|
-
content=
|
|
144
|
+
content=self._trim_system_prompt(SYSTEM_PROMPT),
|
|
128
145
|
),
|
|
129
146
|
ChatMessage(
|
|
130
147
|
role=MessageRole.USER,
|
|
@@ -144,8 +161,7 @@ class BaseHTMLProcessor(ABC):
|
|
|
144
161
|
try:
|
|
145
162
|
# Call LLM
|
|
146
163
|
response = await self.llm_client.chat_completion(
|
|
147
|
-
messages,
|
|
148
|
-
response_model=self.schema_class
|
|
164
|
+
messages, response_model=self.schema_class
|
|
149
165
|
)
|
|
150
166
|
|
|
151
167
|
# Log full LLM response for debugging
|
|
@@ -167,7 +183,7 @@ class BaseHTMLProcessor(ABC):
|
|
|
167
183
|
)
|
|
168
184
|
|
|
169
185
|
# Use the validated model from LLM response
|
|
170
|
-
if hasattr(response,
|
|
186
|
+
if hasattr(response, "extracted_model") and response.extracted_model:
|
|
171
187
|
validated_model = response.extracted_model
|
|
172
188
|
validated_result = validated_model.model_dump()
|
|
173
189
|
logger.log_html_analysis_completed(
|
|
@@ -203,8 +219,36 @@ class BaseHTMLProcessor(ABC):
|
|
|
203
219
|
"raw_llm_response": result_data,
|
|
204
220
|
},
|
|
205
221
|
)
|
|
206
|
-
|
|
207
|
-
|
|
222
|
+
|
|
223
|
+
# 🔥 SMART FALLBACK: Try to fix common LLM format issues
|
|
224
|
+
try:
|
|
225
|
+
fixed_data = self._fix_llm_response_format(result_data, str(e))
|
|
226
|
+
validated_model = self.schema_class(**fixed_data)
|
|
227
|
+
validated_result = validated_model.model_dump()
|
|
228
|
+
logger.log_html_analysis_completed(
|
|
229
|
+
selectors_generated=len(str(fixed_data)),
|
|
230
|
+
confidence_score=fixed_data.get("confidence", 0.0),
|
|
231
|
+
details={
|
|
232
|
+
"processor_type": self.processor_type,
|
|
233
|
+
"validation_success": True,
|
|
234
|
+
"schema_matched": True,
|
|
235
|
+
"format_fixed": True,
|
|
236
|
+
},
|
|
237
|
+
)
|
|
238
|
+
except Exception as fix_error:
|
|
239
|
+
logger.log_html_analysis_failed(
|
|
240
|
+
error_message=f"Format fixing also failed: {str(fix_error)}",
|
|
241
|
+
details={
|
|
242
|
+
"processor_type": self.processor_type,
|
|
243
|
+
"validation_error": str(e),
|
|
244
|
+
"fix_error": str(fix_error),
|
|
245
|
+
"raw_llm_response": result_data,
|
|
246
|
+
},
|
|
247
|
+
)
|
|
248
|
+
# Final fallback: create minimal valid structure
|
|
249
|
+
validated_result = self._create_fallback_result(
|
|
250
|
+
result_data, str(e)
|
|
251
|
+
)
|
|
208
252
|
|
|
209
253
|
# Create Pydantic processing metadata
|
|
210
254
|
processing_info = ProcessingInfo(
|
|
@@ -253,12 +297,20 @@ class BaseHTMLProcessor(ABC):
|
|
|
253
297
|
|
|
254
298
|
# Add random number to bypass any caching
|
|
255
299
|
cache_buster = random.randint(100000, 999999)
|
|
256
|
-
|
|
257
|
-
schema_prompt = f"""PYDANTIC 2 SCHEMA (Request #{cache_buster}):
|
|
258
|
-
{schema_json}
|
|
259
300
|
|
|
260
|
-
|
|
261
|
-
|
|
301
|
+
schema_prompt = f"""
|
|
302
|
+
PYDANTIC 2 SCHEMA (Request #{cache_buster}):
|
|
303
|
+
{schema_json}
|
|
304
|
+
|
|
305
|
+
🚨 CRITICAL FORMAT REQUIREMENTS:
|
|
306
|
+
1. Return JSON that EXACTLY matches this schema structure!
|
|
307
|
+
2. The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation
|
|
308
|
+
3. The "selectors" field MUST be a DICTIONARY/OBJECT with field names as keys and arrays of CSS selectors as values
|
|
309
|
+
4. Example: "selectors": {{"title": ["h1.title", ".product-name"], "price": [".price", ".cost"]}}
|
|
310
|
+
5. DO NOT return "selectors" as a list: ❌ ["h1.title", ".price"]
|
|
311
|
+
6. DO return "selectors" as a dictionary: ✅ {{"title": ["h1.title"], "price": [".price"]}}
|
|
312
|
+
"""
|
|
313
|
+
schema_prompt = self._trim_system_prompt(schema_prompt)
|
|
262
314
|
|
|
263
315
|
return prompt_template.format(
|
|
264
316
|
processor_type=self.processor_type,
|
|
@@ -292,3 +344,72 @@ The response must include ALL required fields: detected_item_type, extraction_st
|
|
|
292
344
|
estimated_cost = (total_tokens / 1_000_000) * 0.25
|
|
293
345
|
|
|
294
346
|
return estimated_cost
|
|
347
|
+
|
|
348
|
+
def _fix_llm_response_format(self, result_data: dict, error_message: str) -> dict:
|
|
349
|
+
"""Fix common LLM response format issues."""
|
|
350
|
+
fixed_data = result_data.copy()
|
|
351
|
+
|
|
352
|
+
# Fix selectors if it's a list instead of dict
|
|
353
|
+
if "selectors" in fixed_data and isinstance(fixed_data["selectors"], list):
|
|
354
|
+
logger.log_html_analysis_failed(
|
|
355
|
+
error_message="Fixing selectors format: list -> dict",
|
|
356
|
+
details={
|
|
357
|
+
"processor_type": self.processor_type,
|
|
358
|
+
"original_selectors": fixed_data["selectors"],
|
|
359
|
+
},
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
# Convert list to dict with generic field names
|
|
363
|
+
selectors_list = fixed_data["selectors"]
|
|
364
|
+
fixed_data["selectors"] = {}
|
|
365
|
+
|
|
366
|
+
# Try to intelligently map list items to field names
|
|
367
|
+
field_names = ["item", "title", "price", "description", "image", "link"]
|
|
368
|
+
for i, selector in enumerate(selectors_list):
|
|
369
|
+
if i < len(field_names):
|
|
370
|
+
field_name = field_names[i]
|
|
371
|
+
else:
|
|
372
|
+
field_name = f"field_{i+1}"
|
|
373
|
+
|
|
374
|
+
# Convert single selector to list
|
|
375
|
+
if isinstance(selector, str):
|
|
376
|
+
fixed_data["selectors"][field_name] = [selector]
|
|
377
|
+
elif isinstance(selector, list):
|
|
378
|
+
fixed_data["selectors"][field_name] = selector
|
|
379
|
+
else:
|
|
380
|
+
fixed_data["selectors"][field_name] = [str(selector)]
|
|
381
|
+
|
|
382
|
+
# Ensure all required fields exist
|
|
383
|
+
required_fields = [
|
|
384
|
+
"detected_item_type",
|
|
385
|
+
"extraction_strategy",
|
|
386
|
+
"confidence",
|
|
387
|
+
"selectors",
|
|
388
|
+
"documentation",
|
|
389
|
+
]
|
|
390
|
+
for field in required_fields:
|
|
391
|
+
if field not in fixed_data:
|
|
392
|
+
if field == "detected_item_type":
|
|
393
|
+
fixed_data[field] = "unknown"
|
|
394
|
+
elif field == "extraction_strategy":
|
|
395
|
+
fixed_data[field] = "fallback_strategy"
|
|
396
|
+
elif field == "confidence":
|
|
397
|
+
fixed_data[field] = 0.1
|
|
398
|
+
elif field == "selectors":
|
|
399
|
+
fixed_data[field] = {}
|
|
400
|
+
elif field == "documentation":
|
|
401
|
+
fixed_data[field] = (
|
|
402
|
+
"Extraction completed with fallback processing due to format issues."
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
return fixed_data
|
|
406
|
+
|
|
407
|
+
def _create_fallback_result(self, result_data: dict, error_message: str) -> dict:
|
|
408
|
+
"""Create a minimal valid result when all else fails."""
|
|
409
|
+
return {
|
|
410
|
+
"detected_item_type": "unknown",
|
|
411
|
+
"extraction_strategy": "fallback_strategy",
|
|
412
|
+
"confidence": 0.1,
|
|
413
|
+
"selectors": {},
|
|
414
|
+
"documentation": f"Extraction failed due to validation error: {error_message}. Raw data: {str(result_data)[:500]}...",
|
|
415
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Details Processor
|
|
3
|
+
|
|
4
|
+
Universal processor for detail/product/item pages.
|
|
5
|
+
Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Type
|
|
9
|
+
|
|
10
|
+
from .base_processor import BaseHTMLProcessor
|
|
11
|
+
from .models import UniversalExtractionSchema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DetailsProcessor(BaseHTMLProcessor):
    """Pattern extractor for any kind of details page.

    Supplies the processor identifier, the schema class and the prompt
    template on top of ``BaseHTMLProcessor``.
    """

    def get_processor_type(self) -> str:
        """Identify this processor with the string ``"details"``."""
        return "details"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Give the Pydantic schema class used for details extraction."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Assemble the prompt template for details pages.

        The template is built from four consecutive sections (task
        description, selector-format rules, instructions, HTML content) and
        handed to ``_trim_system_prompt`` before being returned.  The
        ``{schema}``, ``{processor_type}`` and ``{html_content}``
        placeholders are left unfilled here.
        """
        # NOTE(review): presumably the caller fills the placeholders with
        # str.format, in which case the doubled braces below render as single
        # literal braces - confirm against the base class.
        task_section = """{schema}
[__TASK_DESCRIPTION__]
Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
[/__TASK_DESCRIPTION__]
"""

        # Each following section opens with a newline so that a blank line
        # separates it from the section before it.
        format_section = """
[__CRITICAL_FORMAT_REQUIREMENTS__]
🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
Example of CORRECT format:
"selectors": {{
"title": ["h1.product-title", "h1.page-title", ".item-name"],
"price": [".price", ".cost", "span[data-price]", ".product-price"],
"description": [".description", ".product-desc", ".item-details"],
"images": ["img.product-image", ".gallery img", "img[src*='product']"],
"specifications": [".specs", ".product-specs", ".item-specifications"],
"reviews": [".reviews", ".product-reviews", ".customer-reviews"]
}}

❌ WRONG format (DO NOT USE):
"selectors": ["h1.title", ".price", ".description"]

✅ CORRECT format (USE THIS):
"selectors": {{
"title": ["h1.title", ".product-name", "h1[itemprop='name']"],
"price": [".price", ".cost", "span[data-price]"],
"description": [".description", ".product-desc", ".item-details"]
}}
[/__CRITICAL_FORMAT_REQUIREMENTS__]
"""

        instructions_section = """
[__INSTRUCTIONS__]
YOUR TASK:
Analyze this details page and generate extraction patterns for ANY type of item.
This could be: product details, service info, article content, job description, real estate listing, person profile, etc.

CRITICAL REQUIREMENTS:
1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
2. Include comprehensive markdown documentation
3. Provide real examples from the actual HTML
4. Explain the page structure and best extraction approach
5. Include confidence scores and fallback strategies
6. Document any special handling needed

ANALYZE THE HTML AND DETERMINE:
- What type of item this page describes
- What information is available (specs, pricing, reviews, etc.)
- How content is structured and organized
- What actions are possible (buy, contact, etc.)
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]
"""

        html_section = """
[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        sections = (task_section, format_section, instructions_section, html_section)
        return self._trim_system_prompt("".join(sections))
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Listing Processor
|
|
3
|
+
|
|
4
|
+
Universal processor for listing/catalog pages.
|
|
5
|
+
Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Type
|
|
9
|
+
|
|
10
|
+
from .base_processor import BaseHTMLProcessor
|
|
11
|
+
from .models import UniversalExtractionSchema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ListingProcessor(BaseHTMLProcessor):
    """Pattern extractor for any kind of listing page.

    Supplies the processor identifier, the schema class and the prompt
    template on top of ``BaseHTMLProcessor``.
    """

    def get_processor_type(self) -> str:
        """Identify this processor with the string ``"listing"``."""
        return "listing"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Give the Pydantic schema class used for listing extraction."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Assemble the prompt template for listing pages.

        The template is built from four consecutive sections (task
        description, selector-format rules, instructions, HTML content) and
        handed to ``_trim_system_prompt`` before being returned.  The
        ``{schema}``, ``{processor_type}`` and ``{html_content}``
        placeholders are left unfilled here.
        """
        # NOTE(review): presumably the caller fills the placeholders with
        # str.format, in which case the doubled braces below render as single
        # literal braces - confirm against the base class.
        task_section = """{schema}

[__TASK_DESCRIPTION__]
Analyze this LISTING/CATALOG page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
[/__TASK_DESCRIPTION__]
"""

        # Each following section opens with a newline so that a blank line
        # separates it from the section before it.
        format_section = """
[__CRITICAL_FORMAT_REQUIREMENTS__]
🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
Example of CORRECT format:
"selectors": {{
"items_container": ["div.product-grid", "ul.product-list", "div.items"],
"item_title": ["h3.product-title", "a.product-link", ".item-name"],
"item_price": [".price", ".cost", "span[data-price]"],
"item_image": ["img.product-image", ".item-img", "img[src*='product']"],
"pagination": [".pagination", ".page-nav", "nav[aria-label='pagination']"]
}}

❌ WRONG format (DO NOT USE):
"selectors": ["div.product", "h3.title", ".price"]

✅ CORRECT format (USE THIS):
"selectors": {{
"items": ["div.product", "li.item", ".product-card"],
"titles": ["h3.title", ".product-name", "a[title]"],
"prices": [".price", ".cost", "span[data-price]"]
}}
[/__CRITICAL_FORMAT_REQUIREMENTS__]
"""

        instructions_section = """
[__INSTRUCTIONS__]
YOUR TASK:
Analyze this listing page and generate extraction patterns for ANY type of items.
This could be: products, services, articles, jobs, real estate, people, cars, etc.

CRITICAL REQUIREMENTS:
1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
2. This is a LISTING PAGE with multiple items
3. Focus on identifying item containers and individual item patterns
4. Detect ANY type of items - not just products!
5. Provide multiple fallback selectors for reliability
6. Include pagination and navigation patterns
7. Use realistic confidence scores (0.1-1.0)
8. Auto-detect what type of content this listing contains
9. Provide extraction strategy advice
10. Look for structured data (JSON-LD, microdata)
11. Generate patterns that work with BeautifulSoup4 .select() method
12. RETURN JSON that EXACTLY matches the Pydantic schema above!

ANALYZE THE HTML AND DETERMINE:
- What type of items are listed (products, services, articles, etc.)
- How items are structured and contained
- What navigation elements exist
- What metadata is available
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]
"""

        html_section = """
[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        sections = (task_section, format_section, instructions_section, html_section)
        return self._trim_system_prompt("".join(sections))
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Details Processor
|
|
3
|
-
|
|
4
|
-
Universal processor for detail/product/item pages.
|
|
5
|
-
Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from typing import Type
|
|
9
|
-
|
|
10
|
-
from .base_processor import BaseHTMLProcessor
|
|
11
|
-
from .models import UniversalExtractionSchema
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class DetailsProcessor(BaseHTMLProcessor):
    """Pattern extractor for any kind of details page.

    Supplies the processor identifier, the schema class and the prompt
    template on top of ``BaseHTMLProcessor``.
    """

    def get_processor_type(self) -> str:
        """Identify this processor with the string ``"details"``."""
        return "details"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Give the Pydantic schema class used for details extraction."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Assemble the prompt template for details pages.

        The template is built from three consecutive sections (task
        description, instructions, HTML content) and returned as-is.  The
        ``{schema}``, ``{processor_type}`` and ``{html_content}``
        placeholders are left unfilled here.
        """
        task_section = """{schema}

[__TASK_DESCRIPTION__]
Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
[/__TASK_DESCRIPTION__]
"""

        # Each following section opens with a newline so that a blank line
        # separates it from the section before it.
        instructions_section = """
[__INSTRUCTIONS__]
YOUR TASK:
Analyze this details page and generate extraction patterns for ANY type of item.
This could be: product details, service info, article content, job description, real estate listing, person profile, etc.

CRITICAL REQUIREMENTS:
1. Return simple CSS selectors in the "selectors" object
2. Include comprehensive markdown documentation
3. Provide real examples from the actual HTML
4. Explain the page structure and best extraction approach
5. Include confidence scores and fallback strategies
6. Document any special handling needed

ANALYZE THE HTML AND DETERMINE:
- What type of item this page describes
- What information is available (specs, pricing, reviews, etc.)
- How content is structured and organized
- What actions are possible (buy, contact, etc.)
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]
"""

        html_section = """
[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        sections = (task_section, instructions_section, html_section)
        return "".join(sections)
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Listing Processor
|
|
3
|
-
|
|
4
|
-
Universal processor for listing/catalog pages.
|
|
5
|
-
Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from typing import Type
|
|
9
|
-
|
|
10
|
-
from .base_processor import BaseHTMLProcessor
|
|
11
|
-
from .models import UniversalExtractionSchema
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class ListingProcessor(BaseHTMLProcessor):
    """Pattern extractor for any kind of listing page.

    Supplies the processor identifier, the schema class and the prompt
    template on top of ``BaseHTMLProcessor``.
    """

    def get_processor_type(self) -> str:
        """Identify this processor with the string ``"listing"``."""
        return "listing"

    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
        """Give the Pydantic schema class used for listing extraction."""
        return UniversalExtractionSchema

    def get_extraction_prompt_template(self) -> str:
        """Assemble the prompt template for listing pages.

        The template is built from three consecutive sections (task
        description, instructions, HTML content) and returned as-is.  The
        ``{schema}``, ``{processor_type}`` and ``{html_content}``
        placeholders are left unfilled here.
        """
        task_section = """{schema}

[__TASK_DESCRIPTION__]
Analyze this LISTING/CATALOG page and generate universal extraction patterns.
PROCESSOR TYPE: {processor_type}
THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
[/__TASK_DESCRIPTION__]
"""

        # Each following section opens with a newline so that a blank line
        # separates it from the section before it.
        instructions_section = """
[__INSTRUCTIONS__]
YOUR TASK:
Analyze this listing page and generate extraction patterns for ANY type of items.
This could be: products, services, articles, jobs, real estate, people, cars, etc.

CRITICAL REQUIREMENTS:
1. Return simple CSS selectors in the "selectors" object
2. This is a LISTING PAGE with multiple items
3. Focus on identifying item containers and individual item patterns
4. Detect ANY type of items - not just products!
5. Provide multiple fallback selectors for reliability
6. Include pagination and navigation patterns
7. Use realistic confidence scores (0.1-1.0)
8. Auto-detect what type of content this listing contains
9. Provide extraction strategy advice
10. Look for structured data (JSON-LD, microdata)
11. Generate patterns that work with BeautifulSoup4 .select() method
12. RETURN JSON that EXACTLY matches the Pydantic schema above!

ANALYZE THE HTML AND DETERMINE:
- What type of items are listed (products, services, articles, etc.)
- How items are structured and contained
- What navigation elements exist
- What metadata is available
- Best extraction strategy for this specific page
[/__INSTRUCTIONS__]
"""

        html_section = """
[__HTML_CONTENT__]
HTML CONTENT (first 50KB):
{html_content}
[/__HTML_CONTENT__]
"""

        sections = (task_section, instructions_section, html_section)
        return "".join(sections)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|