PyPI - unrealon - Versions diffs - 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

unrealon-1.0.9-py3-none-any.whl → unrealon-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (302)

unrealon/__init__.py +23 -21
unrealon-1.1.0.dist-info/METADATA +164 -0
unrealon-1.1.0.dist-info/RECORD +82 -0
{unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
unrealon-1.1.0.dist-info/entry_points.txt +9 -0
{unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
unrealon_bridge/__init__.py +114 -0
unrealon_bridge/cli.py +316 -0
unrealon_bridge/client/__init__.py +93 -0
unrealon_bridge/client/base.py +78 -0
unrealon_bridge/client/commands.py +89 -0
unrealon_bridge/client/connection.py +90 -0
unrealon_bridge/client/events.py +65 -0
unrealon_bridge/client/health.py +38 -0
unrealon_bridge/client/html_parser.py +146 -0
unrealon_bridge/client/logging.py +139 -0
unrealon_bridge/client/proxy.py +70 -0
unrealon_bridge/client/scheduler.py +450 -0
unrealon_bridge/client/session.py +70 -0
unrealon_bridge/configs/__init__.py +14 -0
unrealon_bridge/configs/bridge_config.py +212 -0
unrealon_bridge/configs/bridge_config.yaml +39 -0
unrealon_bridge/models/__init__.py +138 -0
unrealon_bridge/models/base.py +28 -0
unrealon_bridge/models/command.py +41 -0
unrealon_bridge/models/events.py +40 -0
unrealon_bridge/models/html_parser.py +79 -0
unrealon_bridge/models/logging.py +55 -0
unrealon_bridge/models/parser.py +63 -0
unrealon_bridge/models/proxy.py +41 -0
unrealon_bridge/models/requests.py +95 -0
unrealon_bridge/models/responses.py +88 -0
unrealon_bridge/models/scheduler.py +592 -0
unrealon_bridge/models/session.py +28 -0
unrealon_bridge/server/__init__.py +91 -0
unrealon_bridge/server/base.py +171 -0
unrealon_bridge/server/handlers/__init__.py +23 -0
unrealon_bridge/server/handlers/command.py +110 -0
unrealon_bridge/server/handlers/html_parser.py +139 -0
unrealon_bridge/server/handlers/logging.py +95 -0
unrealon_bridge/server/handlers/parser.py +95 -0
unrealon_bridge/server/handlers/proxy.py +75 -0
unrealon_bridge/server/handlers/scheduler.py +545 -0
unrealon_bridge/server/handlers/session.py +66 -0
unrealon_browser/__init__.py +61 -18
unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
unrealon_browser/{src/core → core}/browser_manager.py +2 -2
unrealon_browser/{src/managers → managers}/captcha.py +1 -1
unrealon_browser/{src/managers → managers}/cookies.py +1 -1
unrealon_browser/managers/logger_bridge.py +231 -0
unrealon_browser/{src/managers → managers}/profile.py +1 -1
unrealon_driver/__init__.py +73 -19
unrealon_driver/browser/__init__.py +8 -0
unrealon_driver/browser/config.py +74 -0
unrealon_driver/browser/manager.py +416 -0
unrealon_driver/exceptions.py +28 -0
unrealon_driver/parser/__init__.py +55 -0
unrealon_driver/parser/cli_manager.py +141 -0
unrealon_driver/parser/daemon_manager.py +227 -0
unrealon_driver/parser/managers/__init__.py +46 -0
unrealon_driver/parser/managers/browser.py +51 -0
unrealon_driver/parser/managers/config.py +281 -0
unrealon_driver/parser/managers/error.py +412 -0
unrealon_driver/parser/managers/html.py +732 -0
unrealon_driver/parser/managers/logging.py +609 -0
unrealon_driver/parser/managers/result.py +321 -0
unrealon_driver/parser/parser_manager.py +628 -0
unrealon/sdk_config.py +0 -88
unrealon-1.0.9.dist-info/METADATA +0 -810
unrealon-1.0.9.dist-info/RECORD +0 -246
unrealon_browser/pyproject.toml +0 -182
unrealon_browser/src/__init__.py +0 -62
unrealon_browser/src/managers/logger_bridge.py +0 -395
unrealon_driver/README.md +0 -204
unrealon_driver/pyproject.toml +0 -187
unrealon_driver/src/__init__.py +0 -90
unrealon_driver/src/cli/__init__.py +0 -10
unrealon_driver/src/cli/main.py +0 -66
unrealon_driver/src/cli/simple.py +0 -510
unrealon_driver/src/config/__init__.py +0 -11
unrealon_driver/src/config/auto_config.py +0 -478
unrealon_driver/src/core/__init__.py +0 -18
unrealon_driver/src/core/exceptions.py +0 -289
unrealon_driver/src/core/parser.py +0 -638
unrealon_driver/src/dto/__init__.py +0 -66
unrealon_driver/src/dto/cli.py +0 -119
unrealon_driver/src/dto/config.py +0 -18
unrealon_driver/src/dto/events.py +0 -237
unrealon_driver/src/dto/execution.py +0 -313
unrealon_driver/src/dto/services.py +0 -311
unrealon_driver/src/execution/__init__.py +0 -23
unrealon_driver/src/execution/daemon_mode.py +0 -317
unrealon_driver/src/execution/interactive_mode.py +0 -88
unrealon_driver/src/execution/modes.py +0 -45
unrealon_driver/src/execution/scheduled_mode.py +0 -209
unrealon_driver/src/execution/test_mode.py +0 -250
unrealon_driver/src/logging/__init__.py +0 -24
unrealon_driver/src/logging/driver_logger.py +0 -512
unrealon_driver/src/services/__init__.py +0 -24
unrealon_driver/src/services/browser_service.py +0 -726
unrealon_driver/src/services/llm/__init__.py +0 -15
unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
unrealon_driver/src/services/llm/llm.py +0 -195
unrealon_driver/src/services/logger_service.py +0 -232
unrealon_driver/src/services/metrics_service.py +0 -185
unrealon_driver/src/services/scheduler_service.py +0 -489
unrealon_driver/src/services/websocket_service.py +0 -362
unrealon_driver/src/utils/__init__.py +0 -16
unrealon_driver/src/utils/service_factory.py +0 -317
unrealon_driver/src/utils/time_formatter.py +0 -338
unrealon_llm/README.md +0 -44
unrealon_llm/__init__.py +0 -26
unrealon_llm/pyproject.toml +0 -154
unrealon_llm/src/__init__.py +0 -228
unrealon_llm/src/cli/__init__.py +0 -0
unrealon_llm/src/core/__init__.py +0 -11
unrealon_llm/src/core/smart_client.py +0 -438
unrealon_llm/src/dto/__init__.py +0 -155
unrealon_llm/src/dto/models/__init__.py +0 -0
unrealon_llm/src/dto/models/config.py +0 -343
unrealon_llm/src/dto/models/core.py +0 -328
unrealon_llm/src/dto/models/enums.py +0 -123
unrealon_llm/src/dto/models/html_analysis.py +0 -345
unrealon_llm/src/dto/models/statistics.py +0 -473
unrealon_llm/src/dto/models/translation.py +0 -383
unrealon_llm/src/dto/models/type_conversion.py +0 -462
unrealon_llm/src/dto/schemas/__init__.py +0 -0
unrealon_llm/src/exceptions.py +0 -392
unrealon_llm/src/llm_config/__init__.py +0 -20
unrealon_llm/src/llm_config/logging_config.py +0 -178
unrealon_llm/src/llm_logging/__init__.py +0 -42
unrealon_llm/src/llm_logging/llm_events.py +0 -107
unrealon_llm/src/llm_logging/llm_logger.py +0 -466
unrealon_llm/src/managers/__init__.py +0 -15
unrealon_llm/src/managers/cache_manager.py +0 -67
unrealon_llm/src/managers/cost_manager.py +0 -107
unrealon_llm/src/managers/request_manager.py +0 -298
unrealon_llm/src/modules/__init__.py +0 -0
unrealon_llm/src/modules/html_processor/__init__.py +0 -25
unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
unrealon_llm/src/modules/html_processor/processor.py +0 -102
unrealon_llm/src/modules/llm/__init__.py +0 -0
unrealon_llm/src/modules/translator/__init__.py +0 -0
unrealon_llm/src/provider.py +0 -116
unrealon_llm/src/utils/__init__.py +0 -95
unrealon_llm/src/utils/common.py +0 -64
unrealon_llm/src/utils/data_extractor.py +0 -188
unrealon_llm/src/utils/html_cleaner.py +0 -767
unrealon_llm/src/utils/language_detector.py +0 -308
unrealon_llm/src/utils/models_cache.py +0 -592
unrealon_llm/src/utils/smart_counter.py +0 -229
unrealon_llm/src/utils/token_counter.py +0 -189
unrealon_sdk/README.md +0 -25
unrealon_sdk/__init__.py +0 -30
unrealon_sdk/pyproject.toml +0 -231
unrealon_sdk/src/__init__.py +0 -150
unrealon_sdk/src/cli/__init__.py +0 -12
unrealon_sdk/src/cli/commands/__init__.py +0 -22
unrealon_sdk/src/cli/commands/benchmark.py +0 -42
unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
unrealon_sdk/src/cli/commands/health.py +0 -46
unrealon_sdk/src/cli/commands/integration.py +0 -498
unrealon_sdk/src/cli/commands/reports.py +0 -43
unrealon_sdk/src/cli/commands/security.py +0 -36
unrealon_sdk/src/cli/commands/server.py +0 -483
unrealon_sdk/src/cli/commands/servers.py +0 -56
unrealon_sdk/src/cli/commands/tests.py +0 -55
unrealon_sdk/src/cli/main.py +0 -126
unrealon_sdk/src/cli/utils/reporter.py +0 -519
unrealon_sdk/src/clients/openapi.yaml +0 -3347
unrealon_sdk/src/clients/python_http/__init__.py +0 -3
unrealon_sdk/src/clients/python_http/api_config.py +0 -228
unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
unrealon_sdk/src/clients/python_websocket/client.py +0 -490
unrealon_sdk/src/clients/python_websocket/events.py +0 -732
unrealon_sdk/src/clients/python_websocket/example.py +0 -136
unrealon_sdk/src/clients/python_websocket/types.py +0 -871
unrealon_sdk/src/core/__init__.py +0 -64
unrealon_sdk/src/core/client.py +0 -556
unrealon_sdk/src/core/config.py +0 -465
unrealon_sdk/src/core/exceptions.py +0 -239
unrealon_sdk/src/core/metadata.py +0 -191
unrealon_sdk/src/core/models.py +0 -142
unrealon_sdk/src/core/types.py +0 -68
unrealon_sdk/src/dto/__init__.py +0 -268
unrealon_sdk/src/dto/authentication.py +0 -108
unrealon_sdk/src/dto/cache.py +0 -208
unrealon_sdk/src/dto/common.py +0 -19
unrealon_sdk/src/dto/concurrency.py +0 -393
unrealon_sdk/src/dto/events.py +0 -108
unrealon_sdk/src/dto/health.py +0 -339
unrealon_sdk/src/dto/load_balancing.py +0 -336
unrealon_sdk/src/dto/logging.py +0 -230
unrealon_sdk/src/dto/performance.py +0 -165
unrealon_sdk/src/dto/rate_limiting.py +0 -295
unrealon_sdk/src/dto/resource_pooling.py +0 -128
unrealon_sdk/src/dto/structured_logging.py +0 -112
unrealon_sdk/src/dto/task_scheduling.py +0 -121
unrealon_sdk/src/dto/websocket.py +0 -55
unrealon_sdk/src/enterprise/__init__.py +0 -59
unrealon_sdk/src/enterprise/authentication.py +0 -401
unrealon_sdk/src/enterprise/cache_manager.py +0 -578
unrealon_sdk/src/enterprise/error_recovery.py +0 -494
unrealon_sdk/src/enterprise/event_system.py +0 -549
unrealon_sdk/src/enterprise/health_monitor.py +0 -747
unrealon_sdk/src/enterprise/load_balancer.py +0 -964
unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
unrealon_sdk/src/enterprise/logging/development.py +0 -744
unrealon_sdk/src/enterprise/logging/service.py +0 -410
unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
unrealon_sdk/src/enterprise/resource_pool.py +0 -763
unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
unrealon_sdk/src/internal/__init__.py +0 -10
unrealon_sdk/src/internal/command_router.py +0 -497
unrealon_sdk/src/internal/connection_manager.py +0 -397
unrealon_sdk/src/internal/http_client.py +0 -446
unrealon_sdk/src/internal/websocket_client.py +0 -420
unrealon_sdk/src/provider.py +0 -471
unrealon_sdk/src/utils.py +0 -234
unrealon_browser/{src/cli → cli}/__init__.py +0 -0
unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
unrealon_browser/{src/cli → cli}/main.py +0 -0
unrealon_browser/{src/core → core}/__init__.py +0 -0
unrealon_browser/{src/dto → dto}/__init__.py +0 -0
unrealon_browser/{src/dto → dto}/models/config.py +0 -0
unrealon_browser/{src/dto → dto}/models/core.py +0 -0
unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
unrealon_browser/{src/managers → managers}/__init__.py +0 -0
unrealon_browser/{src/managers → managers}/stealth.py +0 -0

unrealon_llm/src/modules/html_processor/base_processor.py DELETED Viewed

@@ -1,415 +0,0 @@
-"""
-Base HTML Processor
-Universal base class for HTML pattern extraction processors.
-Provides common functionality for listing and details processors.
-"""
-from abc import ABC, abstractmethod
-import json
-import random
-from typing import Type
-import traceback
-import re
-from unrealon_llm.src.core import SmartLLMClient
-from unrealon_llm.src.dto import ChatMessage, MessageRole
-from unrealon_llm.src.utils.html_cleaner import SmartHTMLCleaner
-from unrealon_llm.src.utils.data_extractor import SmartDataExtractor
-from unrealon_llm.src.llm_logging import (
-    get_llm_logger,
-    initialize_development_logger,
-    initialize_llm_logger,
-)
-from .models import (
-    UniversalExtractionSchema,
-    ProcessingInfo,
-    ExtractionResult,
-)
-# Ensure loggers are initialized
-logger = get_llm_logger()
-if logger is None:
-    try:
-        initialize_development_logger()
-        initialize_llm_logger()
-        logger = get_llm_logger()
-    except:
-        logger = None
-class BaseHTMLProcessor(ABC):
-    """Base class for HTML pattern extraction processors"""
-    def __init__(self, llm_client: SmartLLMClient):
-        """
-        Initialize base processor
-        Args:
-            llm_client: LLM client for AI analysis
-        """
-        self.llm_client = llm_client
-        self.cleaner = SmartHTMLCleaner()
-        self.data_extractor = SmartDataExtractor()
-        # Get processor-specific configuration
-        self.processor_type = self.get_processor_type()
-        self.schema_class = self.get_schema_class()
-        logger.log_html_analysis_start(
-            html_size_bytes=0,  # Will be filled when processing
-            target_elements=[self.processor_type],
-            details={"processor_class": self.__class__.__name__},
-        )
-    @abstractmethod
-    def get_processor_type(self) -> str:
-        """Return processor type identifier"""
-        pass
-    @abstractmethod
-    def get_schema_class(self) -> Type:
-        """Return Pydantic schema class for this processor"""
-        pass
-    @abstractmethod
-    def get_extraction_prompt_template(self) -> str:
-        """Return extraction prompt template for this processor type"""
-        pass
-    def _trim_system_prompt(self, system_prompt: str) -> str:
-        """Trim system prompt to remove empty lines"""
-        return "\n".join(system_prompt.split("\n")[1:])
-    async def extract_patterns(self, html_content: str) -> ExtractionResult:
-        """
-        Extract patterns from HTML using LLM intelligence
-        Args:
-            html_content: Raw HTML content
-        Returns:
-            ExtractionResult: Validated Pydantic result with extraction patterns and processing metadata
-        """
-        logger.log_html_analysis_start(
-            html_size_bytes=len(html_content),
-            target_elements=[self.processor_type],
-            details={"processor_type": self.processor_type},
-        )
-        # Clean HTML first with aggressive cleaning for LLM analysis
-        cleaned_html, extracted_data = self.cleaner.clean_html(
-            html_content, preserve_js_data=True, aggressive_cleaning=True
-        )
-        cleaning_stats = self.cleaner.get_cleaning_stats(html_content, cleaned_html)
-        logger.log_html_cleaning(
-            original_size_bytes=len(html_content),
-            cleaned_size_bytes=len(cleaned_html),
-            optimization_type="aggressive",
-            details=cleaning_stats,
-        )
-        # Build extraction prompt
-        prompt = self._build_extraction_prompt(cleaned_html)
-        # Log the full prompt for debugging
-        logger.log_llm_request_start(
-            provider="debug",
-            model="prompt_debug",
-            prompt_tokens=0,
-            details={
-                "full_prompt": prompt[:2000] + "..." if len(prompt) > 2000 else prompt,
-                "schema_json": json.dumps(
-                    self.schema_class.model_json_schema(), indent=2
-                ),
-            },
-        )
-        # Add critical format requirements to the prompt
-        SYSTEM_PROMPT = f"""
-        You are an HTML-to-JSON expert at analyzing {self.processor_type} pages.
-        You MUST return JSON that EXACTLY matches the Pydantic schema provided.
-        RESPOND ONLY WITH VALID JSON.
-        NO EXPLANATIONS, NO TEXT, ONLY JSON!
-        Include ALL required fields from the schema!
-        CRITICAL: The 'selectors' field must be a DICTIONARY/OBJECT, not a list!
-        """
-        # Prepare LLM messages
-        messages = [
-            ChatMessage(
-                role=MessageRole.SYSTEM,
-                content=self._trim_system_prompt(SYSTEM_PROMPT),
-            ),
-            ChatMessage(
-                role=MessageRole.USER,
-                content=prompt
-                + "\n\nRESPOND ONLY WITH JSON! START WITH { AND END WITH }. NO OTHER TEXT!",
-            ),
-        ]
-        logger.log_llm_request_start(
-            provider="openrouter",
-            model=getattr(self.llm_client, "model", "unknown"),
-            prompt_tokens=len(prompt) // 4,  # rough estimate
-            details={"processor_type": self.processor_type},
-        )
-        response = None
-        try:
-            # Call LLM
-            response = await self.llm_client.chat_completion(
-                messages, response_model=self.schema_class
-            )
-            # Log full LLM response for debugging
-            logger.log_llm_response_received(
-                provider="openrouter",
-                model=getattr(response, "model", "unknown"),
-                completion_tokens=(
-                    getattr(response.usage, "completion_tokens", 0)
-                    if hasattr(response, "usage")
-                    else 0
-                ),
-                total_tokens=(
-                    getattr(response.usage, "total_tokens", 0)
-                    if hasattr(response, "usage")
-                    else 0
-                ),
-                cost_usd=getattr(response, "cost_usd", 0.0),
-                details={"raw_response_full": response.content},
-            )
-            # Use the validated model from LLM response
-            if hasattr(response, "extracted_model") and response.extracted_model:
-                validated_model = response.extracted_model
-                validated_result = validated_model.model_dump()
-                logger.log_html_analysis_completed(
-                    selectors_generated=len(str(validated_result)),
-                    confidence_score=validated_result.get("confidence", 0.0),
-                    details={
-                        "processor_type": self.processor_type,
-                        "validation_success": True,
-                        "schema_matched": True,
-                    },
-                )
-            else:
-                # Fallback: parse manually if no model provided
-                result_data = self.data_extractor.extract_json(response.content)
-                try:
-                    validated_model = self.schema_class(**result_data)
-                    validated_result = validated_model.model_dump()
-                    logger.log_html_analysis_completed(
-                        selectors_generated=len(str(result_data)),
-                        confidence_score=result_data.get("confidence", 0.0),
-                        details={
-                            "processor_type": self.processor_type,
-                            "validation_success": True,
-                            "schema_matched": True,
-                        },
-                    )
-                except Exception as e:
-                    logger.log_html_analysis_failed(
-                        error_message=f"Pydantic validation failed: {str(e)}",
-                        details={
-                            "processor_type": self.processor_type,
-                            "validation_error": str(e),
-                            "raw_llm_response": result_data,
-                        },
-                    )
-                    # 🔥 SMART FALLBACK: Try to fix common LLM format issues
-                    try:
-                        fixed_data = self._fix_llm_response_format(result_data, str(e))
-                        validated_model = self.schema_class(**fixed_data)
-                        validated_result = validated_model.model_dump()
-                        logger.log_html_analysis_completed(
-                            selectors_generated=len(str(fixed_data)),
-                            confidence_score=fixed_data.get("confidence", 0.0),
-                            details={
-                                "processor_type": self.processor_type,
-                                "validation_success": True,
-                                "schema_matched": True,
-                                "format_fixed": True,
-                            },
-                        )
-                    except Exception as fix_error:
-                        logger.log_html_analysis_failed(
-                            error_message=f"Format fixing also failed: {str(fix_error)}",
-                            details={
-                                "processor_type": self.processor_type,
-                                "validation_error": str(e),
-                                "fix_error": str(fix_error),
-                                "raw_llm_response": result_data,
-                            },
-                        )
-                        # Final fallback: create minimal valid structure
-                        validated_result = self._create_fallback_result(
-                            result_data, str(e)
-                        )
-            # Create Pydantic processing metadata
-            processing_info = ProcessingInfo(
-                original_html_size=len(html_content),
-                cleaned_html_size=len(cleaned_html),
-                cleaning_stats=cleaning_stats,
-                extracted_js_data=extracted_data,
-                processor_type=self.processor_type,
-                llm_model=getattr(response, "model", "unknown"),
-                tokens_used=(
-                    getattr(response.usage, "total_tokens", 0)
-                    if hasattr(response, "usage")
-                    else 0
-                ),
-                cost_usd=getattr(response, "cost_usd", 0.0),
-            )
-            # Return validated Pydantic result
-            return ExtractionResult(
-                extraction_result=validated_result,
-                processing_info=processing_info,
-            )
-        except Exception as e:
-            logger.log_html_analysis_failed(
-                error_message=str(e),
-                details={
-                    "processor_type": self.processor_type,
-                    "raw_response": getattr(response, "content", "No response"),
-                    "traceback": traceback.format_exc(),
-                },
-            )
-            raise
-    def _build_extraction_prompt(self, cleaned_html: str) -> str:
-        """Build extraction prompt using processor-specific template"""
-        # Processors handle their own prompt construction with schema and HTML
-        # Just get the template and let it handle the details
-        prompt_template = self.get_extraction_prompt_template()
-        # Use more content for better analysis, but still respect token limits
-        html_limit = 50000  # Increase from 15K to 50K characters
-        # Build full prompt with auto-generated Pydantic 2 schema
-        schema_json = json.dumps(self.schema_class.model_json_schema(), indent=2)
-        # Add random number to bypass any caching
-        cache_buster = random.randint(100000, 999999)
-        schema_prompt = f"""
-        PYDANTIC 2 SCHEMA (Request #{cache_buster}):
-        {schema_json}
-        🚨 CRITICAL FORMAT REQUIREMENTS:
-        1. Return JSON that EXACTLY matches this schema structure!
-        2. The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation
-        3. The "selectors" field MUST be a DICTIONARY/OBJECT with field names as keys and arrays of CSS selectors as values
-        4. Example: "selectors": {{"title": ["h1.title", ".product-name"], "price": [".price", ".cost"]}}
-        5. DO NOT return "selectors" as a list: ❌ ["h1.title", ".price"]
-        6. DO return "selectors" as a dictionary: ✅ {{"title": ["h1.title"], "price": [".price"]}}
-        """
-        schema_prompt = self._trim_system_prompt(schema_prompt)
-        return prompt_template.format(
-            processor_type=self.processor_type,
-            html_content=cleaned_html[:html_limit]
-            + ("..." if len(cleaned_html) > html_limit else ""),
-            schema=schema_prompt,
-        )
-    def get_cost_estimate(self, html_content: str) -> float:
-        """
-        Estimate cost for processing HTML content
-        Args:
-            html_content: HTML content to estimate
-        Returns:
-            Estimated cost in USD
-        """
-        # Clean HTML to get realistic token count
-        cleaned_html, _ = self.cleaner.clean_html(
-            html_content, aggressive_cleaning=True
-        )
-        # Rough token estimation (1 token ≈ 4 characters)
-        estimated_tokens = len(cleaned_html) / 4
-        # Add prompt overhead (approximately 500 tokens)
-        total_tokens = estimated_tokens + 500
-        # Estimate cost (Claude Haiku: ~$0.25 per 1M input tokens)
-        estimated_cost = (total_tokens / 1_000_000) * 0.25
-        return estimated_cost
-    def _fix_llm_response_format(self, result_data: dict, error_message: str) -> dict:
-        """Fix common LLM response format issues."""
-        fixed_data = result_data.copy()
-        # Fix selectors if it's a list instead of dict
-        if "selectors" in fixed_data and isinstance(fixed_data["selectors"], list):
-            logger.log_html_analysis_failed(
-                error_message="Fixing selectors format: list -> dict",
-                details={
-                    "processor_type": self.processor_type,
-                    "original_selectors": fixed_data["selectors"],
-                },
-            )
-            # Convert list to dict with generic field names
-            selectors_list = fixed_data["selectors"]
-            fixed_data["selectors"] = {}
-            # Try to intelligently map list items to field names
-            field_names = ["item", "title", "price", "description", "image", "link"]
-            for i, selector in enumerate(selectors_list):
-                if i < len(field_names):
-                    field_name = field_names[i]
-                else:
-                    field_name = f"field_{i+1}"
-                # Convert single selector to list
-                if isinstance(selector, str):
-                    fixed_data["selectors"][field_name] = [selector]
-                elif isinstance(selector, list):
-                    fixed_data["selectors"][field_name] = selector
-                else:
-                    fixed_data["selectors"][field_name] = [str(selector)]
-        # Ensure all required fields exist
-        required_fields = [
-            "detected_item_type",
-            "extraction_strategy",
-            "confidence",
-            "selectors",
-            "documentation",
-        ]
-        for field in required_fields:
-            if field not in fixed_data:
-                if field == "detected_item_type":
-                    fixed_data[field] = "unknown"
-                elif field == "extraction_strategy":
-                    fixed_data[field] = "fallback_strategy"
-                elif field == "confidence":
-                    fixed_data[field] = 0.1
-                elif field == "selectors":
-                    fixed_data[field] = {}
-                elif field == "documentation":
-                    fixed_data[field] = (
-                        "Extraction completed with fallback processing due to format issues."
-                    )
-        return fixed_data
-    def _create_fallback_result(self, result_data: dict, error_message: str) -> dict:
-        """Create a minimal valid result when all else fails."""
-        return {
-            "detected_item_type": "unknown",
-            "extraction_strategy": "fallback_strategy",
-            "confidence": 0.1,
-            "selectors": {},
-            "documentation": f"Extraction failed due to validation error: {error_message}. Raw data: {str(result_data)[:500]}...",
-        }

unrealon_llm/src/modules/html_processor/details_processor.py DELETED Viewed

@@ -1,85 +0,0 @@
-"""
-Details Processor
-Universal processor for detail/product/item pages.
-Handles ANY type of detail pages: product details, service info, article content, job descriptions, etc.
-"""
-from typing import Type
-from .base_processor import BaseHTMLProcessor
-from .models import UniversalExtractionSchema
-class DetailsProcessor(BaseHTMLProcessor):
-    """Universal details page pattern extractor"""
-    def get_processor_type(self) -> str:
-        """Return processor type identifier"""
-        return "details"
-    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
-        """Return Pydantic schema class for details extraction"""
-        return UniversalExtractionSchema
-    def get_extraction_prompt_template(self) -> str:
-        """Return details-specific extraction prompt template"""
-        prompt = """{schema}
-        [__TASK_DESCRIPTION__]
-        Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
-        PROCESSOR TYPE: {processor_type}
-        THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
-        [/__TASK_DESCRIPTION__]
-        [__CRITICAL_FORMAT_REQUIREMENTS__]
-        🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
-        Example of CORRECT format:
-        "selectors": {{
-            "title": ["h1.product-title", "h1.page-title", ".item-name"],
-            "price": [".price", ".cost", "span[data-price]", ".product-price"],
-            "description": [".description", ".product-desc", ".item-details"],
-            "images": ["img.product-image", ".gallery img", "img[src*='product']"],
-            "specifications": [".specs", ".product-specs", ".item-specifications"],
-            "reviews": [".reviews", ".product-reviews", ".customer-reviews"]
-        }}
-        ❌ WRONG format (DO NOT USE):
-        "selectors": ["h1.title", ".price", ".description"]
-        ✅ CORRECT format (USE THIS):
-        "selectors": {{
-            "title": ["h1.title", ".product-name", "h1[itemprop='name']"],
-            "price": [".price", ".cost", "span[data-price]"],
-            "description": [".description", ".product-desc", ".item-details"]
-        }}
-        [/__CRITICAL_FORMAT_REQUIREMENTS__]
-        [__INSTRUCTIONS__]
-        YOUR TASK:
-        Analyze this details page and generate extraction patterns for ANY type of item.
-        This could be: product details, service info, article content, job description, real estate listing, person profile, etc.
-        CRITICAL REQUIREMENTS:
-        1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
-        2. Include comprehensive markdown documentation
-        3. Provide real examples from the actual HTML
-        4. Explain the page structure and best extraction approach
-        5. Include confidence scores and fallback strategies
-        6. Document any special handling needed
-        ANALYZE THE HTML AND DETERMINE:
-        - What type of item this page describes
-        - What information is available (specs, pricing, reviews, etc.)
-        - How content is structured and organized
-        - What actions are possible (buy, contact, etc.)
-        - Best extraction strategy for this specific page
-        [/__INSTRUCTIONS__]
-        [__HTML_CONTENT__]
-        HTML CONTENT (first 50KB):
-        {html_content}
-        [/__HTML_CONTENT__]
-        """
-        return self._trim_system_prompt(prompt)

unrealon_llm/src/modules/html_processor/listing_processor.py DELETED Viewed

@@ -1,91 +0,0 @@
-"""
-Listing Processor
-Universal processor for listing/catalog pages.
-Handles ANY type of listings: products, services, articles, real estate, jobs, etc.
-"""
-from typing import Type
-from .base_processor import BaseHTMLProcessor
-from .models import UniversalExtractionSchema
-class ListingProcessor(BaseHTMLProcessor):
-    """Universal listing page pattern extractor"""
-    def get_processor_type(self) -> str:
-        """Return processor type identifier"""
-        return "listing"
-    def get_schema_class(self) -> Type[UniversalExtractionSchema]:
-        """Return Pydantic schema class for listing extraction"""
-        return UniversalExtractionSchema
-    def get_extraction_prompt_template(self) -> str:
-        """Return listing-specific extraction prompt template"""
-        prompt = """{schema}
-        [__TASK_DESCRIPTION__]
-        Analyze this LISTING/CATALOG page and generate universal extraction patterns.
-        PROCESSOR TYPE: {processor_type}
-        THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
-        [/__TASK_DESCRIPTION__]
-        [__CRITICAL_FORMAT_REQUIREMENTS__]
-        🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
-        Example of CORRECT format:
-        "selectors": {{
-            "items_container": ["div.product-grid", "ul.product-list", "div.items"],
-            "item_title": ["h3.product-title", "a.product-link", ".item-name"],
-            "item_price": [".price", ".cost", "span[data-price]"],
-            "item_image": ["img.product-image", ".item-img", "img[src*='product']"],
-            "pagination": [".pagination", ".page-nav", "nav[aria-label='pagination']"]
-        }}
-        ❌ WRONG format (DO NOT USE):
-        "selectors": ["div.product", "h3.title", ".price"]
-        ✅ CORRECT format (USE THIS):
-        "selectors": {{
-            "items": ["div.product", "li.item", ".product-card"],
-            "titles": ["h3.title", ".product-name", "a[title]"],
-            "prices": [".price", ".cost", "span[data-price]"]
-        }}
-        [/__CRITICAL_FORMAT_REQUIREMENTS__]
-        [__INSTRUCTIONS__]
-        YOUR TASK:
-        Analyze this listing page and generate extraction patterns for ANY type of items.
-        This could be: products, services, articles, jobs, real estate, people, cars, etc.
-        CRITICAL REQUIREMENTS:
-        1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
-        2. This is a LISTING PAGE with multiple items
-        3. Focus on identifying item containers and individual item patterns
-        4. Detect ANY type of items - not just products!
-        5. Provide multiple fallback selectors for reliability
-        6. Include pagination and navigation patterns
-        7. Use realistic confidence scores (0.1-1.0)
-        8. Auto-detect what type of content this listing contains
-        9. Provide extraction strategy advice
-        10. Look for structured data (JSON-LD, microdata)
-        11. Generate patterns that work with BeautifulSoup4 .select() method
-        12. RETURN JSON that EXACTLY matches the Pydantic schema above!
-        ANALYZE THE HTML AND DETERMINE:
-        - What type of items are listed (products, services, articles, etc.)
-        - How items are structured and contained
-        - What navigation elements exist
-        - What metadata is available
-        - Best extraction strategy for this specific page
-        [/__INSTRUCTIONS__]
-        [__HTML_CONTENT__]
-        HTML CONTENT (first 50KB):
-        {html_content}
-        [/__HTML_CONTENT__]
-        """
-        return self._trim_system_prompt(prompt)

unrealon_llm/src/modules/html_processor/models/__init__.py DELETED Viewed

@@ -1,20 +0,0 @@
-"""
-HTML Processor Models
-Simplified universal model for HTML pattern extraction with markdown documentation.
-"""
-# Universal model
-from .universal_model import UniversalExtractionSchema
-# Processing models
-from .processing_models import ProcessingInfo, ExtractionResult
-__all__ = [
-    # Universal model
-    "UniversalExtractionSchema",
-    # Processing models
-    "ProcessingInfo",
-    "ExtractionResult",
-]

unrealon_llm/src/modules/html_processor/models/processing_models.py DELETED Viewed

@@ -1,40 +0,0 @@
-"""
-Processing Models for HTML Processing
-Pydantic models for processing metadata and results.
-"""
-from typing import Dict, Any
-from pydantic import BaseModel, Field, ConfigDict
-class ProcessingInfo(BaseModel):
-    """Processing metadata and statistics"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid",
-        title="Processing Information"
-    )
-    original_html_size: int = Field(..., description="Original HTML size in bytes")
-    cleaned_html_size: int = Field(..., description="Cleaned HTML size in bytes")
-    cleaning_stats: Dict[str, Any] = Field(..., description="HTML cleaning statistics")
-    extracted_js_data: Dict[str, Any] = Field(..., description="Extracted JavaScript data")
-    processor_type: str = Field(..., description="Type of processor used")
-    llm_model: str = Field(..., description="LLM model used for extraction")
-    tokens_used: int = Field(..., description="Total tokens used in LLM request")
-    cost_usd: float = Field(..., description="Cost of LLM request in USD")
-class ExtractionResult(BaseModel):
-    """Complete extraction result with metadata"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid",
-        title="Extraction Result"
-    )
-    extraction_result: Dict[str, Any] = Field(..., description="Raw extraction patterns")
-    processing_info: ProcessingInfo = Field(..., description="Processing metadata")

unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl

unrealon 1.0.9py3-none-any.whl → 1.1.0py3-none-any.whl