unrealon 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
unrealon/__init__.py ADDED
@@ -0,0 +1,28 @@
+ """
+ UnrealOn SDK - Main Package
+ """
+
+ # Import from centralized config
+ from .sdk_config import (
+     VERSION as __version__,
+     AUTHOR as __author__,
+     AUTHOR_EMAIL as __email__,
+     LICENSE as __license__,
+     PROJECT_URL as __url__,
+     VERSION_INFO,
+     PROJECT_INFO,
+     get_version,
+     is_debug_mode,
+ )
+
+ __all__ = [
+     "__version__",
+     "__author__",
+     "__email__",
+     "__license__",
+     "__url__",
+     "VERSION_INFO",
+     "PROJECT_INFO",
+     "get_version",
+     "is_debug_mode",
+ ]
unrealon/sdk_config.py ADDED
@@ -0,0 +1,89 @@
+ """
+ UnrealOn SDK Simple Config
+
+ Simple centralized configuration with Pydantic v2 models.
+ """
+
+ import logging
+ import os
+ from pydantic import BaseModel, Field, ConfigDict
+
+ # Simple version constants
+ VERSION = "1.0.6"
+
+ # Project info
+ AUTHOR = "UnrealOn Team"
+ AUTHOR_EMAIL = "dev@unrealon.com"
+ LICENSE = "MIT"
+ PROJECT_URL = "https://unrealon.com"
+
+
+ class VersionInfo(BaseModel):
+     """Version information model."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     version: str = Field(default=VERSION)
+
+
+
+ class ProjectInfo(BaseModel):
+     """Project information model."""
+
+     model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+     author: str = Field(default=AUTHOR)
+     email: str = Field(default=AUTHOR_EMAIL)
+     license: str = Field(default=LICENSE)
+     url: str = Field(default=PROJECT_URL)
+
+
+ # Global instances
+ VERSION_INFO = VersionInfo()
+ PROJECT_INFO = ProjectInfo()
+
+
+ def get_version() -> str:
+     """Get SDK version."""
+     return VERSION
+
+
+ def is_debug_mode() -> bool:
+     """Check if debug mode enabled."""
+     return os.getenv("UNREALON_DEBUG", "").lower() in ("1", "true", "debug")
+
+
+ # Compatibility check
+ def check_compatibility(required_version: str) -> bool:
+     """Check if SDK version is compatible with required version."""
+     try:
+         required = tuple(map(int, required_version.split(".")))
+         current = tuple(map(int, VERSION.split(".")))
+         return current >= required
+     except (ValueError, AttributeError):
+         return False
+
+
+ __all__ = [
+     "VERSION",
+     "AUTHOR",
+     "AUTHOR_EMAIL",
+     "LICENSE",
+     "PROJECT_URL",
+     "VersionInfo",
+     "ProjectInfo",
+     "VERSION_INFO",
+     "PROJECT_INFO",
+     "get_version",
+     "is_debug_mode",
+     "check_compatibility",
+ ]
+
+
+ # Debug output (logging is imported and __all__ is defined above)
+ if os.getenv("UNREALON_DEBUG", "").lower() in ("1", "true", "debug"):
+     logger = logging.getLogger(__name__)
+     logger.info(f"🚀 UnrealOn SDK v{VERSION} loaded")
+     logger.info(f" 🎯 Service-based architecture")
+     logger.info(f" 📦 KISS principle - simple & reliable")
+     logger.info(f" 🔗 Available services: {', '.join(__all__)}")
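
For reference, a minimal usage sketch of the new centralized config (not part of the wheel contents; it assumes the 1.0.6 package is installed and uses only the names exported above):

import unrealon
from unrealon.sdk_config import check_compatibility

print(unrealon.__version__)      # "1.0.6", re-exported from unrealon.sdk_config
print(unrealon.get_version())    # same value via the helper
print(unrealon.is_debug_mode())  # True only when UNREALON_DEBUG is "1", "true", or "debug"

# check_compatibility() compares dotted version tuples against the running SDK.
assert check_compatibility("1.0.4") is True   # 1.0.6 >= 1.0.4
assert check_compatibility("2.0.0") is False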
unrealon-1.0.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: unrealon
- Version: 1.0.4
+ Version: 1.0.6
  Summary: AI-powered web scraping platform with real-time orchestration
  License: MIT
  Author: Unrealon Team
unrealon-1.0.6.dist-info/RECORD CHANGED
@@ -1,5 +1,7 @@
+ unrealon/__init__.py,sha256=IWhHl4jLgYR5HjEtHR1_-BF3tUkXJpP67IqqekdsVxk,494
+ unrealon/sdk_config.py,sha256=ak2gRMEo2I-YtiJplOhgNIdsQnnoZlKx6c87b8ws6EI,2118
  unrealon_browser/README.md,sha256=9pP6RrfMGHtdT5uDLFAUB1e4nNGzZudXViEo1940gKw,396
- unrealon_browser/__init__.py,sha256=ClwFNHvhcUJzqYdHuA7hODnBWKke8_CqvMXsPxLmEIg,622
+ unrealon_browser/__init__.py,sha256=pLHhYYhYJmzrED1Chi3uQAETVFxqQrikwEAwO2TTZ70,631
  unrealon_browser/pyproject.toml,sha256=_PTGU6Pwh7antWDqZeA6KU-Vx3Xw4jwTlU_Wgt6t0Cg,4945
  unrealon_browser/src/__init__.py,sha256=iw8FNzemhVx-AYGQVUthpW59IZy8FxCXpy0DvhRok4c,1339
  unrealon_browser/src/cli/__init__.py,sha256=b3r88oeCYsqZF8EU8EZXP9v54Q8cIimN7UmxJsXcB84,264
@@ -23,7 +25,7 @@ unrealon_browser/src/managers/logger_bridge.py,sha256=d4H67QcJOdupn_VpCNjZqsI5w3
  unrealon_browser/src/managers/profile.py,sha256=jrhjDgik697abHvioWub0smHb9YZWJy4opFPqdZli5A,18179
  unrealon_browser/src/managers/stealth.py,sha256=eSLAqpCHyyntUD1RzZC0jpNpYFuHpKl4J9WxmICx3Ww,13890
  unrealon_driver/README.md,sha256=KHcX4P_C58wh3PjpZS2sYf-GRUs3lsBcioi-1gbcHS8,5254
- unrealon_driver/__init__.py,sha256=Ze-6b2wdZY6mIJg97uPYdMt2ZkLJHj-gbNJPTZWJ5Xk,605
+ unrealon_driver/__init__.py,sha256=q1whUYDWp5ZLs01Y6YrMiuXmPuG_b3fRDfP_dKIoNI4,614
  unrealon_driver/pyproject.toml,sha256=_YeD0-KH1O5KrWNrPmK7mLDk_BkFhFrrOfNtIYbv-0M,4589
  unrealon_driver/src/__init__.py,sha256=0LA_rASCvvo42-7mlU2F1XFfEcrYIicCC4_E3nwj_wg,2450
  unrealon_driver/src/cli/__init__.py,sha256=6AE6FJoXxhr5bMGn9PVuavryEsvcjMiGFbQdn2c6L6o,260
@@ -61,7 +63,7 @@ unrealon_driver/src/utils/__init__.py,sha256=XIvXAbiMUNGXdTl5yxIfjYs8CdKTxBpI_ps
  unrealon_driver/src/utils/service_factory.py,sha256=D9aefhF4px2y7BB1JcFaRM3g7Izb_VIzrGuDznA3p9o,10941
  unrealon_driver/src/utils/time_formatter.py,sha256=5Vm0WTqc1X5rYBl8yA0_GFEunV5EgXXL_fra3YoEfHs,10105
  unrealon_llm/README.md,sha256=ln5eOG5igajQy7e0LEHxDSjGNL3yLtPBeNnj4YZ7Q2c,1102
- unrealon_llm/__init__.py,sha256=0NGg06sK5C-VS2KoBYDF2SnJgukNUkOiJw3Whf34_gQ,611
+ unrealon_llm/__init__.py,sha256=nN4_w3AIzMhMKkSwYppYLqvCzJ73jE302WRvhVqP80c,620
  unrealon_llm/pyproject.toml,sha256=to4asInBDsxznIp9KP1dZELcQLzikzd2AJe1AAp87r0,3755
  unrealon_llm/src/__init__.py,sha256=ga0mWhimHG6Xt-UJLKFb3IvpBawJtUWlSqkZLu8Q0Xg,5338
  unrealon_llm/src/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -89,9 +91,9 @@ unrealon_llm/src/managers/cost_manager.py,sha256=Bu4LUWcKB9JSwIz2m5FxAe5iEN3dVRX
  unrealon_llm/src/managers/request_manager.py,sha256=oMsn2x1P6AF_6C84kmJrl_SCTHpWzgBkR50M9-bIyd0,10702
  unrealon_llm/src/modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unrealon_llm/src/modules/html_processor/__init__.py,sha256=-Z3ExABUbUn0l8GEZhbW3WYFJGv7uheGpftiyOji0yg,552
- unrealon_llm/src/modules/html_processor/base_processor.py,sha256=BUSK3wou_KWxxr9nFgFAK_HrZCbVjt44EwM6zGZLIP8,10930
- unrealon_llm/src/modules/html_processor/details_processor.py,sha256=XAQFlf0OvfcgjXDd7cAV_LUf07WrnS_3V2ia3zHPZTA,2033
- unrealon_llm/src/modules/html_processor/listing_processor.py,sha256=PSdFlSlZLZ_dIUS8d_umJKBepPDRyijEv9zruioX2r4,2271
+ unrealon_llm/src/modules/html_processor/base_processor.py,sha256=azggKCsE52cUnRAFi-7bak7LZPrYrrH5tJFWxI-HoBo,16423
+ unrealon_llm/src/modules/html_processor/details_processor.py,sha256=eiayCPIB9p4PSDtMUEgdPhtb8yu-LqN-WkFd86BuhE4,3479
+ unrealon_llm/src/modules/html_processor/listing_processor.py,sha256=h8D79XnVsWkEqcbDoVhcsbS1LYV2AngVo1ROy5oRD7o,3669
  unrealon_llm/src/modules/html_processor/models/__init__.py,sha256=Hn-ztJWI4SokVxhx-cakm5h_Xhjg0rn9_IXyToIjwRE,420
  unrealon_llm/src/modules/html_processor/models/processing_models.py,sha256=SoOE8KCwivU3FGLK3fxx0rKtKXyI-C7ibEHyWCApZAQ,1460
  unrealon_llm/src/modules/html_processor/models/universal_model.py,sha256=Zi3L_t4rcwHpbJKADWSdK7yBErVIGBf3ZRCsZxUsMDM,1738
@@ -108,7 +110,7 @@ unrealon_llm/src/utils/models_cache.py,sha256=IL1POtqFIybRyOakMEZnRLkqFx7FCfk1gV
  unrealon_llm/src/utils/smart_counter.py,sha256=F96Y_Yj-5uuio4hkgkuVjjAsRSgu0_LfQxFJKOu_cbA,8048
  unrealon_llm/src/utils/token_counter.py,sha256=TB0cNbjipiVkin_6yra4VjpbcUZqLEfVv8cgTPTcX9s,6209
  unrealon_sdk/README.md,sha256=_rRg7h9Fqsc-w_kkv8FM1wHF7dL6TH7-WRr79iJOM80,470
- unrealon_sdk/__init__.py,sha256=qjFvskNOMfE3UBGTC2todD7zmYgNJZlRtPo1JE8FYqw,755
+ unrealon_sdk/__init__.py,sha256=4XWzXb7QYSL1s9v7gVci6Ks3eqv5TAU19FgaVigWpRM,764
  unrealon_sdk/pyproject.toml,sha256=WsbZqROrphT3fXvOTF4200ZiVTMjJjnavRtoqipH1UQ,5799
  unrealon_sdk/src/__init__.py,sha256=skeneU3XDXF6_nsEKbBZ4Gq5uHXU0v1vArVOGJ6Asxo,3552
  unrealon_sdk/src/cli/__init__.py,sha256=0jzVnvi6pCOA8TxkyE9rDmJPvesYvXGKGWJiKDd0Kd8,180
@@ -238,7 +240,7 @@ unrealon_sdk/src/internal/http_client.py,sha256=uU3BdNYj4ZL16y0BpBxOtWLOo-pE-8LW
  unrealon_sdk/src/internal/websocket_client.py,sha256=1TteTv_6dUMXS5xTXwld6XB2Q0hcOsycLv9l_KEB8aA,15700
  unrealon_sdk/src/provider.py,sha256=kyKjUjuo6s8hcTld8gIc7aO4SM7ozhsUlIM0EXOyblw,14104
  unrealon_sdk/src/utils.py,sha256=nj8a83a7p_RXA985yRdHQxPr2S4rwKiwp1wD3qj7EEU,5440
- unrealon-1.0.4.dist-info/LICENSE,sha256=eEH8mWZW49YMpl4Sh5MtKqkZ8aVTzKQXiNPEnvL14ns,1070
- unrealon-1.0.4.dist-info/METADATA,sha256=y5iPh0hGKFGc87R9ydbNdtstgynXL8ACisIsjUrRAlc,29082
- unrealon-1.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- unrealon-1.0.4.dist-info/RECORD,,
+ unrealon-1.0.6.dist-info/LICENSE,sha256=eEH8mWZW49YMpl4Sh5MtKqkZ8aVTzKQXiNPEnvL14ns,1070
+ unrealon-1.0.6.dist-info/METADATA,sha256=_XESL3BDHrP03yZDXTZT7tfpI72rmOy8GpWmTlRzurg,29082
+ unrealon-1.0.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ unrealon-1.0.6.dist-info/RECORD,,
unrealon_browser/__init__.py CHANGED
@@ -13,7 +13,7 @@ __description__ = "Enterprise browser automation with stealth capabilities"
 
 
  # Import from centralized config
- from sdk_config import (
+ from unrealon.sdk_config import (
  VERSION as __version__,
  AUTHOR as __author__,
  AUTHOR_EMAIL as __email__,
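
Note on this hunk (the same one-line change appears in the unrealon_driver, unrealon_llm, and unrealon_sdk __init__.py hunks below): the 1.0.4 RECORD shown above begins at unrealon_browser/README.md, so no top-level sdk_config module was shipped and the bare `from sdk_config import ...` could only resolve against a loose sdk_config.py on sys.path. Version 1.0.6 adds unrealon/sdk_config.py and switches every subpackage to the package-qualified import. A minimal sketch of the new import path (illustrative, not part of the diff):

from unrealon.sdk_config import VERSION as __version__, AUTHOR as __author__

print(__version__, __author__)  # 1.0.6 UnrealOn Team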
unrealon_driver/__init__.py CHANGED
@@ -13,7 +13,7 @@ __description__ = "Simple, modular parser development tools for UnrealOn SDK"
 
 
  # Import from centralized config
- from sdk_config import (
+ from unrealon.sdk_config import (
  VERSION as __version__,
  AUTHOR as __author__,
  AUTHOR_EMAIL as __email__,
unrealon_llm/__init__.py CHANGED
@@ -13,7 +13,7 @@ __description__ = "Large Language Model integration tools for UnrealOn SDK"
 
 
  # Import from centralized config
- from sdk_config import (
+ from unrealon.sdk_config import (
  VERSION as __version__,
  AUTHOR as __author__,
  AUTHOR_EMAIL as __email__,
unrealon_llm/src/modules/html_processor/base_processor.py CHANGED
@@ -10,6 +10,7 @@ import json
  import random
  from typing import Type
  import traceback
+ import re
 
  from unrealon_llm.src.core import SmartLLMClient
  from unrealon_llm.src.dto import ChatMessage, MessageRole
@@ -77,6 +78,10 @@ class BaseHTMLProcessor(ABC):
  """Return extraction prompt template for this processor type"""
  pass
 
+ def _trim_system_prompt(self, system_prompt: str) -> str:
+ """Trim system prompt to remove empty lines"""
+ return "\n".join(system_prompt.split("\n")[1:])
+
  async def extract_patterns(self, html_content: str) -> ExtractionResult:
  """
  Extract patterns from HTML using LLM intelligence
@@ -116,15 +121,27 @@
  prompt_tokens=0,
  details={
  "full_prompt": prompt[:2000] + "..." if len(prompt) > 2000 else prompt,
- "schema_json": json.dumps(self.schema_class.model_json_schema(), indent=2)
- }
+ "schema_json": json.dumps(
+ self.schema_class.model_json_schema(), indent=2
+ ),
+ },
  )
 
+ # Add critical format requirements to the prompt
+ SYSTEM_PROMPT = f"""
+ You are an HTML-to-JSON expert at analyzing {self.processor_type} pages.
+ You MUST return JSON that EXACTLY matches the Pydantic schema provided.
+ RESPOND ONLY WITH VALID JSON.
+ NO EXPLANATIONS, NO TEXT, ONLY JSON!
+ Include ALL required fields from the schema!
+ CRITICAL: The 'selectors' field must be a DICTIONARY/OBJECT, not a list!
+ """
+
  # Prepare LLM messages
  messages = [
  ChatMessage(
  role=MessageRole.SYSTEM,
- content=f"You are an HTML-to-JSON expert at analyzing {self.processor_type} pages. You MUST return JSON that EXACTLY matches the Pydantic schema provided. RESPOND ONLY WITH VALID JSON. NO EXPLANATIONS, NO TEXT, ONLY JSON! Include ALL required fields from the schema!",
+ content=self._trim_system_prompt(SYSTEM_PROMPT),
  ),
  ChatMessage(
  role=MessageRole.USER,
@@ -144,8 +161,7 @@
  try:
  # Call LLM
  response = await self.llm_client.chat_completion(
- messages,
- response_model=self.schema_class
+ messages, response_model=self.schema_class
  )
 
  # Log full LLM response for debugging
@@ -167,7 +183,7 @@
  )
 
  # Use the validated model from LLM response
- if hasattr(response, 'extracted_model') and response.extracted_model:
+ if hasattr(response, "extracted_model") and response.extracted_model:
  validated_model = response.extracted_model
  validated_result = validated_model.model_dump()
  logger.log_html_analysis_completed(
@@ -203,8 +219,36 @@
  "raw_llm_response": result_data,
  },
  )
- # Fall back to raw data
- validated_result = result_data
+
+ # 🔥 SMART FALLBACK: Try to fix common LLM format issues
+ try:
+ fixed_data = self._fix_llm_response_format(result_data, str(e))
+ validated_model = self.schema_class(**fixed_data)
+ validated_result = validated_model.model_dump()
+ logger.log_html_analysis_completed(
+ selectors_generated=len(str(fixed_data)),
+ confidence_score=fixed_data.get("confidence", 0.0),
+ details={
+ "processor_type": self.processor_type,
+ "validation_success": True,
+ "schema_matched": True,
+ "format_fixed": True,
+ },
+ )
+ except Exception as fix_error:
+ logger.log_html_analysis_failed(
+ error_message=f"Format fixing also failed: {str(fix_error)}",
+ details={
+ "processor_type": self.processor_type,
+ "validation_error": str(e),
+ "fix_error": str(fix_error),
+ "raw_llm_response": result_data,
+ },
+ )
+ # Final fallback: create minimal valid structure
+ validated_result = self._create_fallback_result(
+ result_data, str(e)
+ )
 
  # Create Pydantic processing metadata
  processing_info = ProcessingInfo(
@@ -253,12 +297,20 @@
 
  # Add random number to bypass any caching
  cache_buster = random.randint(100000, 999999)
-
- schema_prompt = f"""PYDANTIC 2 SCHEMA (Request #{cache_buster}):
- {schema_json}
 
- CRITICAL: Return JSON that EXACTLY matches this schema structure!
- The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation."""
+ schema_prompt = f"""
+ PYDANTIC 2 SCHEMA (Request #{cache_buster}):
+ {schema_json}
+
+ 🚨 CRITICAL FORMAT REQUIREMENTS:
+ 1. Return JSON that EXACTLY matches this schema structure!
+ 2. The response must include ALL required fields: detected_item_type, extraction_strategy, confidence, selectors, documentation
+ 3. The "selectors" field MUST be a DICTIONARY/OBJECT with field names as keys and arrays of CSS selectors as values
+ 4. Example: "selectors": {{"title": ["h1.title", ".product-name"], "price": [".price", ".cost"]}}
+ 5. DO NOT return "selectors" as a list: ❌ ["h1.title", ".price"]
+ 6. DO return "selectors" as a dictionary: ✅ {{"title": ["h1.title"], "price": [".price"]}}
+ """
+ schema_prompt = self._trim_system_prompt(schema_prompt)
 
  return prompt_template.format(
  processor_type=self.processor_type,
@@ -292,3 +344,72 @@
  estimated_cost = (total_tokens / 1_000_000) * 0.25
 
  return estimated_cost
+
+ def _fix_llm_response_format(self, result_data: dict, error_message: str) -> dict:
+ """Fix common LLM response format issues."""
+ fixed_data = result_data.copy()
+
+ # Fix selectors if it's a list instead of dict
+ if "selectors" in fixed_data and isinstance(fixed_data["selectors"], list):
+ logger.log_html_analysis_failed(
+ error_message="Fixing selectors format: list -> dict",
+ details={
+ "processor_type": self.processor_type,
+ "original_selectors": fixed_data["selectors"],
+ },
+ )
+
+ # Convert list to dict with generic field names
+ selectors_list = fixed_data["selectors"]
+ fixed_data["selectors"] = {}
+
+ # Try to intelligently map list items to field names
+ field_names = ["item", "title", "price", "description", "image", "link"]
+ for i, selector in enumerate(selectors_list):
+ if i < len(field_names):
+ field_name = field_names[i]
+ else:
+ field_name = f"field_{i+1}"
+
+ # Convert single selector to list
+ if isinstance(selector, str):
+ fixed_data["selectors"][field_name] = [selector]
+ elif isinstance(selector, list):
+ fixed_data["selectors"][field_name] = selector
+ else:
+ fixed_data["selectors"][field_name] = [str(selector)]
+
+ # Ensure all required fields exist
+ required_fields = [
+ "detected_item_type",
+ "extraction_strategy",
+ "confidence",
+ "selectors",
+ "documentation",
+ ]
+ for field in required_fields:
+ if field not in fixed_data:
+ if field == "detected_item_type":
+ fixed_data[field] = "unknown"
+ elif field == "extraction_strategy":
+ fixed_data[field] = "fallback_strategy"
+ elif field == "confidence":
+ fixed_data[field] = 0.1
+ elif field == "selectors":
+ fixed_data[field] = {}
+ elif field == "documentation":
+ fixed_data[field] = (
+ "Extraction completed with fallback processing due to format issues."
+ )
+
+ return fixed_data
+
+ def _create_fallback_result(self, result_data: dict, error_message: str) -> dict:
+ """Create a minimal valid result when all else fails."""
+ return {
+ "detected_item_type": "unknown",
+ "extraction_strategy": "fallback_strategy",
+ "confidence": 0.1,
+ "selectors": {},
+ "documentation": f"Extraction failed due to validation error: {error_message}. Raw data: {str(result_data)[:500]}...",
+ }
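
To make the new fallback path concrete, here is a standalone sketch of the list-to-dict repair that `_fix_llm_response_format` applies when the LLM returns `selectors` as a list (illustrative only; the real method also logs the repair, passes list-valued entries through unchanged, and back-fills the other required schema fields):

# Hypothetical malformed LLM payload: "selectors" came back as a list.
bad = {"confidence": 0.7, "selectors": ["h1.title", ".price", ".description"]}

field_names = ["item", "title", "price", "description", "image", "link"]
fixed = dict(bad, selectors={})
for i, selector in enumerate(bad["selectors"]):
    key = field_names[i] if i < len(field_names) else f"field_{i+1}"
    fixed["selectors"][key] = [selector] if isinstance(selector, str) else [str(selector)]

print(fixed["selectors"])
# {'item': ['h1.title'], 'title': ['.price'], 'price': ['.description']}

The generic field names are positional guesses, which is why the diff also injects the dictionary-format requirement into both the system prompt and the schema prompt, and keeps `_create_fallback_result` as a last resort.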
unrealon_llm/src/modules/html_processor/details_processor.py CHANGED
@@ -25,37 +25,61 @@ class DetailsProcessor(BaseHTMLProcessor):
  def get_extraction_prompt_template(self) -> str:
  """Return details-specific extraction prompt template"""
 
- return """{schema}
-
- [__TASK_DESCRIPTION__]
- Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
- PROCESSOR TYPE: {processor_type}
- THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
- [/__TASK_DESCRIPTION__]
-
- [__INSTRUCTIONS__]
- YOUR TASK:
- Analyze this details page and generate extraction patterns for ANY type of item.
- This could be: product details, service info, article content, job description, real estate listing, person profile, etc.
-
- CRITICAL REQUIREMENTS:
- 1. Return simple CSS selectors in the "selectors" object
- 2. Include comprehensive markdown documentation
- 3. Provide real examples from the actual HTML
- 4. Explain the page structure and best extraction approach
- 5. Include confidence scores and fallback strategies
- 6. Document any special handling needed
-
- ANALYZE THE HTML AND DETERMINE:
- - What type of item this page describes
- - What information is available (specs, pricing, reviews, etc.)
- - How content is structured and organized
- - What actions are possible (buy, contact, etc.)
- - Best extraction strategy for this specific page
- [/__INSTRUCTIONS__]
-
- [__HTML_CONTENT__]
- HTML CONTENT (first 50KB):
- {html_content}
- [/__HTML_CONTENT__]
- """
+ prompt = """{schema}
+ [__TASK_DESCRIPTION__]
+ Analyze this DETAILS/PRODUCT/ITEM page and generate universal extraction patterns.
+ PROCESSOR TYPE: {processor_type}
+ THIS IS A DETAILS PAGE containing information about a single item/product/service/article.
+ [/__TASK_DESCRIPTION__]
+
+ [__CRITICAL_FORMAT_REQUIREMENTS__]
+ 🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
+ Example of CORRECT format:
+ "selectors": {{
+ "title": ["h1.product-title", "h1.page-title", ".item-name"],
+ "price": [".price", ".cost", "span[data-price]", ".product-price"],
+ "description": [".description", ".product-desc", ".item-details"],
+ "images": ["img.product-image", ".gallery img", "img[src*='product']"],
+ "specifications": [".specs", ".product-specs", ".item-specifications"],
+ "reviews": [".reviews", ".product-reviews", ".customer-reviews"]
+ }}
+
+ WRONG format (DO NOT USE):
+ "selectors": ["h1.title", ".price", ".description"]
+
+ CORRECT format (USE THIS):
+ "selectors": {{
+ "title": ["h1.title", ".product-name", "h1[itemprop='name']"],
+ "price": [".price", ".cost", "span[data-price]"],
+ "description": [".description", ".product-desc", ".item-details"]
+ }}
+ [/__CRITICAL_FORMAT_REQUIREMENTS__]
+
+ [__INSTRUCTIONS__]
+ YOUR TASK:
+ Analyze this details page and generate extraction patterns for ANY type of item.
+ This could be: product details, service info, article content, job description, real estate listing, person profile, etc.
+
+ CRITICAL REQUIREMENTS:
+ 1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
+ 2. Include comprehensive markdown documentation
+ 3. Provide real examples from the actual HTML
+ 4. Explain the page structure and best extraction approach
+ 5. Include confidence scores and fallback strategies
+ 6. Document any special handling needed
+
+ ANALYZE THE HTML AND DETERMINE:
+ - What type of item this page describes
+ - What information is available (specs, pricing, reviews, etc.)
+ - How content is structured and organized
+ - What actions are possible (buy, contact, etc.)
+ - Best extraction strategy for this specific page
+ [/__INSTRUCTIONS__]
+
+ [__HTML_CONTENT__]
+ HTML CONTENT (first 50KB):
+ {html_content}
+ [/__HTML_CONTENT__]
+ """
+
+ return self._trim_system_prompt(prompt)
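
One detail worth noting when reading this template and the listing template below: `_trim_system_prompt` (added in base_processor.py above) drops only the first line of whatever string it receives. For the f-string prompts in the base class that first line is empty, but here the first line is the `{schema}` placeholder itself, so the trimmed template appears to lose it before `prompt_template.format(...)` runs. A small standalone sketch of that behavior (re-implemented for illustration, not SDK code):

def trim_first_line(text: str) -> str:
    # Mirrors BaseHTMLProcessor._trim_system_prompt: keep everything after line 1.
    return "\n".join(text.split("\n")[1:])

system_prompt = """
You are an HTML-to-JSON expert..."""
details_template = """{schema}
[__TASK_DESCRIPTION__]
..."""

print(trim_first_line(system_prompt).splitlines()[0])     # You are an HTML-to-JSON expert...
print(trim_first_line(details_template).splitlines()[0])  # [__TASK_DESCRIPTION__] -- the {schema} line is gone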
unrealon_llm/src/modules/html_processor/listing_processor.py CHANGED
@@ -25,43 +25,67 @@ class ListingProcessor(BaseHTMLProcessor):
  def get_extraction_prompt_template(self) -> str:
  """Return listing-specific extraction prompt template"""
 
- return """{schema}
-
- [__TASK_DESCRIPTION__]
- Analyze this LISTING/CATALOG page and generate universal extraction patterns.
- PROCESSOR TYPE: {processor_type}
- THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
- [/__TASK_DESCRIPTION__]
-
- [__INSTRUCTIONS__]
- YOUR TASK:
- Analyze this listing page and generate extraction patterns for ANY type of items.
- This could be: products, services, articles, jobs, real estate, people, cars, etc.
-
- CRITICAL REQUIREMENTS:
- 1. Return simple CSS selectors in the "selectors" object
- 2. This is a LISTING PAGE with multiple items
- 3. Focus on identifying item containers and individual item patterns
- 4. Detect ANY type of items - not just products!
- 5. Provide multiple fallback selectors for reliability
- 6. Include pagination and navigation patterns
- 7. Use realistic confidence scores (0.1-1.0)
- 8. Auto-detect what type of content this listing contains
- 9. Provide extraction strategy advice
- 10. Look for structured data (JSON-LD, microdata)
- 11. Generate patterns that work with BeautifulSoup4 .select() method
- 12. RETURN JSON that EXACTLY matches the Pydantic schema above!
-
- ANALYZE THE HTML AND DETERMINE:
- - What type of items are listed (products, services, articles, etc.)
- - How items are structured and contained
- - What navigation elements exist
- - What metadata is available
- - Best extraction strategy for this specific page
- [/__INSTRUCTIONS__]
-
- [__HTML_CONTENT__]
- HTML CONTENT (first 50KB):
- {html_content}
- [/__HTML_CONTENT__]
- """
+ prompt = """{schema}
+
+ [__TASK_DESCRIPTION__]
+ Analyze this LISTING/CATALOG page and generate universal extraction patterns.
+ PROCESSOR TYPE: {processor_type}
+ THIS IS A LISTING PAGE containing multiple items arranged in a list or grid.
+ [/__TASK_DESCRIPTION__]
+
+ [__CRITICAL_FORMAT_REQUIREMENTS__]
+ 🚨 SELECTORS FORMAT: The "selectors" field MUST be a DICTIONARY/OBJECT, NOT a list!
+ Example of CORRECT format:
+ "selectors": {{
+ "items_container": ["div.product-grid", "ul.product-list", "div.items"],
+ "item_title": ["h3.product-title", "a.product-link", ".item-name"],
+ "item_price": [".price", ".cost", "span[data-price]"],
+ "item_image": ["img.product-image", ".item-img", "img[src*='product']"],
+ "pagination": [".pagination", ".page-nav", "nav[aria-label='pagination']"]
+ }}
+
+ WRONG format (DO NOT USE):
+ "selectors": ["div.product", "h3.title", ".price"]
+
+ CORRECT format (USE THIS):
+ "selectors": {{
+ "items": ["div.product", "li.item", ".product-card"],
+ "titles": ["h3.title", ".product-name", "a[title]"],
+ "prices": [".price", ".cost", "span[data-price]"]
+ }}
+ [/__CRITICAL_FORMAT_REQUIREMENTS__]
+
+ [__INSTRUCTIONS__]
+ YOUR TASK:
+ Analyze this listing page and generate extraction patterns for ANY type of items.
+ This could be: products, services, articles, jobs, real estate, people, cars, etc.
+
+ CRITICAL REQUIREMENTS:
+ 1. The "selectors" field MUST be a DICTIONARY with field names as keys and arrays of CSS selectors as values
+ 2. This is a LISTING PAGE with multiple items
+ 3. Focus on identifying item containers and individual item patterns
+ 4. Detect ANY type of items - not just products!
+ 5. Provide multiple fallback selectors for reliability
+ 6. Include pagination and navigation patterns
+ 7. Use realistic confidence scores (0.1-1.0)
+ 8. Auto-detect what type of content this listing contains
+ 9. Provide extraction strategy advice
+ 10. Look for structured data (JSON-LD, microdata)
+ 11. Generate patterns that work with BeautifulSoup4 .select() method
+ 12. RETURN JSON that EXACTLY matches the Pydantic schema above!
+
+ ANALYZE THE HTML AND DETERMINE:
+ - What type of items are listed (products, services, articles, etc.)
+ - How items are structured and contained
+ - What navigation elements exist
+ - What metadata is available
+ - Best extraction strategy for this specific page
+ [/__INSTRUCTIONS__]
+
+ [__HTML_CONTENT__]
+ HTML CONTENT (first 50KB):
+ {html_content}
+ [/__HTML_CONTENT__]
+ """
+
+ return self._trim_system_prompt(prompt)
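
Requirement 11 above says the generated patterns should work with BeautifulSoup4's .select(). A small sketch of consuming a selectors dictionary of the required shape (hypothetical selectors and HTML, not part of the SDK):

from bs4 import BeautifulSoup

html = "<ul><li class='item'><h3 class='title'>Widget</h3><span class='price'>9.99</span></li></ul>"
selectors = {"items": ["li.item"], "titles": ["h3.title"], "prices": [".price"]}

soup = BeautifulSoup(html, "html.parser")
for field, candidates in selectors.items():
    # Try each fallback CSS selector in order until one matches.
    for css in candidates:
        matches = soup.select(css)
        if matches:
            print(field, [m.get_text(strip=True) for m in matches])
            break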
unrealon_sdk/__init__.py CHANGED
@@ -16,7 +16,7 @@ __description__ = "Enterprise Parsing Platform SDK for UnrealOn"
 
 
  # Import from centralized config
- from sdk_config import (
+ from unrealon.sdk_config import (
  VERSION as __version__,
  AUTHOR as __author__,
  AUTHOR_EMAIL as __email__,