unrealon 1.1.1__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. unrealon/__init__.py +16 -6
  2. unrealon-1.1.4.dist-info/METADATA +658 -0
  3. unrealon-1.1.4.dist-info/RECORD +54 -0
  4. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/entry_points.txt +1 -1
  5. unrealon_browser/__init__.py +3 -6
  6. unrealon_browser/core/browser_manager.py +86 -84
  7. unrealon_browser/dto/models/config.py +2 -0
  8. unrealon_browser/managers/captcha.py +165 -185
  9. unrealon_browser/managers/cookies.py +57 -28
  10. unrealon_browser/managers/logger_bridge.py +94 -34
  11. unrealon_browser/managers/profile.py +186 -158
  12. unrealon_browser/managers/stealth.py +58 -47
  13. unrealon_driver/__init__.py +8 -21
  14. unrealon_driver/exceptions.py +5 -0
  15. unrealon_driver/html_analyzer/__init__.py +32 -0
  16. unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py} +330 -405
  17. unrealon_driver/html_analyzer/config.py +64 -0
  18. unrealon_driver/html_analyzer/manager.py +247 -0
  19. unrealon_driver/html_analyzer/models.py +115 -0
  20. unrealon_driver/html_analyzer/websocket_analyzer.py +157 -0
  21. unrealon_driver/models/__init__.py +31 -0
  22. unrealon_driver/models/websocket.py +98 -0
  23. unrealon_driver/parser/__init__.py +4 -23
  24. unrealon_driver/parser/cli_manager.py +6 -5
  25. unrealon_driver/parser/daemon_manager.py +242 -66
  26. unrealon_driver/parser/managers/__init__.py +0 -21
  27. unrealon_driver/parser/managers/config.py +15 -3
  28. unrealon_driver/parser/parser_manager.py +225 -395
  29. unrealon_driver/smart_logging/__init__.py +24 -0
  30. unrealon_driver/smart_logging/models.py +44 -0
  31. unrealon_driver/smart_logging/smart_logger.py +406 -0
  32. unrealon_driver/smart_logging/unified_logger.py +525 -0
  33. unrealon_driver/websocket/__init__.py +31 -0
  34. unrealon_driver/websocket/client.py +249 -0
  35. unrealon_driver/websocket/config.py +188 -0
  36. unrealon_driver/websocket/manager.py +90 -0
  37. unrealon-1.1.1.dist-info/METADATA +0 -722
  38. unrealon-1.1.1.dist-info/RECORD +0 -82
  39. unrealon_bridge/__init__.py +0 -114
  40. unrealon_bridge/cli.py +0 -316
  41. unrealon_bridge/client/__init__.py +0 -93
  42. unrealon_bridge/client/base.py +0 -78
  43. unrealon_bridge/client/commands.py +0 -89
  44. unrealon_bridge/client/connection.py +0 -90
  45. unrealon_bridge/client/events.py +0 -65
  46. unrealon_bridge/client/health.py +0 -38
  47. unrealon_bridge/client/html_parser.py +0 -146
  48. unrealon_bridge/client/logging.py +0 -139
  49. unrealon_bridge/client/proxy.py +0 -70
  50. unrealon_bridge/client/scheduler.py +0 -450
  51. unrealon_bridge/client/session.py +0 -70
  52. unrealon_bridge/configs/__init__.py +0 -14
  53. unrealon_bridge/configs/bridge_config.py +0 -212
  54. unrealon_bridge/configs/bridge_config.yaml +0 -39
  55. unrealon_bridge/models/__init__.py +0 -138
  56. unrealon_bridge/models/base.py +0 -28
  57. unrealon_bridge/models/command.py +0 -41
  58. unrealon_bridge/models/events.py +0 -40
  59. unrealon_bridge/models/html_parser.py +0 -79
  60. unrealon_bridge/models/logging.py +0 -55
  61. unrealon_bridge/models/parser.py +0 -63
  62. unrealon_bridge/models/proxy.py +0 -41
  63. unrealon_bridge/models/requests.py +0 -95
  64. unrealon_bridge/models/responses.py +0 -88
  65. unrealon_bridge/models/scheduler.py +0 -592
  66. unrealon_bridge/models/session.py +0 -28
  67. unrealon_bridge/server/__init__.py +0 -91
  68. unrealon_bridge/server/base.py +0 -171
  69. unrealon_bridge/server/handlers/__init__.py +0 -23
  70. unrealon_bridge/server/handlers/command.py +0 -110
  71. unrealon_bridge/server/handlers/html_parser.py +0 -139
  72. unrealon_bridge/server/handlers/logging.py +0 -95
  73. unrealon_bridge/server/handlers/parser.py +0 -95
  74. unrealon_bridge/server/handlers/proxy.py +0 -75
  75. unrealon_bridge/server/handlers/scheduler.py +0 -545
  76. unrealon_bridge/server/handlers/session.py +0 -66
  77. unrealon_driver/browser/__init__.py +0 -8
  78. unrealon_driver/browser/config.py +0 -74
  79. unrealon_driver/browser/manager.py +0 -416
  80. unrealon_driver/parser/managers/browser.py +0 -51
  81. unrealon_driver/parser/managers/logging.py +0 -609
  82. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/WHEEL +0 -0
  83. {unrealon-1.1.1.dist-info → unrealon-1.1.4.dist-info}/licenses/LICENSE +0 -0
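The listing above captures the 1.1.4 restructuring: the entire unrealon_bridge package is deleted along with the old unrealon_driver.browser and unrealon_driver.parser.managers helpers, while new html_analyzer, smart_logging, models, and websocket packages appear under unrealon_driver. As a rough orientation, a hedged import-path sketch of the move; the module paths come from the listing above, the class and factory names from the cleaner.py diff below, and anything beyond that is an assumption:

```python
# 1.1.1 imports that no longer resolve in 1.1.4 (their files are deleted above):
# from unrealon_bridge.client import ...                        # whole package removed
# from unrealon_driver.parser.managers.html import HTMLManager  # renamed, see below

# 1.1.4 equivalents, per the renamed and added files in this diff:
from unrealon_driver.html_analyzer.cleaner import HTMLCleaner, create_html_cleaner
from unrealon_driver.smart_logging import create_smart_logger
```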
unrealon_driver/{parser/managers/html.py → html_analyzer/cleaner.py}
@@ -1,123 +1,28 @@
 """
-HTML Manager - Smart HTML processing and cleaning with Pydantic v2
+Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization.

-Strict compliance with CRITICAL_REQUIREMENTS.md:
-- No Dict[str, Any] usage
-- Complete type annotations
-- Pydantic v2 models everywhere
-- Custom exception hierarchy
+Intelligent HTML cleaning that removes noise but preserves useful data.
+Optimizes HTML for LLM token efficiency while keeping valuable content.
 """

 import json
 import re
-from typing import Optional, List, Union
-from pathlib import Path
-from pydantic import BaseModel, Field, ConfigDict, field_validator
 import asyncio
 import concurrent.futures
+from typing import Optional, List, Dict, Any, Tuple
+from pydantic import BaseModel, Field, ConfigDict

 from bs4 import BeautifulSoup, Comment
+from unrealon_driver.smart_logging import create_smart_logger

-from unrealon_rpc.logging import get_logger
-
-
-class HTMLCleaningConfig(BaseModel):
-    """HTML cleaning configuration with strict typing"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
-    # Cleaning modes
-    aggressive_cleaning: bool = Field(
-        default=True,
-        description="Enable aggressive cleaning"
-    )
-    preserve_js_data: bool = Field(
-        default=True,
-        description="Preserve JavaScript data during cleaning"
-    )
-
-    # Content preservation
-    preserve_images: bool = Field(
-        default=False,
-        description="Preserve image tags"
-    )
-    preserve_links: bool = Field(
-        default=True,
-        description="Preserve link tags"
-    )
-    preserve_forms: bool = Field(
-        default=False,
-        description="Preserve form elements"
-    )
-
-    # Size limits
-    max_html_size: int = Field(
-        default=1000000,
-        ge=1000,
-        le=10000000,
-        description="Maximum HTML size in characters"
-    )
-    max_text_length: int = Field(
-        default=300,
-        ge=50,
-        le=1000,
-        description="Maximum text content length per element"
-    )
-    max_url_length: int = Field(
-        default=500,
-        ge=100,
-        le=2000,
-        description="Maximum URL length"
-    )
-
-    # Noise removal
-    remove_comments: bool = Field(
-        default=True,
-        description="Remove HTML comments"
-    )
-    remove_scripts: bool = Field(
-        default=True,
-        description="Remove script tags"
-    )
-    remove_styles: bool = Field(
-        default=True,
-        description="Remove style tags"
-    )
-    remove_tracking: bool = Field(
-        default=True,
-        description="Remove tracking URLs and attributes"
-    )
-
-    # Whitespace handling
-    normalize_whitespace: bool = Field(
-        default=True,
-        description="Normalize whitespace"
-    )
-    remove_empty_elements: bool = Field(
-        default=True,
-        description="Remove empty elements"
-    )
-
-    # Custom selectors
-    noise_selectors: List[str] = Field(
-        default_factory=lambda: [
-            '[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]',
-            '[class*="footer"]', '[class*="header"]', '[class*="ads"]',
-            '[class*="popup"]', '[class*="modal"]', '[class*="cookie"]'
-        ],
-        description="CSS selectors for noise elements to remove"
-    )
+from .config import HTMLCleaningConfig


 class HTMLCleaningStats(BaseModel):
     """HTML cleaning statistics"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
+
+    model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
     original_size_bytes: int = Field(ge=0)
     cleaned_size_bytes: int = Field(ge=0)
     size_reduction_bytes: int = Field(ge=0)
@@ -130,297 +35,296 @@ class HTMLCleaningStats(BaseModel):

 class ExtractedJSData(BaseModel):
     """Extracted JavaScript data structure"""
-    model_config = ConfigDict(
-        validate_assignment=True,
-        extra="forbid"
-    )
-
-    ssr_data: dict[str, str] = Field(default_factory=dict)
-    structured_data: List[dict[str, str]] = Field(default_factory=list)
-    raw_extracts: List[dict[str, str]] = Field(default_factory=list)
-
-
-class HTMLManagerError(Exception):
-    """Base exception for HTML manager"""
-    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
-        self.message = message
-        self.operation = operation
-        self.details = details or {}
-        super().__init__(message)

+    model_config = ConfigDict(validate_assignment=True, extra="forbid")

-class HTMLParsingError(HTMLManagerError):
-    """Raised when HTML parsing fails"""
-    pass
+    ssr_data: Dict[str, Any] = Field(default_factory=dict)
+    structured_data: List[Dict[str, Any]] = Field(default_factory=list)
+    analytics_data: Dict[str, Any] = Field(default_factory=dict)
+    product_data: Dict[str, Any] = Field(default_factory=dict)
+    raw_extracts: List[Dict[str, Any]] = Field(default_factory=list)


-class HTMLCleaningError(HTMLManagerError):
+class HTMLCleaningError(Exception):
     """Raised when HTML cleaning fails"""
-    pass
+
+    def __init__(self, message: str, operation: str, details: Optional[dict[str, str]] = None):
+        self.message = message
+        self.operation = operation
+        self.details = details or {}
+        super().__init__(message)


-class HTMLManager:
+class HTMLCleaner:
     """
-    🧹 HTML Manager - Smart HTML processing and cleaning
-
+    🧹 Smart HTML Cleaner - Intelligent HTML cleaning for LLM optimization
+
     Features:
-    - LLM Optimized: Removes noise, preserves valuable content
-    - Token Efficient: Reduces HTML size for cost-effective LLM analysis
-    - Smart Extraction: Preserves JavaScript data and structured content
-    - Performance: Fast cleaning with configurable aggressiveness
-    - Safe: Handles malformed HTML gracefully
-    - Type Safety: Full Pydantic v2 compliance
+    - Removes noise (scripts, styles, comments)
+    - Preserves useful JavaScript data (JSON objects, SSR data)
+    - Cleans whitespace and formatting
+    - Maintains semantic structure
+    - Extracts and preserves Next.js/Nuxt.js SSR data
+    - Optimizes for LLM token efficiency
     """
-
-    def __init__(self, config: Optional[HTMLCleaningConfig] = None):
+
+    def __init__(self, parser_id: str, config: Optional[HTMLCleaningConfig] = None):
         self.config = config or HTMLCleaningConfig()
-        self.logger = get_logger()
-
+
+        # Initialize smart logger
+        self.parser_id = parser_id
+        self.logger = create_smart_logger(parser_id=self.parser_id)
+
+        # Tags to completely remove
+        self.noise_tags = {"script", "style", "meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
+
+        # Add conditional tags based on config
+        if not self.config.preserve_forms:
+            self.noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
+
+        # Universal noise selectors to remove (for any site)
+        self.universal_noise_selectors = [
+            '[id*="nav"]',
+            '[class*="nav"]',  # Navigation
+            '[id*="menu"]',
+            '[class*="menu"]',  # Menus
+            '[id*="sidebar"]',
+            '[class*="sidebar"]',  # Sidebars
+            '[id*="footer"]',
+            '[class*="footer"]',  # Footers
+            '[id*="header"]',
+            '[class*="header"]',  # Headers
+            '[class*="ads"]',
+            '[class*="advertisement"]',  # Ads
+            '[class*="sponsored"]',
+            '[class*="promo"]',  # Sponsored content
+            '[class*="popup"]',
+            '[class*="modal"]',  # Popups/modals
+            '[class*="overlay"]',
+            '[class*="tooltip"]',  # Overlays
+            '[class*="cookie"]',
+            '[class*="gdpr"]',  # Cookie notices
+            '[class*="newsletter"]',
+            '[class*="subscription"]',  # Email signup
+            '[class*="social"]',
+            '[class*="share"]',  # Social media
+            '[class*="comment"]',
+            '[class*="discussion"]',  # Comments
+            '[class*="tracking"]',
+            '[class*="analytics"]',  # Tracking
+        ]
+
+        # Attributes to keep (semantic ones)
+        self.keep_attributes = {"id", "class", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role", "alt", "title", "href", "src", "action", "name", "value", "placeholder", "type"}
+
         # Compile regex patterns for performance
         self._compile_patterns()
-
+
     def _compile_patterns(self) -> None:
         """Compile regex patterns for performance"""
-        # Tracking URL patterns
+        # URL patterns to remove or shorten (for tracking/analytics)
         self.tracking_url_patterns = [
-            re.compile(r'https://aax-[^\s"]{200,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*tracking[^\s"]{100,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*analytics[^\s"]{100,}', re.IGNORECASE),
-            re.compile(r'https://[^\s"]*gtm[^\s"]{100,}', re.IGNORECASE),
+            r'https://aax-[^\s"]{200,}',  # Amazon tracking URLs over 200 chars
+            r'https://[^\s"]*tracking[^\s"]{100,}',  # General tracking URLs
+            r'https://[^\s"]*analytics[^\s"]{100,}',  # Analytics URLs
+            r'https://[^\s"]*gtm[^\s"]{100,}',  # Google Tag Manager URLs
         ]
-
-        # Base64 patterns
+
+        # Base64 patterns to remove or replace
         self.base64_patterns = [
-            re.compile(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}'),
-            re.compile(r'data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
-            re.compile(r'data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}'),
+            r"data:image/[^;]+;base64,[A-Za-z0-9+/=]{50,}",  # Base64 images over 50 chars
+            r"data:application/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 applications
+            r"data:text/[^;]+;base64,[A-Za-z0-9+/=]{100,}",  # Base64 text
        ]
-
-        # JavaScript data patterns
-        self.js_data_patterns = [
-            re.compile(r'__NEXT_DATA__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'__NUXT__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?', re.DOTALL | re.IGNORECASE),
-            re.compile(r'dataLayer\s*=\s*(\[.+?\]);?', re.DOTALL | re.IGNORECASE),
+
+        # Patterns to detect valuable JavaScript data
+        self.useful_js_patterns = [
+            # Next.js/Nuxt.js SSR data
+            r"__NEXT_DATA__\s*=\s*(\{.+?\});?",
+            r"__NUXT__\s*=\s*(\{.+?\});?",
+            r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});?",
+            # React/Vue hydration data
+            r"window\.__REACT_QUERY_STATE__\s*=\s*(\{.+?\});?",
+            r"window\.__VUE_SSR_CONTEXT__\s*=\s*(\{.+?\});?",
+            # E-commerce data
+            r"window\.productData\s*=\s*(\{.+?\});?",
+            r"window\.cartData\s*=\s*(\{.+?\});?",
+            r"dataLayer\s*=\s*(\[.+?\]);?",
+            # Analytics and tracking (structured data)
+            r'gtag\s*\(\s*[\'"]config[\'"],\s*[\'"][^\'\"]+[\'"],\s*(\{.+?\})\s*\);?',
+            # JSON-LD structured data (often in script tags)
+            r'"@context"\s*:\s*"https?://schema\.org"[^}]*\}',
+            # Generic JSON objects (be more selective)
+            r"(?:window\.|var\s+|let\s+|const\s+)\w+\s*=\s*(\{.+?\});?",
         ]
-
+
+        # Compiled regex patterns for efficiency
+        self.compiled_patterns = [re.compile(pattern, re.DOTALL | re.IGNORECASE) for pattern in self.useful_js_patterns]
+
     # ==========================================
     # MAIN CLEANING METHODS
     # ==========================================
-
-    async def clean_html(
-        self,
-        html: str,
-        aggressive: Optional[bool] = None,
-        preserve_js_data: Optional[bool] = None
-    ) -> str:
+
+    async def clean_html(self, html_content: str, preserve_js_data: bool = True, aggressive_cleaning: bool = False) -> Tuple[str, Dict[str, Any]]:
         """
-        Clean HTML content for LLM analysis
-
+        Clean HTML content while preserving valuable data
+
         Args:
-            html: Raw HTML content
-            aggressive: Override aggressive cleaning setting
-            preserve_js_data: Override JS data preservation setting
-
+            html_content: Raw HTML content
+            preserve_js_data: Whether to extract and preserve JS data
+            aggressive_cleaning: Whether to apply more aggressive cleaning
+
         Returns:
-            Cleaned HTML optimized for LLM
+            Tuple of (cleaned_html, extracted_data)
         """
-        if not html or not html.strip():
-            return ""
-
-        # Use config defaults or overrides
-        aggressive_cleaning = aggressive if aggressive is not None else self.config.aggressive_cleaning
-        preserve_js = preserve_js_data if preserve_js_data is not None else self.config.preserve_js_data
-
+        if not html_content or not html_content.strip():
+            return "", {}
+
         try:
-            self.logger.info(f"Cleaning HTML: {len(html)} characters")
-
+            self.logger.info(f"🧹 Cleaning HTML: {len(html_content)} characters")
+
             # Check size limits
-            if len(html) > self.config.max_html_size:
-                self.logger.warning(f"HTML size ({len(html)}) exceeds limit ({self.config.max_html_size})")
-                html = html[:self.config.max_html_size]
-
+            if len(html_content) > self.config.max_html_size:
+                self.logger.warning(f"⚠️ HTML size ({len(html_content)}) exceeds limit ({self.config.max_html_size}), truncating")
+                html_content = html_content[: self.config.max_html_size]
+
             # Parse HTML
-            soup = BeautifulSoup(html, 'html.parser')
-
-            # Extract JavaScript data before cleaning
-            extracted_data = ExtractedJSData()
-            if preserve_js:
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            extracted_data = {}
+
+            # Extract valuable JavaScript data before removing scripts
+            if preserve_js_data:
                 extracted_data = self._extract_js_data(soup)
-
-            # Apply cleaning steps
+
+            # Remove universal noise elements for aggressive cleaning
             if aggressive_cleaning:
-                self._aggressive_cleaning(soup)
-            else:
-                self._standard_cleaning(soup)
-
-            # Get cleaned HTML
-            cleaned_html = str(soup)
-
+                self._remove_universal_noise(soup)
+                self._truncate_long_urls(soup)  # Do this before tracking URL cleaning
+                self._clean_tracking_urls(soup)
+                self._clean_base64_data(soup)
+                self._remove_long_attributes(soup)
+                self._remove_html_comments(soup)
+                self._clean_whitespace(soup)
+
+            # Remove noise elements
+            self._remove_noise_elements(soup)
+
+            # Clean attributes
+            self._clean_attributes(soup, aggressive_cleaning)
+
+            # Remove comments
+            self._remove_comments(soup)
+
+            # Clean text and whitespace
+            cleaned_html = self._clean_text_and_whitespace(soup)
+
             # Final cleanup
             cleaned_html = self._final_cleanup(cleaned_html)
-
+
             # Log results
-            original_size = len(html)
+            original_size = len(html_content)
             cleaned_size = len(cleaned_html)
             reduction = ((original_size - cleaned_size) / original_size * 100) if original_size > 0 else 0
-
-            self.logger.info(
-                f"HTML cleaned: {original_size} → {cleaned_size} chars "
-                f"({reduction:.1f}% reduction)"
-            )
-
-            return cleaned_html
-
+
+            self.logger.info(f"✅ HTML cleaned: {original_size} → {cleaned_size} chars " f"({reduction:.1f}% reduction)")
+
+            return cleaned_html, extracted_data
+
         except Exception as e:
-            self.logger.error(f"HTML cleaning failed: {e}")
-            raise HTMLCleaningError(
-                message=f"Failed to clean HTML: {e}",
-                operation="clean_html",
-                details={"html_size": str(len(html))}
-            ) from e
-
-    def clean_html_sync(self, html: str, **kwargs) -> str:
+            self.logger.error(f"HTML cleaning failed: {e}")
+            raise HTMLCleaningError(message=f"Failed to clean HTML: {e}", operation="clean_html", details={"html_size": str(len(html_content))}) from e
+
+    def clean_html_sync(self, html_content: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
         """
         Synchronous HTML cleaning
-
+
         Args:
-            html: Raw HTML content
+            html_content: Raw HTML content
             **kwargs: Cleaning options
-
+
         Returns:
-            Cleaned HTML
+            Tuple of (cleaned_html, extracted_data)
         """
         # Handle running event loop
         try:
             loop = asyncio.get_running_loop()
             # If we're in an event loop, create a new thread
             with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, self.clean_html(html, **kwargs))
+                future = executor.submit(asyncio.run, self.clean_html(html_content, **kwargs))
                 return future.result()
         except RuntimeError:
             # No event loop running, safe to use asyncio.run
-            return asyncio.run(self.clean_html(html, **kwargs))
-
-    async def parse_and_clean_html(
-        self,
-        html: str,
-        schema: Optional[dict[str, str]] = None,
-        instructions: Optional[str] = None,
-        **kwargs
-    ) -> dict[str, str]:
-        """
-        Parse and clean HTML with LLM analysis preparation
-
-        Args:
-            html: Raw HTML content
-            schema: Optional data schema for extraction
-            instructions: Optional parsing instructions
-            **kwargs: Additional options
-
-        Returns:
-            Dictionary with cleaned HTML and metadata
-        """
-        try:
-            # Clean HTML
-            cleaned_html = await self.clean_html(html, **kwargs)
-
-            # Get cleaning stats
-            stats = self.get_cleaning_stats(html, cleaned_html)
-
-            result = {
-                "cleaned_html": cleaned_html,
-                "original_size": str(stats.original_size_bytes),
-                "cleaned_size": str(stats.cleaned_size_bytes),
-                "reduction_percent": f"{stats.size_reduction_percent:.1f}",
-                "estimated_token_savings": str(stats.estimated_token_savings)
-            }
-
-            if schema:
-                result["schema"] = str(schema)
-            if instructions:
-                result["instructions"] = instructions
-
-            return result
-
-        except Exception as e:
-            raise HTMLCleaningError(
-                message=f"Failed to parse and clean HTML: {e}",
-                operation="parse_and_clean_html"
-            ) from e
-
+            return asyncio.run(self.clean_html(html_content, **kwargs))
+
     # ==========================================
     # CLEANING IMPLEMENTATION
     # ==========================================
-
+
     def _standard_cleaning(self, soup: BeautifulSoup) -> None:
         """Apply standard cleaning"""
         # Remove noise elements
         self._remove_noise_elements(soup)
-
+
         # Clean attributes
         self._clean_attributes(soup)
-
+
         # Remove comments
         if self.config.remove_comments:
             self._remove_comments(soup)
-
+
         # Normalize whitespace
         if self.config.normalize_whitespace:
             self._normalize_whitespace(soup)
-
+
     def _aggressive_cleaning(self, soup: BeautifulSoup) -> None:
         """Apply aggressive cleaning"""
         # Standard cleaning first
         self._standard_cleaning(soup)
-
+
         # Remove noise selectors
         self._remove_noise_selectors(soup)
-
+
         # Clean tracking URLs
         if self.config.remove_tracking:
             self._clean_tracking_urls(soup)
-
+
         # Clean base64 data
         self._clean_base64_data(soup)
-
+
         # Truncate long URLs
         self._truncate_long_urls(soup)
-
+
         # Remove long attributes
         self._remove_long_attributes(soup)
-
+
         # Truncate long text
         self._truncate_long_text(soup)
-
+
     def _remove_noise_elements(self, soup: BeautifulSoup) -> None:
         """Remove noise HTML elements"""
         # Define noise tags
-        noise_tags = {
-            'meta', 'link', 'base', 'title', 'head', 'noscript',
-            'iframe', 'embed', 'object', 'svg', 'canvas',
-            'audio', 'video', 'source', 'track', 'area', 'map', 'param'
-        }
-
+        noise_tags = {"meta", "link", "base", "title", "head", "noscript", "iframe", "embed", "object", "svg", "canvas", "audio", "video", "source", "track", "area", "map", "param"}
+
         # Add conditional tags
         if self.config.remove_scripts:
-            noise_tags.add('script')
+            noise_tags.add("script")
         if self.config.remove_styles:
-            noise_tags.add('style')
+            noise_tags.add("style")
         if not self.config.preserve_forms:
-            noise_tags.update({'form', 'input', 'button', 'select', 'textarea', 'fieldset', 'legend'})
-
+            noise_tags.update({"form", "input", "button", "select", "textarea", "fieldset", "legend"})
+
         # Remove noise tags
         for tag_name in noise_tags:
             for tag in soup.find_all(tag_name):
                 tag.decompose()
-
+
         # Remove empty elements
         if self.config.remove_empty_elements:
-            for tag in soup.find_all(['div', 'span', 'p']):
+            for tag in soup.find_all(["div", "span", "p"]):
                 if not tag.get_text(strip=True) and not tag.find_all():
                     tag.decompose()
-
+
     def _remove_noise_selectors(self, soup: BeautifulSoup) -> None:
         """Remove elements matching noise selectors"""
         for selector in self.config.noise_selectors:
@@ -431,78 +335,96 @@ class HTMLManager:
             except Exception:
                 # Skip invalid selectors
                 continue
-
+
     def _clean_attributes(self, soup: BeautifulSoup) -> None:
         """Clean HTML attributes"""
         # Attributes to remove
         noise_attributes = {
-            'style', 'onclick', 'onload', 'onchange', 'onmouseover',
-            'onmouseout', 'onfocus', 'onblur', 'onsubmit', 'onreset',
-            'onerror', 'onabort', 'autocomplete', 'autofocus',
-            'checked', 'defer', 'disabled', 'hidden', 'loop',
-            'multiple', 'muted', 'open', 'readonly', 'required',
-            'tabindex', 'translate', 'draggable', 'contenteditable'
+            "style",
+            "onclick",
+            "onload",
+            "onchange",
+            "onmouseover",
+            "onmouseout",
+            "onfocus",
+            "onblur",
+            "onsubmit",
+            "onreset",
+            "onerror",
+            "onabort",
+            "autocomplete",
+            "autofocus",
+            "checked",
+            "defer",
+            "disabled",
+            "hidden",
+            "loop",
+            "multiple",
+            "muted",
+            "open",
+            "readonly",
+            "required",
+            "tabindex",
+            "translate",
+            "draggable",
+            "contenteditable",
         }
-
+
         # Attributes to keep
-        keep_attributes = {
-            'id', 'class', 'href', 'src', 'alt', 'title',
-            'data-testid', 'data-test', 'data-cy',
-            'aria-label', 'aria-labelledby', 'aria-describedby', 'role'
-        }
-
+        keep_attributes = {"id", "class", "href", "src", "alt", "title", "data-testid", "data-test", "data-cy", "aria-label", "aria-labelledby", "aria-describedby", "role"}
+
         for tag in soup.find_all(True):
-            if hasattr(tag, 'attrs'):
+            if hasattr(tag, "attrs"):
                 # Remove unwanted attributes
                 attrs_to_remove = set(tag.attrs.keys()) - keep_attributes
                 for attr in attrs_to_remove:
                     if attr in noise_attributes:
                         del tag.attrs[attr]
-
+
     def _clean_tracking_urls(self, soup: BeautifulSoup) -> None:
         """Remove or replace tracking URLs"""
         # Clean href attributes
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if href:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(href):
-                        tag['href'] = '#tracking-url-removed'
+                        tag["href"] = "#tracking-url-removed"
                         break
-
+
         # Clean src attributes
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.tracking_url_patterns:
                     if pattern.match(src):
-                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _clean_base64_data(self, soup: BeautifulSoup) -> None:
         """Remove large base64 encoded data"""
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
             if src:
                 for pattern in self.base64_patterns:
                     if pattern.search(src):
-                        tag['src'] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
+                        tag["src"] = 'data:image/svg+xml,%3Csvg xmlns="http://www.w3.org/2000/svg" width="1" height="1"/%3E'
                         break
-
+
     def _truncate_long_urls(self, soup: BeautifulSoup) -> None:
         """Truncate URLs longer than max_url_length"""
         max_length = self.config.max_url_length
-
-        for tag in soup.find_all(['a'], href=True):
-            href = tag.get('href', '')
+
+        for tag in soup.find_all(["a"], href=True):
+            href = tag.get("href", "")
             if isinstance(href, str) and len(href) > max_length:
-                tag['href'] = href[:max_length] + '...truncated'
-
-        for tag in soup.find_all(['img'], src=True):
-            src = tag.get('src', '')
-            if isinstance(src, str) and len(src) > max_length and not src.startswith('data:'):
-                tag['src'] = src[:max_length] + '...truncated'
-
+                tag["href"] = href[:max_length] + "...truncated"
+
+        for tag in soup.find_all(["img"], src=True):
+            src = tag.get("src", "")
+            if isinstance(src, str) and len(src) > max_length and not src.startswith("data:"):
+                tag["src"] = src[:max_length] + "...truncated"
+
     def _remove_long_attributes(self, soup: BeautifulSoup) -> None:
         """Remove attributes with extremely long values"""
         for tag in soup.find_all():
@@ -510,96 +432,95 @@ class HTMLManager:
             for attr, value in tag.attrs.items():
                 if isinstance(value, str) and len(value) > 800:
                     attrs_to_remove.append(attr)
-                elif any(tracking in attr.lower() for tracking in
-                         ['tracking', 'analytics', 'gtm', 'pixel']):
+                elif any(tracking in attr.lower() for tracking in ["tracking", "analytics", "gtm", "pixel"]):
                     attrs_to_remove.append(attr)
-
+
             for attr in attrs_to_remove:
                 del tag.attrs[attr]
-
+
     def _truncate_long_text(self, soup: BeautifulSoup) -> None:
         """Truncate text content longer than max_text_length"""
         max_length = self.config.max_text_length
-
+
         for element in soup.find_all(text=True):
-            if element.parent.name not in ['script', 'style']:
+            if element.parent.name not in ["script", "style"]:
                 text_content = str(element).strip()
                 if text_content and len(text_content) > max_length:
-                    truncated_text = text_content[:max_length] + '...'
+                    truncated_text = text_content[:max_length] + "..."
                     element.replace_with(truncated_text)
-
+
     def _remove_comments(self, soup: BeautifulSoup) -> None:
         """Remove HTML comments"""
         for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
             comment.extract()
-
+
     def _normalize_whitespace(self, soup: BeautifulSoup) -> None:
         """Normalize whitespace in text content"""
         for element in soup.find_all(text=True):
-            if element.parent.name not in ['script', 'style']:
+            if element.parent.name not in ["script", "style"]:
                 # Replace multiple spaces with single space
-                cleaned_text = re.sub(r' {3,}', ' ', str(element))
+                cleaned_text = re.sub(r" {3,}", " ", str(element))
                 # Replace multiple newlines with maximum 2
-                cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+                cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)
                 # Replace multiple tabs with single space
-                cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
+                cleaned_text = re.sub(r"\t+", " ", cleaned_text)
                 element.replace_with(cleaned_text)
-
+
     def _final_cleanup(self, html: str) -> str:
         """Final cleanup and optimization"""
         # Remove empty attributes
-        html = re.sub(r'\s+\w+=""', '', html)
-
+        html = re.sub(r'\s+\w+=""', "", html)
+
         # Remove extra spaces in attributes
         html = re.sub(r'(\w+)=\s*"([^"]*)"', r'\1="\2"', html)
-
+
         # Normalize quotes
         html = re.sub(r"(\w+)='([^']*)'", r'\1="\2"', html)
-
+
         # Remove trailing spaces before closing tags
-        html = re.sub(r'\s+(/?>)', r'\1', html)
-
+        html = re.sub(r"\s+(/?>)", r"\1", html)
+
         # Advanced whitespace cleanup
         html = self._advanced_whitespace_cleanup(html)
-
+
         return html.strip()
-
+
     def _advanced_whitespace_cleanup(self, html: str) -> str:
         """Advanced whitespace cleanup"""
         # Remove excessive spaces
-        html = re.sub(r' {3,}', ' ', html)
-
+        html = re.sub(r" {3,}", " ", html)
+
         # Remove excessive newlines
-        html = re.sub(r'\n{3,}', '\n\n', html)
-
+        html = re.sub(r"\n{3,}", "\n\n", html)
+
         # Clean space between tags
-        html = re.sub(r'>\s{2,}<', '> <', html)
-
+        html = re.sub(r">\s{2,}<", "> <", html)
+
         return html
-
+
     # ==========================================
     # JAVASCRIPT DATA EXTRACTION
     # ==========================================
-
+
     def _extract_js_data(self, soup: BeautifulSoup) -> ExtractedJSData:
         """Extract valuable JavaScript data"""
         extracted_data = ExtractedJSData()
-
+
         # Find all script tags
-        script_tags = soup.find_all('script')
-
+        script_tags = soup.find_all("script")
+
         for script in script_tags:
             if not script.string:
                 continue
-
+
             script_content = script.string.strip()
-
+
             # Skip empty scripts
             if len(script_content) < 10:
                 continue
-
+
             # Check for JSON-LD structured data
-            if script.get('type') == 'application/ld+json':
+            if script.get("type") == "application/ld+json":
                 try:
                     json_data = json.loads(script_content)
                     # Convert to string dict for Pydantic compliance
@@ -608,56 +529,56 @@ class HTMLManager:
                     continue
                 except json.JSONDecodeError:
                     pass
-
+
             # Extract data using patterns
             self._extract_with_patterns(script_content, extracted_data)
-
+
         return extracted_data
-
+
     def _extract_with_patterns(self, script_content: str, extracted_data: ExtractedJSData) -> None:
         """Extract data using compiled regex patterns"""
         for pattern in self.js_data_patterns:
             matches = pattern.finditer(script_content)
             for match in matches:
                 self._try_parse_json(match.group(1), extracted_data)
-
+
     def _try_parse_json(self, json_str: str, extracted_data: ExtractedJSData) -> None:
         """Try to parse JSON string and add to extracted data"""
         try:
             json_data = json.loads(json_str)
-
+
             if isinstance(json_data, dict):
                 # Convert to string dict for Pydantic compliance
                 str_data = {}
                 for k, v in json_data.items():
                     if isinstance(k, (str, int, float)) and isinstance(v, (str, int, float, bool)):
                         str_data[str(k)] = str(v)
-
+
                 if str_data:
                     extracted_data.ssr_data.update(str_data)
-
+
         except json.JSONDecodeError:
             # Skip invalid JSON
             pass
-
+
     # ==========================================
     # UTILITY METHODS
     # ==========================================
-
+
     def get_cleaning_stats(self, original_html: str, cleaned_html: str) -> HTMLCleaningStats:
         """Get statistics about the cleaning process"""
         original_size = len(original_html)
         cleaned_size = len(cleaned_html)
-
+
         # Estimate token reduction (rough approximation)
         original_tokens = original_size // 4  # Rough estimate: 4 chars per token
         cleaned_tokens = cleaned_size // 4
-
+
         size_reduction = original_size - cleaned_size
         size_reduction_percent = (size_reduction / original_size * 100) if original_size > 0 else 0.0
         token_savings = original_tokens - cleaned_tokens
         token_savings_percent = (token_savings / original_tokens * 100) if original_tokens > 0 else 0.0
-
+
         return HTMLCleaningStats(
             original_size_bytes=original_size,
             cleaned_size_bytes=cleaned_size,
@@ -666,15 +587,15 @@ class HTMLManager:
             estimated_original_tokens=original_tokens,
             estimated_cleaned_tokens=cleaned_tokens,
             estimated_token_savings=token_savings,
-            estimated_token_savings_percent=token_savings_percent
+            estimated_token_savings_percent=token_savings_percent,
         )
-
+
     def update_config(self, **kwargs) -> None:
         """Update configuration with new values"""
         current_data = self.config.model_dump()
         current_data.update(kwargs)
         self.config = HTMLCleaningConfig.model_validate(current_data)
-
+
         # Recompile patterns if needed
         self._compile_patterns()

@@ -683,50 +604,54 @@ class HTMLManager:
 # CONVENIENCE FUNCTIONS
 # ==========================================

-def get_html_manager(config: Optional[HTMLCleaningConfig] = None) -> HTMLManager:
+
+def create_html_cleaner(parser_id: str, config: Optional[HTMLCleaningConfig] = None) -> HTMLCleaner:
     """
-    Get an HTML manager instance
-
+    Create an HTML cleaner instance
+
     Args:
         config: Optional HTML cleaning configuration
-
+        parser_id: Parser identifier for logging
+
     Returns:
-        Configured HTMLManager instance
+        Configured HTMLCleaner instance
     """
-    return HTMLManager(config=config)
+    return HTMLCleaner(parser_id=parser_id, config=config)


-async def quick_clean_html(html: str, **kwargs) -> str:
+async def quick_clean_html(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    manager = get_html_manager(config)
-    return await manager.clean_html(html, **kwargs)

+    cleaner = create_html_cleaner(parser_id, config)
+    return await cleaner.clean_html(html, **kwargs)

-def quick_clean_html_sync(html: str, **kwargs) -> str:
+
+def quick_clean_html_sync(html: str, parser_id: str, **kwargs) -> str:
     """
     Quick synchronous HTML cleaning convenience function
-
+
     Args:
         html: Raw HTML content
+        parser_id: Parser identifier for logging
         **kwargs: Cleaning options
-
+
     Returns:
         Cleaned HTML
     """
     config_data = {k: v for k, v in kwargs.items() if k in HTMLCleaningConfig.model_fields}
     config = HTMLCleaningConfig.model_validate(config_data) if config_data else None
-
-    manager = get_html_manager(config)
-    return manager.clean_html_sync(html, **kwargs)
+
+    cleaner = create_html_cleaner(parser_id, config)
+    return cleaner.clean_html_sync(html, **kwargs)
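For callers migrating off the 1.1.1 HTMLManager, a minimal usage sketch against the new API exactly as its signatures appear in this diff; the parser_id value and sample HTML are illustrative, and any behavior beyond the shown signatures is an assumption:

```python
import asyncio

from unrealon_driver.html_analyzer.cleaner import create_html_cleaner


async def main() -> None:
    html = "<html><head><script>var a = 1;</script></head><body><p>Hello</p></body></html>"

    # 1.1.1: get_html_manager().clean_html(html) returned a plain string.
    # 1.1.4: the factory requires a parser_id, and clean_html returns a
    # (cleaned_html, extracted_data) tuple instead of a bare string.
    cleaner = create_html_cleaner(parser_id="demo-parser")  # parser_id is illustrative
    cleaned_html, extracted_data = await cleaner.clean_html(html)

    print(cleaned_html)    # noise-stripped markup
    print(extracted_data)  # preserved JS/SSR data, {} when none is found


asyncio.run(main())
```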