PyPI - segment_classifier - Versions diffs - 0.1.0__tar.gz → 0.1.1__tar.gz - Mend

segment_classifier 0.1.0.tar.gz → 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: segment_classifier
-Version: 0.1.0
+Version: 0.1.1
 Summary: Async segment classifier library
 Author: Gagandeep Singh
 Author-email: gagan@innerkore.com

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "segment_classifier"
-version = "0.1.0"
+version = "0.1.1"
 description = "Async segment classifier library"
 authors = ["Gagandeep Singh <gagan@innerkore.com>"]
 readme = "README.md"

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/segment_classifier/config.py RENAMED Viewed

@@ -17,9 +17,9 @@ class ModelFeatureConfig(BaseModel):
     - text_density_ratio (very high or very low = complex)
     - sibling_count == 0 (one-off sections = complex)
     """
-    high_complexity_model: str = "anthropic/claude-opus-4"
-    standard_model: str = "anthropic/claude-sonnet-4-5"
-    fast_model: str = "anthropic/claude-haiku-4-5"
+    high_complexity_model: str = "high-complexity"
+    standard_model: str = "standard"
+    fast_model: str = "fast"
     high_complexity_dom_depth_threshold: int = 6
     high_complexity_unique_tag_threshold: int = 8
@@ -39,6 +39,7 @@ class ClassifierSettings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", env_prefix="CLASSIFIER_")
     # LiteLLM
+    litellm_config_path: str = "litellm_config.yaml"
     litellm_api_key: str = ""
     litellm_batch_size: int = 20         # max segments per LLM batch call
     litellm_max_concurrent_batches: int = 5

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/segment_classifier/models.py RENAMED Viewed

@@ -112,10 +112,10 @@ class ClusterRecord(BaseModel):
 class LLMClassificationRequest(BaseModel):
-    """Batch item sent to LLM."""
+    """Batch item sent to LLM. Use the provided raw HTML in normalized_html to understand purpose and content."""
     segment_id: str
     fingerprint_hash: str
-    normalized_html: str          # skeleton only, no content
+    normalized_html: str          # raw HTML content of the segment
     position_hint: SegmentPosition
     sibling_count: int
     url_hints: list[str]

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/segment_classifier/pipeline.py RENAMED Viewed

@@ -121,29 +121,92 @@ class ClassifierPipeline:
         # Stage 4: LLM Batch
         llm_calls_made = 0
         if pending:
-            llm_items = [
-                (seg, fingerprints[seg.segment_id][0], fingerprints[seg.segment_id][1])
-                for seg in pending
-            ]
-            llm_results = await self.llm_classifier.classify_batch(llm_items)
+            # 1. Deduplicate by exact fingerprint
+            fp_to_segments: dict[str, list[InputSegment]] = {}
+            for seg in pending:
+                _, fp_hash = fingerprints[seg.segment_id]
+                fp_to_segments.setdefault(fp_hash, []).append(seg)
+            unique_fps = list(fp_to_segments.keys())
+            # 2. Group unique fingerprints into fuzzy clusters dynamically
+            dynamic_clusters: list[list[str]] = []
+            cluster_vectors: list[list[float]] = []
+            for fp_hash in unique_fps:
+                rep_seg = fp_to_segments[fp_hash][0]
+                normalized, _ = fingerprints[rep_seg.segment_id]
+                fp_string = self.fuzzy_stage._build_fingerprint_string(normalized)
+                vector = self.fuzzy_stage._vectorize(fp_string)
+                best_cluster_idx = -1
+                best_sim = -1.0
+                for i, c_vec in enumerate(cluster_vectors):
+                    # Vectors from TfidfTransformer are L2-normalized, so dot product is cosine similarity
+                    sim = sum(a * b for a, b in zip(vector, c_vec))
+                    if sim > best_sim:
+                        best_sim = sim
+                        best_cluster_idx = i
+                if best_sim >= self.settings.cache.l2_similarity_threshold:
+                    dynamic_clusters[best_cluster_idx].append(fp_hash)
+                else:
+                    dynamic_clusters.append([fp_hash])
+                    cluster_vectors.append(vector)
+            # 3. Prepare LLM items (one representative per dynamic cluster)
+            llm_items = []
+            for cluster_fps in dynamic_clusters:
+                rep_fp = cluster_fps[0]
+                rep_seg = fp_to_segments[rep_fp][0]
+                normalized = fingerprints[rep_seg.segment_id][0]
+                llm_items.append((rep_seg, normalized, rep_fp))
-            # For each LLM result, register in L1 + L2
-            for seg, result in zip(pending, llm_results):
-                normalized, fp_hash = fingerprints[seg.segment_id]
-                await self.l1_cache.set(fp_hash, FingerprintRecord(
-                    fingerprint_hash=fp_hash,
-                    component_type=result.component_type,
-                    confidence=result.confidence,
-                    example_segment_id=seg.segment_id
-                ))
-                await self.fuzzy_stage.register(
-                    fingerprint_hash=fp_hash,
-                    normalized=normalized,
-                    component_type=result.component_type,
-                    confidence=result.confidence
-                )
+            llm_results = await self.llm_classifier.classify_batch(llm_items)
-            classified.extend(llm_results)
+            # 4. Apply results to all segments in the dynamic clusters
+            for cluster_fps, result in zip(dynamic_clusters, llm_results):
+                rep_fp = cluster_fps[0]
+                for fp_hash in cluster_fps:
+                    group_rep_seg = fp_to_segments[fp_hash][0]
+                    normalized = fingerprints[group_rep_seg.segment_id][0]
+                    # Fuzzy match penalty if not the exact representative fingerprint
+                    confidence = result.confidence if fp_hash == rep_fp else max(0.0, result.confidence - 0.05)
+                    stage = result.classification_stage if fp_hash == rep_fp else ClassificationStage.L2_FUZZY_CACHE
+                    # Register in caches
+                    await self.l1_cache.set(fp_hash, FingerprintRecord(
+                        fingerprint_hash=fp_hash,
+                        component_type=result.component_type,
+                        confidence=confidence,
+                        example_segment_id=group_rep_seg.segment_id
+                    ))
+                    await self.fuzzy_stage.register(
+                        fingerprint_hash=fp_hash,
+                        normalized=normalized,
+                        component_type=result.component_type,
+                        confidence=confidence
+                    )
+                    # Create ClassifiedSegment for all input segments sharing this fp_hash
+                    for seg in fp_to_segments[fp_hash]:
+                        classified.append(ClassifiedSegment(
+                            segment_id=seg.segment_id,
+                            page_url=seg.page_url,
+                            page_slug=seg.page_slug,
+                            raw_html=seg.raw_html,
+                            text_content=seg.text_content,
+                            position_hint=seg.position_hint,
+                            component_type=result.component_type,
+                            classification_stage=stage,
+                            confidence=confidence,
+                            fingerprint_hash=fp_hash,
+                            cluster_id=result.cluster_id,
+                            llm_model_used=result.llm_model_used,
+                            llm_raw_response=result.llm_raw_response
+                        ))
             # Calculate total LLM batch calls
             grouped_by_model: dict[str, int] = {}

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/segment_classifier/stages/llm_classifier.py RENAMED Viewed

@@ -1,7 +1,11 @@
 import asyncio
 import json
 import logging
+import os
+import re
+import yaml
 import litellm
+from litellm import Router
 from typing import Any
 from segment_classifier.models import (
     InputSegment, ClassifiedSegment, LLMClassificationRequest,
@@ -25,6 +29,20 @@ class LLMBatchClassifier:
         if settings.litellm_api_key:
             litellm.api_key = settings.litellm_api_key
+        # Initialize LiteLLM Router if config exists
+        self.router = None
+        if settings.litellm_config_path and os.path.exists(settings.litellm_config_path):
+            try:
+                with open(settings.litellm_config_path, "r") as f:
+                    config = yaml.safe_load(f)
+                self.router = Router(
+                    model_list=config.get("model_list", []),
+                    **config.get("router_settings", {})
+                )
+                logger.info(f"Initialized LiteLLM Router with config: {settings.litellm_config_path}")
+            except Exception as e:
+                logger.error(f"Failed to load LiteLLM config from {settings.litellm_config_path}: {e}")
     def select_model(
         self,
         normalized: NormalizedSegment,
@@ -55,10 +73,10 @@ class LLMBatchClassifier:
         fingerprint_hash: str,
     ) -> LLMClassificationRequest:
         """Construct LLMClassificationRequest from segment + normalized data."""
-        return LLMClassificationRequest(
+        req = LLMClassificationRequest(
             segment_id=segment.segment_id,
             fingerprint_hash=fingerprint_hash,
-            normalized_html=normalized.skeleton,
+            normalized_html=normalized.normalized_html,
             position_hint=segment.position_hint,
             sibling_count=segment.sibling_count,
             url_hints=segment.url_path_segments,
@@ -66,6 +84,7 @@ class LLMBatchClassifier:
             child_tag_counts=normalized.child_tag_counts,
             text_density_ratio=normalized.text_density_ratio
         )
+        return req
     async def _call_litellm(
         self,
@@ -93,7 +112,7 @@ Available component types:
 ]
 Rules:
-- Use normalized_html structure only, ignore content values
+- Use the provided raw HTML in normalized_html to understand the component purpose and content
 - sibling_count >= 3 strongly suggests a collection item
 - position_hint=top/bottom suggests layout components
 - url_hints provide page context
@@ -108,29 +127,55 @@ Rules:
         ]
         try:
-            response = await litellm.acompletion(
-                model=model,
-                messages=messages,
-                timeout=self.settings.litellm_timeout_seconds,
-            )
+            if self.router:
+                response = await self.router.acompletion(
+                    model=model,
+                    messages=messages,
+                    timeout=self.settings.litellm_timeout_seconds,
+                )
+            else:
+                response = await litellm.acompletion(
+                    model=model,
+                    messages=messages,
+                    timeout=self.settings.litellm_timeout_seconds,
+                )
             # Record usage
             self._model_usage[model] = self._model_usage.get(model, 0) + 1
-            raw_response = response.choices[0].message.content
-            # Strip markdown
-            raw_response = raw_response.strip()
-            if raw_response.startswith("```json"):
-                raw_response = raw_response[7:]
-            elif raw_response.startswith("```"):
-                raw_response = raw_response[3:]
-            if raw_response.endswith("```"):
-                raw_response = raw_response[:-3]
+            raw_response = response.choices[0].message.content.strip()
+            # Robust JSON extraction
+            json_str = raw_response
+            if "```" in json_str:
+                # Try to extract from markdown blocks
+                blocks = re.findall(r'```(?:json)?\s*(.*?)\s*```', json_str, re.DOTALL)
+                if blocks:
+                    json_str = blocks[0]
+            # If still not parsing, try to find the first [ and last ]
+            try:
+                parsed = json.loads(json_str)
+            except json.JSONDecodeError:
+                start = json_str.find('[')
+                end = json_str.rfind(']')
+                if start != -1 and end != -1:
+                    try:
+                        parsed = json.loads(json_str[start:end+1])
+                    except:
+                        raise ValueError(f"Could not parse LLM response as JSON: {raw_response[:200]}...")
+                else:
+                    raise ValueError(f"No JSON array found in LLM response: {raw_response[:200]}...")
-            parsed = json.loads(raw_response.strip())
+            if not isinstance(parsed, list):
+                raise ValueError(f"LLM response is not a JSON array: {type(parsed)}")
             results = []
             for item in parsed:
+                if not isinstance(item, dict):
+                    logger.warning(f"Skipping non-dict item in LLM response: {item}")
+                    continue
                 try:
                     results.append(LLMClassificationResult.model_validate(item))
                 except Exception as e:
@@ -139,7 +184,7 @@ Rules:
                         segment_id=item.get("segment_id", ""),
                         component_type=ComponentType.UNKNOWN,
                         confidence=0.0,
-                        reasoning=f"Parse error: {e}"
+                        reasoning=f"Validation error: {e}"
                     ))
             # Ensure all segments are accounted for

{segment_classifier-0.1.0 → segment_classifier-0.1.1}/segment_classifier/utils/html_normalizer.py RENAMED Viewed

@@ -16,10 +16,11 @@ STRUCTURAL_CLASS_PATTERN = re.compile(
 PRESENTATIONAL_CLASS_PATTERN = re.compile(
     r'\b(mt|mb|ml|mr|mx|my|pt|pb|pl|pr|px|py|w-|h-|text-|bg-|'
     r'border|rounded|shadow|flex|grid-cols|gap|p-|m-|font-|'
-    r'color|opacity|z-|hidden|block|inline)\b'
+    r'color|opacity|z-|hidden|block|inline)\b',
+    re.IGNORECASE
 )
-STRUCTURAL_ATTRS = {"role", "type", "aria-label", "aria-role", "data-component", "data-type"}
+STRUCTURAL_ATTRS = {"role", "type", "aria-label", "aria-role", "data-component", "data-type", "href", "src", "alt", "title", "placeholder"}
 @dataclass
@@ -32,6 +33,7 @@ class NormalizedSegment:
     root_tag: str
     text_density_ratio: float
     unique_tag_count: int
+    normalized_html: str = ""
     def fingerprint_hash(self) -> str:
         payload = {
@@ -46,6 +48,16 @@ class NormalizedSegment:
             json.dumps(payload, sort_keys=True).encode()
         ).hexdigest()
+    def to_normalized_html(self, max_depth: int = 8) -> str:
+        """
+        Generates a clean, structural HTML string for LLM consumption.
+        Strips text and keeps only structural tags/attributes.
+        """
+        # This is a bit tricky because the dataclass doesn't store the full tree.
+        # But we can reconstruct a representative HTML from the skeleton or
+        # better yet, modify the normalizer to produce this during the initial walk.
+        return self.skeleton # Fallback for now, we'll improve the producer.
 def normalize_segment(html: str, text_content: str) -> NormalizedSegment:
     """
@@ -54,9 +66,10 @@ def normalize_segment(html: str, text_content: str) -> NormalizedSegment:
     soup = BeautifulSoup(html, "html.parser")
     root = soup.find()
     if not root or not isinstance(root, Tag):
-        return NormalizedSegment("", "", [], {}, 0, "unknown", 0.0, 0)
+        return NormalizedSegment("", "", [], {}, 0, "unknown", 0.0, 0, "")
     skeleton = _extract_skeleton(root)
+    normalized_html = _generate_normalized_html(root)
     attrs_fp = _extract_attrs_fingerprint(root)
     class_tokens = _extract_class_tokens(root)
     child_counts = _count_tags(root)
@@ -73,6 +86,7 @@ def normalize_segment(html: str, text_content: str) -> NormalizedSegment:
         root_tag=root.name,
         text_density_ratio=round(text_ratio, 4),
         unique_tag_count=unique_tags,
+        normalized_html=normalized_html,
     )
@@ -92,6 +106,54 @@ def _extract_skeleton(tag: Tag, depth: int = 0, max_depth: int = 8) -> str:
     return f"{tag.name}>" + "+".join(child_skeletons)
+def _generate_normalized_html(tag: Tag, depth: int = 0, max_depth: int = 10) -> str:
+    """Recursive tag-only HTML with structural classes/attributes."""
+    if depth >= max_depth:
+        return ""
+    tag_name = tag.name
+    if tag_name in {"script", "style", "meta", "link", "noscript", "svg", "path", "circle", "rect", "line", "polyline", "polygon", "ellipse"}:
+        # Simplification: skip heavy SVG/non-visible tags
+        return ""
+    attrs = []
+    # Keep structural attributes
+    for attr in STRUCTURAL_ATTRS:
+        val = tag.get(attr)
+        if val:
+            if isinstance(val, list):
+                val = " ".join(val)
+            attrs.append(f'{attr}="{val}"')
+    # Keep structural classes
+    classes = tag.get("class", [])
+    if isinstance(classes, str):
+        classes = [classes]
+    relevant_classes = [c for c in classes if STRUCTURAL_CLASS_PATTERN.search(c) or PRESENTATIONAL_CLASS_PATTERN.search(c)]
+    if relevant_classes:
+        attrs.append(f'class="{" ".join(relevant_classes)}"')
+    attr_str = " " + " ".join(attrs) if attrs else ""
+    children_html = ""
+    for child in tag.children:
+        if isinstance(child, Tag):
+            children_html += _generate_normalized_html(child, depth + 1, max_depth)
+        elif isinstance(child, str):
+            text = child.strip()
+            if text:
+                if len(text) > 200:
+                    text = text[:197] + "..."
+                children_html += text
+    if not children_html:
+        # Self-closing for empty tags is fine, or keep them open if preferred
+        return f"<{tag_name}{attr_str}></{tag_name}>"
+    return f"<{tag_name}{attr_str}>{children_html}</{tag_name}>"
 def _extract_attrs_fingerprint(tag: Tag) -> str:
     """Walk all tags, keep only STRUCTURAL_ATTRS values and href/src presence booleans."""
     parts = []