segment_classifier 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.3
2
+ Name: segment_classifier
3
+ Version: 0.1.0
4
+ Summary: Async segment classifier library
5
+ Author: Gagandeep Singh
6
+ Author-email: gagan@innerkore.com
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiofiles (>=23.0,<24.0)
12
+ Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
13
+ Requires-Dist: litellm (>=1.40,<2.0)
14
+ Requires-Dist: lxml (>=5.0,<6.0)
15
+ Requires-Dist: numpy (>=1.26,<2.0)
16
+ Requires-Dist: pydantic (>=2.7,<3.0)
17
+ Requires-Dist: pydantic-settings (>=2.2,<3.0)
18
+ Requires-Dist: scikit-learn (>=1.5,<2.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Segment Classifier
22
+
23
+ An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
24
+
25
+ ## Overview
26
+
27
+ The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
28
+
29
+ 1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
30
+ 2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
31
+ 3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
32
+ 4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
33
+
34
+ ## Installation
35
+
36
+ You can install the package using poetry:
37
+ ```bash
38
+ poetry install
39
+ ```
40
+
41
+ Or via pip (once published):
42
+ ```bash
43
+ pip install segment-classifier
44
+ ```
45
+
46
+ ## Setup
47
+
48
+ The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
49
+
50
+ Required environment variables:
51
+ ```env
52
+ CLASSIFIER_LITELLM_API_KEY="your-api-key"
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```python
58
+ import asyncio
59
+ from segment_classifier import ClassifierPipeline
60
+ from segment_classifier.config import ClassifierSettings
61
+ from segment_classifier.models import InputSegment, SegmentPosition
62
+
63
+ async def main():
64
+ settings = ClassifierSettings()
65
+ pipeline = ClassifierPipeline(settings)
66
+ await pipeline.initialize()
67
+
68
+ segments = [
69
+ InputSegment(
70
+ segment_id="seg_001",
71
+ page_url="https://example.com/products",
72
+ page_slug="products",
73
+ raw_html="<div class='product-card'>...</div>",
74
+ text_content="Product Item",
75
+ position_hint=SegmentPosition.MIDDLE,
76
+ sibling_count=3,
77
+ )
78
+ ]
79
+
80
+ result = await pipeline.run(segments)
81
+ await pipeline.shutdown()
82
+
83
+ for seg in result.classified:
84
+ print(seg.component_type)
85
+
86
+ asyncio.run(main())
87
+ ```
88
+
89
+ ## Caching
90
+
91
+ Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
92
+
93
+ ## Stages Breakdown
94
+ Every returned `ClassifiedSegment` will be marked with a `classification_stage` indicating which of the 4 stages resolved the query.
95
+
@@ -0,0 +1,74 @@
1
+ # Segment Classifier
2
+
3
+ An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
4
+
5
+ ## Overview
6
+
7
+ The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
8
+
9
+ 1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
10
+ 2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
11
+ 3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
12
+ 4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
13
+
14
+ ## Installation
15
+
16
+ You can install the package using poetry:
17
+ ```bash
18
+ poetry install
19
+ ```
20
+
21
+ Or via pip (once published):
22
+ ```bash
23
+ pip install segment-classifier
24
+ ```
25
+
26
+ ## Setup
27
+
28
+ The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
29
+
30
+ Required environment variables:
31
+ ```env
32
+ CLASSIFIER_LITELLM_API_KEY="your-api-key"
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ```python
38
+ import asyncio
39
+ from segment_classifier import ClassifierPipeline
40
+ from segment_classifier.config import ClassifierSettings
41
+ from segment_classifier.models import InputSegment, SegmentPosition
42
+
43
+ async def main():
44
+ settings = ClassifierSettings()
45
+ pipeline = ClassifierPipeline(settings)
46
+ await pipeline.initialize()
47
+
48
+ segments = [
49
+ InputSegment(
50
+ segment_id="seg_001",
51
+ page_url="https://example.com/products",
52
+ page_slug="products",
53
+ raw_html="<div class='product-card'>...</div>",
54
+ text_content="Product Item",
55
+ position_hint=SegmentPosition.MIDDLE,
56
+ sibling_count=3,
57
+ )
58
+ ]
59
+
60
+ result = await pipeline.run(segments)
61
+ await pipeline.shutdown()
62
+
63
+ for seg in result.classified:
64
+ print(seg.component_type)
65
+
66
+ asyncio.run(main())
67
+ ```
68
+
69
+ ## Caching
70
+
71
+ Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
72
+
73
+ ## Stages Breakdown
74
+ Every returned `ClassifiedSegment` will be marked with a `classification_stage` indicating which of the 4 stages resolved the query.
@@ -0,0 +1,24 @@
1
+ [tool.poetry]
2
+ name = "segment_classifier"
3
+ version = "0.1.0"
4
+ description = "Async segment classifier library"
5
+ authors = ["Gagandeep Singh <gagan@innerkore.com>"]
6
+ readme = "README.md"
7
+ packages = [
8
+ { include = "segment_classifier" }
9
+ ]
10
+
11
+ [tool.poetry.dependencies]
12
+ python = "^3.12"
13
+ beautifulsoup4 = "^4.12"
14
+ lxml = "^5.0"
15
+ pydantic = "^2.7"
16
+ pydantic-settings = "^2.2"
17
+ litellm = "^1.40"
18
+ scikit-learn = "^1.5"
19
+ numpy = "^1.26"
20
+ aiofiles = "^23.0"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,4 @@
1
"""Public API for the segment_classifier package."""

from .pipeline import ClassifierPipeline
from .config import ClassifierSettings

# Explicit public surface: the pipeline entry point and its settings.
__all__ = ["ClassifierPipeline", "ClassifierSettings"]
@@ -0,0 +1,4 @@
1
"""Cache implementations: L1 exact fingerprint cache and L2 fuzzy cluster cache."""

from .l1_cache import L1FingerprintCache
from .l2_cache import L2FuzzyCache

__all__ = ["L1FingerprintCache", "L2FuzzyCache"]
@@ -0,0 +1,71 @@
1
+ import asyncio
2
+ import json
3
+ from pathlib import Path
4
+ import aiofiles
5
+ from pydantic import ValidationError
6
+ from segment_classifier.models import FingerprintRecord
7
+
8
+
9
class L1FingerprintCache:
    """Exact-match (L1) fingerprint cache persisted as a single JSON file.

    Maps a structural DOM fingerprint hash to its cached classification
    (:class:`FingerprintRecord`). All reads and writes of the in-memory
    store are serialized with an asyncio lock; the store is flushed to
    disk every ``auto_persist_every`` writes and on explicit ``persist()``.
    """

    def __init__(self, cache_path: str, auto_persist_every: int = 50):
        self._path = Path(cache_path)
        self._store: dict[str, FingerprintRecord] = {}
        self._lock = asyncio.Lock()
        self._write_count = 0
        self._auto_persist_every = auto_persist_every

    async def load(self) -> None:
        """Load the cache from disk; a missing, empty, or corrupt file
        leaves the cache empty.

        Parses into a temporary dict first so a validation error halfway
        through the file cannot leave the live store partially populated,
        then swaps it in under the lock.
        """
        if not self._path.exists():
            return

        async with aiofiles.open(self._path, "r", encoding="utf-8") as f:
            content = await f.read()
        if not content.strip():
            return

        try:
            data = json.loads(content)
            loaded = {
                key: FingerprintRecord.model_validate(val)
                for key, val in data.items()
            }
        except (json.JSONDecodeError, ValidationError):
            # Corrupt cache file: start fresh. A real app would log this.
            return

        async with self._lock:
            self._store = loaded

    async def get(self, fingerprint_hash: str) -> FingerprintRecord | None:
        """Return the cached record for a fingerprint, or None on a miss."""
        async with self._lock:
            return self._store.get(fingerprint_hash)

    async def set(self, fingerprint_hash: str, record: FingerprintRecord) -> None:
        """Insert or replace a record, counting it toward auto-persist."""
        async with self._lock:
            self._store[fingerprint_hash] = record
            await self._bump_writes_unsafe()

    async def increment_hit(self, fingerprint_hash: str) -> None:
        """Bump the hit counter of an existing record (no-op on a miss)."""
        async with self._lock:
            record = self._store.get(fingerprint_hash)
            if record:
                # Record is mutated in place; no re-insertion needed.
                record.hit_count += 1
                await self._bump_writes_unsafe()

    async def _bump_writes_unsafe(self) -> None:
        """Count one write and auto-persist when the threshold is reached.

        Caller must already hold ``self._lock``.
        """
        self._write_count += 1
        if self._write_count >= self._auto_persist_every:
            self._write_count = 0
            await self._persist_unsafe()

    async def _persist_unsafe(self) -> None:
        """Write the store to disk. Caller must already hold ``self._lock``."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        data = {k: v.model_dump(mode="json") for k, v in self._store.items()}
        async with aiofiles.open(self._path, "w", encoding="utf-8") as f:
            await f.write(json.dumps(data, indent=2))

    async def persist(self) -> None:
        """Force a flush to disk and reset the auto-persist counter."""
        async with self._lock:
            self._write_count = 0
            await self._persist_unsafe()

    @property
    def size(self) -> int:
        # Number of fingerprints currently held in memory.
        return len(self._store)
@@ -0,0 +1,157 @@
1
+ import asyncio
2
+ import json
3
+ import uuid
4
+ from pathlib import Path
5
+ import aiofiles
6
+ import numpy as np
7
+ from pydantic import ValidationError
8
+ from segment_classifier.models import ClusterRecord, ComponentType
9
+
10
+
11
class L2FuzzyCache:
    """Fuzzy (L2) cluster cache matched by cosine similarity.

    Cluster metadata lives in a JSON file; centroid vectors live in a
    parallel ``.npy`` matrix whose row ``i`` is the centroid of
    ``self._store[i]``. All mutations are serialized with an asyncio lock.
    """

    def __init__(
        self,
        cache_path: str,
        embeddings_path: str,
        similarity_threshold: float = 0.85,
        max_cluster_size: int = 50,
        persist_on_update: bool = True
    ):
        self._path = Path(cache_path)
        self._embeddings_path = Path(embeddings_path)
        self._similarity_threshold = similarity_threshold
        self._max_cluster_size = max_cluster_size
        self._persist_on_update = persist_on_update

        self._store: list[ClusterRecord] = []
        # Matrix storing centroids, parallel to self._store (None until first cluster).
        self._centroids: np.ndarray | None = None

        self._lock = asyncio.Lock()

    async def load(self) -> None:
        """Load clusters and centroids from disk.

        Any corruption, or a row-count mismatch between the JSON list and
        the centroid matrix, resets the cache to empty rather than using
        partial state. Both halves are validated before either is assigned.
        """
        if not self._path.exists() or not self._embeddings_path.exists():
            return

        async with aiofiles.open(self._path, "r", encoding="utf-8") as f:
            content = await f.read()

        store: list[ClusterRecord] = []
        if content.strip():
            try:
                data = json.loads(content)
                store = [ClusterRecord.model_validate(val) for val in data]
            except (json.JSONDecodeError, ValidationError):
                store = []

        centroids: np.ndarray | None = None
        if store:
            try:
                centroids = np.load(str(self._embeddings_path))
                if len(store) != centroids.shape[0]:
                    # Mismatch between JSON and npy: reset both.
                    store = []
                    centroids = None
            except Exception:
                store = []
                centroids = None

        # Assign only after the JSON/npy pair validated together.
        self._store = store
        self._centroids = centroids

    async def find_nearest(self, vector: list[float], threshold: float | None = None) -> ClusterRecord | None:
        """Return the cluster whose centroid is most cosine-similar to
        *vector*, or None if no cluster clears the threshold.

        *threshold* overrides the instance-level similarity threshold.
        """
        async with self._lock:
            if not self._store or self._centroids is None:
                return None

            query = np.array(vector)
            query_norm = np.linalg.norm(query)
            if query_norm == 0:
                # Zero vector has no direction; cosine similarity undefined.
                return None

            # Cosine similarity against every centroid in one vectorized pass.
            dot_products = np.dot(self._centroids, query)
            centroid_norms = np.linalg.norm(self._centroids, axis=1)
            # Avoid division by zero for degenerate (all-zero) centroids.
            centroid_norms[centroid_norms == 0] = 1

            similarities = dot_products / (centroid_norms * query_norm)

            best_idx = int(np.argmax(similarities))
            best_sim = similarities[best_idx]

            check_threshold = threshold if threshold is not None else self._similarity_threshold

            if best_sim >= check_threshold:
                return self._store[best_idx]
            return None

    async def add_to_cluster(self, cluster_id: str, fingerprint_hash: str, vector: list[float]) -> None:
        """Add *fingerprint_hash* to an existing cluster and fold *vector*
        into its centroid via a running mean.

        No-ops when the cluster is unknown, already at capacity, or already
        contains the fingerprint: the running-mean update derives its weight
        from the member count, so it is only valid when a member is actually
        appended. Persistence is likewise skipped when nothing changed.
        """
        async with self._lock:
            idx = next((i for i, c in enumerate(self._store) if c.cluster_id == cluster_id), None)
            if idx is None or self._centroids is None:
                return

            cluster = self._store[idx]
            if len(cluster.member_fingerprints) >= self._max_cluster_size:
                return
            if fingerprint_hash in cluster.member_fingerprints:
                return

            cluster.member_fingerprints.append(fingerprint_hash)

            # Running mean: new = (old_centroid * (n-1) + new_vec) / n
            n = len(cluster.member_fingerprints)
            new_centroid = (self._centroids[idx] * (n - 1) + np.array(vector)) / n

            self._centroids[idx] = new_centroid
            cluster.centroid_vector = new_centroid.tolist()

            if self._persist_on_update:
                await self._persist_unsafe()

    async def create_cluster(
        self,
        fingerprint_hash: str,
        vector: list[float],
        component_type: ComponentType,
        confidence: float,
    ) -> ClusterRecord:
        """Create a new single-member cluster and return its record."""
        async with self._lock:
            cluster_id = str(uuid.uuid4())
            record = ClusterRecord(
                cluster_id=cluster_id,
                centroid_vector=vector,
                component_type=component_type,
                confidence=confidence,
                member_fingerprints=[fingerprint_hash]
            )

            self._store.append(record)

            # Keep the centroid matrix parallel to self._store.
            new_vec = np.array([vector])
            if self._centroids is None:
                self._centroids = new_vec
            else:
                self._centroids = np.vstack([self._centroids, new_vec])

            if self._persist_on_update:
                await self._persist_unsafe()

            return record

    async def _persist_unsafe(self) -> None:
        """Write both files. Caller must already hold ``self._lock``."""
        self._path.parent.mkdir(parents=True, exist_ok=True)

        # Write JSON
        data = [c.model_dump(mode="json") for c in self._store]
        async with aiofiles.open(self._path, "w", encoding="utf-8") as f:
            await f.write(json.dumps(data, indent=2))

        # Write npy
        if self._centroids is not None:
            # np.save has no async variant; the matrix is small and kept
            # in memory anyway, so a synchronous write is acceptable here.
            np.save(str(self._embeddings_path), self._centroids)

    async def persist(self) -> None:
        """Force a flush of both files to disk."""
        async with self._lock:
            await self._persist_unsafe()

    @property
    def size(self) -> int:
        # Number of clusters currently held in memory.
        return len(self._store)
@@ -0,0 +1,53 @@
1
+ from pydantic import BaseModel
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+
5
class ModelFeatureConfig(BaseModel):
    """
    Feature-based LLM model routing.

    Selection priority (highest to lowest):
    1. high_complexity → use for ambiguous, deeply nested, multi-role segments
    2. standard → default for most unknown segments
    3. fast → simple segments with weak signals but not rule-matchable

    Complexity is determined by:
    - dom_depth > threshold
    - child_tag_counts diversity (many unique tags = complex)
    - text_density_ratio (very high or very low = complex)
    - sibling_count == 0 (one-off sections = complex)
    """
    # LiteLLM model identifiers for each routing tier.
    high_complexity_model: str = "anthropic/claude-opus-4"
    standard_model: str = "anthropic/claude-sonnet-4-5"
    fast_model: str = "anthropic/claude-haiku-4-5"

    # Routing thresholds: deeper / more tag-diverse segments route to the
    # high-complexity model; shallow segments may use the fast model.
    high_complexity_dom_depth_threshold: int = 6
    high_complexity_unique_tag_threshold: int = 8
    fast_model_max_dom_depth: int = 3
27
+
28
+
29
class CacheConfig(BaseModel):
    """File locations and tuning knobs for the L1/L2 caches."""
    # L1 exact fingerprint cache (JSON map of fingerprint hash → record).
    l1_cache_path: str = ".cache/l1_fingerprints.json"
    # L2 fuzzy cluster cache: JSON metadata plus a parallel .npy centroid matrix.
    l2_cache_path: str = ".cache/l2_clusters.json"
    l2_embeddings_path: str = ".cache/l2_embeddings.npy"
    # Minimum cosine similarity for an L2 cluster match.
    l2_similarity_threshold: float = 0.85
    # Cap on members per L2 cluster.
    l2_max_cluster_size: int = 50
    # When True, caches are flushed to disk on every update.
    persist_on_update: bool = True
36
+
37
+
38
class ClassifierSettings(BaseSettings):
    """Top-level configuration, loaded from a `.env` file or environment
    variables prefixed with ``CLASSIFIER_``."""
    model_config = SettingsConfigDict(env_file=".env", env_prefix="CLASSIFIER_")

    # LiteLLM
    litellm_api_key: str = ""
    litellm_batch_size: int = 20  # max segments per LLM batch call
    litellm_max_concurrent_batches: int = 5
    litellm_timeout_seconds: int = 60

    # Pipeline — per-stage minimum confidence thresholds.
    rule_based_confidence_threshold: float = 0.90
    l1_min_confidence: float = 0.85
    l2_min_confidence: float = 0.75

    # Nested config groups (pydantic copies these defaults per instance).
    model_routing: ModelFeatureConfig = ModelFeatureConfig()
    cache: CacheConfig = CacheConfig()
@@ -0,0 +1,142 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ClassificationStage(str, Enum):
    """Pipeline stage that produced a classification, in fallback order."""
    RULE_BASED = "rule_based"
    L1_EXACT_CACHE = "l1_exact_cache"
    L2_FUZZY_CACHE = "l2_fuzzy_cache"
    LLM = "llm"
11
+
12
+
13
class SegmentPosition(str, Enum):
    """Coarse vertical position of a segment within a page."""
    TOP = "top"  # top 5% of page
    BOTTOM = "bottom"  # bottom 10% of page
    MIDDLE = "middle"
    UNKNOWN = "unknown"
18
+
19
+
20
class ComponentType(str, Enum):
    """Closed taxonomy of component types a segment can classify to.

    Values use a dotted ``<family>.<kind>`` convention so they group
    naturally when filtered or sorted by prefix.
    """
    # Layout
    LAYOUT_HEADER = "layout.header"
    LAYOUT_FOOTER = "layout.footer"
    LAYOUT_NAV = "layout.nav"
    LAYOUT_SIDEBAR = "layout.sidebar"
    LAYOUT_BREADCRUMB = "layout.breadcrumb"

    # Collections
    COLLECTION_PRODUCT_CARD = "collection.product_card"
    COLLECTION_PRODUCT_LIST = "collection.product_list"
    COLLECTION_BLOG_CARD = "collection.blog_card"
    COLLECTION_BLOG_LIST = "collection.blog_list"
    COLLECTION_NEWS_ITEM = "collection.news_item"
    COLLECTION_NEWS_LIST = "collection.news_list"

    # Sections
    SECTION_HERO = "section.hero"
    SECTION_FEATURE_GRID = "section.feature_grid"
    SECTION_TESTIMONIAL = "section.testimonial"
    SECTION_CTA = "section.cta"
    SECTION_FAQ = "section.faq"
    SECTION_PRICING = "section.pricing"

    # UI Elements
    UI_FORM = "ui.form"
    UI_MODAL = "ui.modal"
    UI_TABLE = "ui.table"
    UI_CAROUSEL = "ui.carousel"
    UI_PAGINATION = "ui.pagination"
    UI_SEARCH = "ui.search"

    # Content
    CONTENT_ARTICLE = "content.article"
    CONTENT_RICH_TEXT = "content.rich_text"
    CONTENT_MEDIA = "content.media"

    # Fallback when no stage could classify the segment.
    UNKNOWN = "unknown"
58
+
59
+
60
class InputSegment(BaseModel):
    """Raw segment from the page-segmenter tool."""
    segment_id: str
    page_url: str
    page_slug: str
    raw_html: str
    text_content: str
    # Coarse vertical location of the segment on the page.
    position_hint: SegmentPosition = SegmentPosition.UNKNOWN
    dom_position: str = ""  # CSS selector path e.g. "main > section:nth-child(2)"
    sibling_count: int = 0  # how many same-fingerprint siblings on same page
    url_path_segments: list[str] = Field(default_factory=list)  # e.g. ["products", "shoes"]
71
+
72
+
73
class ClassifiedSegment(BaseModel):
    """Segment with classification result and metadata."""
    # Echo of the input segment fields.
    segment_id: str
    page_url: str
    page_slug: str
    raw_html: str
    text_content: str
    position_hint: SegmentPosition

    # Classification output
    component_type: ComponentType
    # Which of the 4 pipeline stages resolved this segment.
    classification_stage: ClassificationStage
    confidence: float = Field(ge=0.0, le=1.0)

    # Fingerprint computed during pipeline
    fingerprint_hash: str = ""
    # L2 cluster the segment was matched to, if any.
    cluster_id: str | None = None

    # LLM metadata (populated only for stage=LLM)
    llm_model_used: str | None = None
    llm_raw_response: str | None = None
94
+
95
+
96
class FingerprintRecord(BaseModel):
    """Stored in L1 cache: fingerprint → classification."""
    fingerprint_hash: str
    component_type: ComponentType
    confidence: float
    hit_count: int = 1  # incremented by L1FingerprintCache.increment_hit
    example_segment_id: str = ""  # one representative segment id, for debugging
103
+
104
+
105
class ClusterRecord(BaseModel):
    """Stored in L2 cache: cluster of similar fingerprints."""
    cluster_id: str
    # Mean of member vectors; kept in sync with the cache's .npy matrix row.
    centroid_vector: list[float]
    component_type: ComponentType
    confidence: float
    member_fingerprints: list[str] = Field(default_factory=list)
112
+
113
+
114
class LLMClassificationRequest(BaseModel):
    """Batch item sent to LLM."""
    segment_id: str
    fingerprint_hash: str
    normalized_html: str  # skeleton only, no content
    position_hint: SegmentPosition
    sibling_count: int
    url_hints: list[str]
    # Structural features — used for model routing (see ModelFeatureConfig).
    dom_depth: int
    child_tag_counts: dict[str, int]
    text_density_ratio: float
125
+
126
+
127
class LLMClassificationResult(BaseModel):
    """Parsed result from LLM for one segment."""
    segment_id: str
    component_type: ComponentType
    confidence: float
    reasoning: str  # free-text rationale returned by the model
133
+
134
+
135
class PipelineResult(BaseModel):
    """Final output of the full classification pipeline run."""
    total_segments: int
    classified: list[ClassifiedSegment]
    # How many segments each stage resolved.
    stage_breakdown: dict[ClassificationStage, int]
    llm_calls_made: int
    llm_model_usage: dict[str, int]  # model_name → call count
    # NOTE(review): presumably the fraction of segments resolved without an
    # LLM call — confirm against the pipeline implementation.
    cache_hit_rate: float