segment_classifier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- segment_classifier-0.1.0/PKG-INFO +95 -0
- segment_classifier-0.1.0/README.md +74 -0
- segment_classifier-0.1.0/pyproject.toml +24 -0
- segment_classifier-0.1.0/segment_classifier/__init__.py +4 -0
- segment_classifier-0.1.0/segment_classifier/cache/__init__.py +4 -0
- segment_classifier-0.1.0/segment_classifier/cache/l1_cache.py +71 -0
- segment_classifier-0.1.0/segment_classifier/cache/l2_cache.py +157 -0
- segment_classifier-0.1.0/segment_classifier/config.py +53 -0
- segment_classifier-0.1.0/segment_classifier/models.py +142 -0
- segment_classifier-0.1.0/segment_classifier/pipeline.py +173 -0
- segment_classifier-0.1.0/segment_classifier/stages/__init__.py +6 -0
- segment_classifier-0.1.0/segment_classifier/stages/fingerprint.py +10 -0
- segment_classifier-0.1.0/segment_classifier/stages/fuzzy_cluster.py +101 -0
- segment_classifier-0.1.0/segment_classifier/stages/llm_classifier.py +271 -0
- segment_classifier-0.1.0/segment_classifier/stages/rule_based.py +287 -0
- segment_classifier-0.1.0/segment_classifier/utils/__init__.py +3 -0
- segment_classifier-0.1.0/segment_classifier/utils/html_normalizer.py +165 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: segment_classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Async segment classifier library
|
|
5
|
+
Author: Gagandeep Singh
|
|
6
|
+
Author-email: gagan@innerkore.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Requires-Dist: aiofiles (>=23.0,<24.0)
|
|
12
|
+
Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
|
|
13
|
+
Requires-Dist: litellm (>=1.40,<2.0)
|
|
14
|
+
Requires-Dist: lxml (>=5.0,<6.0)
|
|
15
|
+
Requires-Dist: numpy (>=1.26,<2.0)
|
|
16
|
+
Requires-Dist: pydantic (>=2.7,<3.0)
|
|
17
|
+
Requires-Dist: pydantic-settings (>=2.2,<3.0)
|
|
18
|
+
Requires-Dist: scikit-learn (>=1.5,<2.0)
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Segment Classifier
|
|
22
|
+
|
|
23
|
+
An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
|
|
28
|
+
|
|
29
|
+
1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
|
|
30
|
+
2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
|
|
31
|
+
3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
|
|
32
|
+
4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
You can install the package using poetry:
|
|
37
|
+
```bash
|
|
38
|
+
poetry install
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or via pip (once published):
|
|
42
|
+
```bash
|
|
43
|
+
pip install segment-classifier
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Setup
|
|
47
|
+
|
|
48
|
+
The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
|
|
49
|
+
|
|
50
|
+
Required environment variables:
|
|
51
|
+
```env
|
|
52
|
+
CLASSIFIER_LITELLM_API_KEY="your-api-key"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import asyncio
|
|
59
|
+
from segment_classifier import ClassifierPipeline
|
|
60
|
+
from segment_classifier.config import ClassifierSettings
|
|
61
|
+
from segment_classifier.models import InputSegment, SegmentPosition
|
|
62
|
+
|
|
63
|
+
async def main():
|
|
64
|
+
settings = ClassifierSettings()
|
|
65
|
+
pipeline = ClassifierPipeline(settings)
|
|
66
|
+
await pipeline.initialize()
|
|
67
|
+
|
|
68
|
+
segments = [
|
|
69
|
+
InputSegment(
|
|
70
|
+
segment_id="seg_001",
|
|
71
|
+
page_url="https://example.com/products",
|
|
72
|
+
page_slug="products",
|
|
73
|
+
raw_html="<div class='product-card'>...</div>",
|
|
74
|
+
text_content="Product Item",
|
|
75
|
+
position_hint=SegmentPosition.MIDDLE,
|
|
76
|
+
sibling_count=3,
|
|
77
|
+
)
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
result = await pipeline.run(segments)
|
|
81
|
+
await pipeline.shutdown()
|
|
82
|
+
|
|
83
|
+
for seg in result.classified:
|
|
84
|
+
print(seg.component_type)
|
|
85
|
+
|
|
86
|
+
asyncio.run(main())
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Caching
|
|
90
|
+
|
|
91
|
+
Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
|
|
92
|
+
|
|
93
|
+
## Stages Breakdown
|
|
94
|
+
Every returned `ClassifiedSegment` will be marked with a `classification_stage` indicating which of the 4 stages resolved the query.
|
|
95
|
+
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Segment Classifier
|
|
2
|
+
|
|
3
|
+
An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
|
|
8
|
+
|
|
9
|
+
1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
|
|
10
|
+
2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
|
|
11
|
+
3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
|
|
12
|
+
4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
You can install the package using poetry:
|
|
17
|
+
```bash
|
|
18
|
+
poetry install
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or via pip (once published):
|
|
22
|
+
```bash
|
|
23
|
+
pip install segment-classifier
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Setup
|
|
27
|
+
|
|
28
|
+
The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
|
|
29
|
+
|
|
30
|
+
Required environment variables:
|
|
31
|
+
```env
|
|
32
|
+
CLASSIFIER_LITELLM_API_KEY="your-api-key"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
import asyncio
|
|
39
|
+
from segment_classifier import ClassifierPipeline
|
|
40
|
+
from segment_classifier.config import ClassifierSettings
|
|
41
|
+
from segment_classifier.models import InputSegment, SegmentPosition
|
|
42
|
+
|
|
43
|
+
async def main():
|
|
44
|
+
settings = ClassifierSettings()
|
|
45
|
+
pipeline = ClassifierPipeline(settings)
|
|
46
|
+
await pipeline.initialize()
|
|
47
|
+
|
|
48
|
+
segments = [
|
|
49
|
+
InputSegment(
|
|
50
|
+
segment_id="seg_001",
|
|
51
|
+
page_url="https://example.com/products",
|
|
52
|
+
page_slug="products",
|
|
53
|
+
raw_html="<div class='product-card'>...</div>",
|
|
54
|
+
text_content="Product Item",
|
|
55
|
+
position_hint=SegmentPosition.MIDDLE,
|
|
56
|
+
sibling_count=3,
|
|
57
|
+
)
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
result = await pipeline.run(segments)
|
|
61
|
+
await pipeline.shutdown()
|
|
62
|
+
|
|
63
|
+
for seg in result.classified:
|
|
64
|
+
print(seg.component_type)
|
|
65
|
+
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Caching
|
|
70
|
+
|
|
71
|
+
Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
|
|
72
|
+
|
|
73
|
+
## Stages Breakdown
|
|
74
|
+
Every returned `ClassifiedSegment` will be marked with a `classification_stage` indicating which of the 4 stages resolved the query.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Poetry package definition for the segment_classifier library.
[tool.poetry]
name = "segment_classifier"
version = "0.1.0"
description = "Async segment classifier library"
authors = ["Gagandeep Singh <gagan@innerkore.com>"]
readme = "README.md"
packages = [
    { include = "segment_classifier" }
]

# Runtime dependencies (no dev group declared yet).
[tool.poetry.dependencies]
python = "^3.12"
beautifulsoup4 = "^4.12"
lxml = "^5.0"
pydantic = "^2.7"
pydantic-settings = "^2.2"
litellm = "^1.40"
scikit-learn = "^1.5"
numpy = "^1.26"
aiofiles = "^23.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import aiofiles
|
|
5
|
+
from pydantic import ValidationError
|
|
6
|
+
from segment_classifier.models import FingerprintRecord
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class L1FingerprintCache:
    """L1 exact-match fingerprint cache.

    Maps a structural DOM fingerprint hash to a previously computed
    FingerprintRecord. Entries live in memory and are written to a JSON
    snapshot every ``auto_persist_every`` mutations, or on an explicit
    ``persist()`` call.
    """

    def __init__(self, cache_path: str, auto_persist_every: int = 50):
        # Where the JSON snapshot of the cache is written.
        self._path = Path(cache_path)
        self._store: dict[str, "FingerprintRecord"] = {}
        # Guards _store and _write_count across concurrent coroutines.
        self._lock = asyncio.Lock()
        self._write_count = 0
        self._auto_persist_every = auto_persist_every

    async def load(self) -> None:
        """Load previously persisted records; start fresh on corruption.

        Parses the whole file into a temporary dict before swapping it
        in, so a mid-file JSON/validation error can never leave the
        in-memory store partially populated (the previous implementation
        inserted record-by-record and silently kept the partial state).
        """
        if not self._path.exists():
            return

        async with aiofiles.open(self._path, "r", encoding="utf-8") as f:
            content = await f.read()
        if not content.strip():
            return

        try:
            data = json.loads(content)
            loaded = {
                key: FingerprintRecord.model_validate(val)
                for key, val in data.items()
            }
        except (json.JSONDecodeError, ValidationError):
            # Corrupt cache file: ignore it and start fresh.
            # (A real deployment should log this.)
            return

        # Atomic swap under the same lock the accessors use.
        async with self._lock:
            self._store = loaded

    async def get(self, fingerprint_hash: str) -> "FingerprintRecord | None":
        """Return the cached record for a fingerprint hash, or None."""
        async with self._lock:
            return self._store.get(fingerprint_hash)

    async def set(self, fingerprint_hash: str, record: "FingerprintRecord") -> None:
        """Insert or overwrite a record, auto-persisting periodically."""
        async with self._lock:
            self._store[fingerprint_hash] = record
            await self._note_write_unsafe()

    async def increment_hit(self, fingerprint_hash: str) -> None:
        """Bump the hit counter of an existing record (no-op if absent)."""
        async with self._lock:
            record = self._store.get(fingerprint_hash)
            if record:
                record.hit_count += 1
                self._store[fingerprint_hash] = record
                await self._note_write_unsafe()

    async def _note_write_unsafe(self) -> None:
        """Count one mutation and persist when the threshold is reached.

        Caller must already hold self._lock. Shared by set() and
        increment_hit(), which previously duplicated this bookkeeping.
        """
        self._write_count += 1
        if self._write_count >= self._auto_persist_every:
            self._write_count = 0
            await self._persist_unsafe()

    async def _persist_unsafe(self) -> None:
        """Write the store to disk. Caller must already hold self._lock."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        data = {k: v.model_dump(mode="json") for k, v in self._store.items()}
        async with aiofiles.open(self._path, "w", encoding="utf-8") as f:
            await f.write(json.dumps(data, indent=2))

    async def persist(self) -> None:
        """Force an immediate persist and reset the write counter."""
        async with self._lock:
            self._write_count = 0
            await self._persist_unsafe()

    @property
    def size(self) -> int:
        # Number of fingerprints currently cached in memory.
        return len(self._store)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import aiofiles
|
|
6
|
+
import numpy as np
|
|
7
|
+
from pydantic import ValidationError
|
|
8
|
+
from segment_classifier.models import ClusterRecord, ComponentType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class L2FuzzyCache:
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
cache_path: str,
|
|
15
|
+
embeddings_path: str,
|
|
16
|
+
similarity_threshold: float = 0.85,
|
|
17
|
+
max_cluster_size: int = 50,
|
|
18
|
+
persist_on_update: bool = True
|
|
19
|
+
):
|
|
20
|
+
self._path = Path(cache_path)
|
|
21
|
+
self._embeddings_path = Path(embeddings_path)
|
|
22
|
+
self._similarity_threshold = similarity_threshold
|
|
23
|
+
self._max_cluster_size = max_cluster_size
|
|
24
|
+
self._persist_on_update = persist_on_update
|
|
25
|
+
|
|
26
|
+
self._store: list[ClusterRecord] = []
|
|
27
|
+
# Matrix storing centroids, parallel to self._store
|
|
28
|
+
self._centroids: np.ndarray | None = None
|
|
29
|
+
|
|
30
|
+
self._lock = asyncio.Lock()
|
|
31
|
+
|
|
32
|
+
async def load(self) -> None:
|
|
33
|
+
if not self._path.exists() or not self._embeddings_path.exists():
|
|
34
|
+
return
|
|
35
|
+
|
|
36
|
+
async with aiofiles.open(self._path, "r", encoding="utf-8") as f:
|
|
37
|
+
content = await f.read()
|
|
38
|
+
if content.strip():
|
|
39
|
+
try:
|
|
40
|
+
data = json.loads(content)
|
|
41
|
+
self._store = [ClusterRecord.model_validate(val) for val in data]
|
|
42
|
+
except (json.JSONDecodeError, ValidationError):
|
|
43
|
+
self._store = []
|
|
44
|
+
|
|
45
|
+
if self._store:
|
|
46
|
+
try:
|
|
47
|
+
self._centroids = np.load(str(self._embeddings_path))
|
|
48
|
+
if len(self._store) != self._centroids.shape[0]:
|
|
49
|
+
# Mismatch between JSON and npy, reset
|
|
50
|
+
self._store = []
|
|
51
|
+
self._centroids = None
|
|
52
|
+
except Exception:
|
|
53
|
+
self._store = []
|
|
54
|
+
self._centroids = None
|
|
55
|
+
|
|
56
|
+
async def find_nearest(self, vector: list[float], threshold: float | None = None) -> ClusterRecord | None:
|
|
57
|
+
async with self._lock:
|
|
58
|
+
if not self._store or self._centroids is None:
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
query = np.array(vector)
|
|
62
|
+
query_norm = np.linalg.norm(query)
|
|
63
|
+
if query_norm == 0:
|
|
64
|
+
return None
|
|
65
|
+
|
|
66
|
+
# Cosine similarity
|
|
67
|
+
dot_products = np.dot(self._centroids, query)
|
|
68
|
+
centroid_norms = np.linalg.norm(self._centroids, axis=1)
|
|
69
|
+
# Avoid division by zero
|
|
70
|
+
centroid_norms[centroid_norms == 0] = 1
|
|
71
|
+
|
|
72
|
+
similarities = dot_products / (centroid_norms * query_norm)
|
|
73
|
+
|
|
74
|
+
best_idx = np.argmax(similarities)
|
|
75
|
+
best_sim = similarities[best_idx]
|
|
76
|
+
|
|
77
|
+
check_threshold = threshold if threshold is not None else self._similarity_threshold
|
|
78
|
+
|
|
79
|
+
if best_sim >= check_threshold:
|
|
80
|
+
return self._store[best_idx]
|
|
81
|
+
return None
|
|
82
|
+
|
|
83
|
+
async def add_to_cluster(self, cluster_id: str, fingerprint_hash: str, vector: list[float]) -> None:
|
|
84
|
+
async with self._lock:
|
|
85
|
+
idx = next((i for i, c in enumerate(self._store) if c.cluster_id == cluster_id), None)
|
|
86
|
+
if idx is not None and self._centroids is not None:
|
|
87
|
+
cluster = self._store[idx]
|
|
88
|
+
|
|
89
|
+
# Check size
|
|
90
|
+
if len(cluster.member_fingerprints) < self._max_cluster_size:
|
|
91
|
+
if fingerprint_hash not in cluster.member_fingerprints:
|
|
92
|
+
cluster.member_fingerprints.append(fingerprint_hash)
|
|
93
|
+
|
|
94
|
+
# Update centroid (running mean)
|
|
95
|
+
n = len(cluster.member_fingerprints)
|
|
96
|
+
old_centroid = self._centroids[idx]
|
|
97
|
+
new_vec = np.array(vector)
|
|
98
|
+
# (old_centroid * (n-1) + new_vec) / n
|
|
99
|
+
new_centroid = (old_centroid * (n - 1) + new_vec) / n
|
|
100
|
+
|
|
101
|
+
self._centroids[idx] = new_centroid
|
|
102
|
+
cluster.centroid_vector = new_centroid.tolist()
|
|
103
|
+
|
|
104
|
+
if self._persist_on_update:
|
|
105
|
+
await self._persist_unsafe()
|
|
106
|
+
|
|
107
|
+
async def create_cluster(
|
|
108
|
+
self,
|
|
109
|
+
fingerprint_hash: str,
|
|
110
|
+
vector: list[float],
|
|
111
|
+
component_type: ComponentType,
|
|
112
|
+
confidence: float,
|
|
113
|
+
) -> ClusterRecord:
|
|
114
|
+
async with self._lock:
|
|
115
|
+
cluster_id = str(uuid.uuid4())
|
|
116
|
+
record = ClusterRecord(
|
|
117
|
+
cluster_id=cluster_id,
|
|
118
|
+
centroid_vector=vector,
|
|
119
|
+
component_type=component_type,
|
|
120
|
+
confidence=confidence,
|
|
121
|
+
member_fingerprints=[fingerprint_hash]
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
self._store.append(record)
|
|
125
|
+
|
|
126
|
+
new_vec = np.array([vector])
|
|
127
|
+
if self._centroids is None:
|
|
128
|
+
self._centroids = new_vec
|
|
129
|
+
else:
|
|
130
|
+
self._centroids = np.vstack([self._centroids, new_vec])
|
|
131
|
+
|
|
132
|
+
if self._persist_on_update:
|
|
133
|
+
await self._persist_unsafe()
|
|
134
|
+
|
|
135
|
+
return record
|
|
136
|
+
|
|
137
|
+
async def _persist_unsafe(self) -> None:
|
|
138
|
+
self._path.parent.mkdir(parents=True, exist_ok=True)
|
|
139
|
+
|
|
140
|
+
# Write JSON
|
|
141
|
+
data = [c.model_dump(mode="json") for c in self._store]
|
|
142
|
+
async with aiofiles.open(self._path, "w", encoding="utf-8") as f:
|
|
143
|
+
await f.write(json.dumps(data, indent=2))
|
|
144
|
+
|
|
145
|
+
# Write npy
|
|
146
|
+
if self._centroids is not None:
|
|
147
|
+
# We can't do aiofiles easily for numpy, doing it sync for now
|
|
148
|
+
# as it's a small matrix and we use it as memory store
|
|
149
|
+
np.save(str(self._embeddings_path), self._centroids)
|
|
150
|
+
|
|
151
|
+
async def persist(self) -> None:
|
|
152
|
+
async with self._lock:
|
|
153
|
+
await self._persist_unsafe()
|
|
154
|
+
|
|
155
|
+
@property
|
|
156
|
+
def size(self) -> int:
|
|
157
|
+
return len(self._store)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ModelFeatureConfig(BaseModel):
    """
    Feature-based LLM model routing.

    Selection priority (highest to lowest):
    1. high_complexity → use for ambiguous, deeply nested, multi-role segments
    2. standard → default for most unknown segments
    3. fast → simple segments with weak signals but not rule-matchable

    Complexity is determined by:
    - dom_depth > threshold
    - child_tag_counts diversity (many unique tags = complex)
    - text_density_ratio (very high or very low = complex)
    - sibling_count == 0 (one-off sections = complex)
    """

    # LiteLLM model identifiers, one per routing tier.
    high_complexity_model: str = "anthropic/claude-opus-4"
    standard_model: str = "anthropic/claude-sonnet-4-5"
    fast_model: str = "anthropic/claude-haiku-4-5"

    # Routing thresholds — see the docstring for how they combine.
    # NOTE(review): the exact combination logic lives in the llm_classifier
    # stage (not visible here); confirm these names match its usage.
    high_complexity_dom_depth_threshold: int = 6
    high_complexity_unique_tag_threshold: int = 8
    fast_model_max_dom_depth: int = 3
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CacheConfig(BaseModel):
    """File locations and tuning knobs for the L1/L2 caches."""

    l1_cache_path: str = ".cache/l1_fingerprints.json"
    l2_cache_path: str = ".cache/l2_clusters.json"
    # Centroid matrix stored alongside the cluster JSON (row-parallel,
    # see L2FuzzyCache).
    l2_embeddings_path: str = ".cache/l2_embeddings.npy"
    # Cosine-similarity floor for an L2 fuzzy-cache hit.
    l2_similarity_threshold: float = 0.85
    l2_max_cluster_size: int = 50
    # When True the L2 cache writes to disk on every mutation.
    persist_on_update: bool = True
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ClassifierSettings(BaseSettings):
    """Top-level settings, loaded from a `.env` file or the environment.

    Environment variables use the ``CLASSIFIER_`` prefix, e.g.
    ``CLASSIFIER_LITELLM_API_KEY``.
    """

    model_config = SettingsConfigDict(env_file=".env", env_prefix="CLASSIFIER_")

    # LiteLLM
    litellm_api_key: str = ""
    litellm_batch_size: int = 20  # max segments per LLM batch call
    litellm_max_concurrent_batches: int = 5
    litellm_timeout_seconds: int = 60

    # Pipeline
    # NOTE(review): these look like per-stage confidence floors the
    # pipeline requires before accepting a stage's answer — confirm
    # against pipeline.py (not visible here).
    rule_based_confidence_threshold: float = 0.90
    l1_min_confidence: float = 0.85
    l2_min_confidence: float = 0.75

    model_routing: ModelFeatureConfig = ModelFeatureConfig()
    cache: CacheConfig = CacheConfig()
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Any
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ClassificationStage(str, Enum):
    """Which pipeline stage produced a classification (cheapest first)."""

    RULE_BASED = "rule_based"
    L1_EXACT_CACHE = "l1_exact_cache"
    L2_FUZZY_CACHE = "l2_fuzzy_cache"
    LLM = "llm"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SegmentPosition(str, Enum):
    """Coarse vertical position of a segment within its page."""

    TOP = "top"  # top 5% of page
    BOTTOM = "bottom"  # bottom 10% of page
    MIDDLE = "middle"
    UNKNOWN = "unknown"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ComponentType(str, Enum):
    """Closed taxonomy of component types, namespaced as ``category.name``."""

    # Layout
    LAYOUT_HEADER = "layout.header"
    LAYOUT_FOOTER = "layout.footer"
    LAYOUT_NAV = "layout.nav"
    LAYOUT_SIDEBAR = "layout.sidebar"
    LAYOUT_BREADCRUMB = "layout.breadcrumb"

    # Collections
    COLLECTION_PRODUCT_CARD = "collection.product_card"
    COLLECTION_PRODUCT_LIST = "collection.product_list"
    COLLECTION_BLOG_CARD = "collection.blog_card"
    COLLECTION_BLOG_LIST = "collection.blog_list"
    COLLECTION_NEWS_ITEM = "collection.news_item"
    COLLECTION_NEWS_LIST = "collection.news_list"

    # Sections
    SECTION_HERO = "section.hero"
    SECTION_FEATURE_GRID = "section.feature_grid"
    SECTION_TESTIMONIAL = "section.testimonial"
    SECTION_CTA = "section.cta"
    SECTION_FAQ = "section.faq"
    SECTION_PRICING = "section.pricing"

    # UI Elements
    UI_FORM = "ui.form"
    UI_MODAL = "ui.modal"
    UI_TABLE = "ui.table"
    UI_CAROUSEL = "ui.carousel"
    UI_PAGINATION = "ui.pagination"
    UI_SEARCH = "ui.search"

    # Content
    CONTENT_ARTICLE = "content.article"
    CONTENT_RICH_TEXT = "content.rich_text"
    CONTENT_MEDIA = "content.media"

    # Fallback when no stage could classify the segment.
    UNKNOWN = "unknown"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class InputSegment(BaseModel):
    """Raw segment from the page-segmenter tool."""

    segment_id: str
    page_url: str
    page_slug: str
    raw_html: str
    # Plain-text content of the segment — presumably extracted from
    # raw_html by the segmenter; confirm upstream.
    text_content: str
    position_hint: SegmentPosition = SegmentPosition.UNKNOWN
    dom_position: str = ""  # CSS selector path e.g. "main > section:nth-child(2)"
    sibling_count: int = 0  # how many same-fingerprint siblings on same page
    url_path_segments: list[str] = Field(default_factory=list)  # e.g. ["products", "shoes"]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ClassifiedSegment(BaseModel):
    """Segment with classification result and metadata.

    Echoes the InputSegment fields it was derived from, plus the
    classification output and per-stage metadata.
    """

    segment_id: str
    page_url: str
    page_slug: str
    raw_html: str
    text_content: str
    position_hint: SegmentPosition

    # Classification output
    component_type: ComponentType
    classification_stage: ClassificationStage
    confidence: float = Field(ge=0.0, le=1.0)

    # Fingerprint computed during pipeline ("" until computed)
    fingerprint_hash: str = ""
    cluster_id: str | None = None  # set when an L2 cluster was involved

    # LLM metadata (populated only for stage=LLM)
    llm_model_used: str | None = None
    llm_raw_response: str | None = None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class FingerprintRecord(BaseModel):
    """Stored in L1 cache: fingerprint → classification."""

    fingerprint_hash: str
    component_type: ComponentType
    confidence: float
    # Bumped by L1FingerprintCache.increment_hit on each cache hit.
    hit_count: int = 1
    example_segment_id: str = ""
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class ClusterRecord(BaseModel):
    """Stored in L2 cache: cluster of similar fingerprints."""

    cluster_id: str
    # Running-mean centroid of member vectors (maintained by
    # L2FuzzyCache.add_to_cluster).
    centroid_vector: list[float]
    component_type: ComponentType
    confidence: float
    member_fingerprints: list[str] = Field(default_factory=list)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class LLMClassificationRequest(BaseModel):
    """Batch item sent to LLM."""

    segment_id: str
    fingerprint_hash: str
    normalized_html: str  # skeleton only, no content
    position_hint: SegmentPosition
    sibling_count: int
    url_hints: list[str]
    # Structural features — also drive model routing (see
    # ModelFeatureConfig complexity criteria).
    dom_depth: int
    child_tag_counts: dict[str, int]
    # Presumably text-chars over HTML size; computed in the fingerprint
    # stage (not visible here) — confirm.
    text_density_ratio: float
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class LLMClassificationResult(BaseModel):
    """Parsed result from LLM for one segment."""

    segment_id: str
    component_type: ComponentType
    confidence: float
    reasoning: str  # free-text field parsed from the LLM response
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class PipelineResult(BaseModel):
    """Final output of the full classification pipeline run."""

    total_segments: int
    classified: list[ClassifiedSegment]
    stage_breakdown: dict[ClassificationStage, int]  # segments resolved per stage
    llm_calls_made: int
    llm_model_usage: dict[str, int]  # model_name → call count
    # Presumably the fraction of segments served without an LLM call —
    # confirm the exact definition in pipeline.py.
    cache_hit_rate: float
|