segment_classifier 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- segment_classifier/__init__.py +4 -0
- segment_classifier/cache/__init__.py +4 -0
- segment_classifier/cache/l1_cache.py +71 -0
- segment_classifier/cache/l2_cache.py +157 -0
- segment_classifier/config.py +53 -0
- segment_classifier/models.py +142 -0
- segment_classifier/pipeline.py +173 -0
- segment_classifier/stages/__init__.py +6 -0
- segment_classifier/stages/fingerprint.py +10 -0
- segment_classifier/stages/fuzzy_cluster.py +101 -0
- segment_classifier/stages/llm_classifier.py +271 -0
- segment_classifier/stages/rule_based.py +287 -0
- segment_classifier/utils/__init__.py +3 -0
- segment_classifier/utils/html_normalizer.py +165 -0
- segment_classifier-0.1.0.dist-info/METADATA +95 -0
- segment_classifier-0.1.0.dist-info/RECORD +17 -0
- segment_classifier-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Class-name tokens that carry structural/semantic meaning (what a component
# *is*: card, nav, product, ...). Classes matching this pattern are kept as
# part of the structural fingerprint. Case-insensitive.
STRUCTURAL_CLASS_PATTERN = re.compile(
    r'\b(card|grid|list|item|hero|nav|menu|header|footer|sidebar|'
    r'form|modal|badge|price|rating|carousel|pagination|search|'
    r'feature|testimonial|cta|faq|pricing|article|media|table|'
    r'product|blog|news|collection|section|widget)\b',
    re.IGNORECASE
)

# Utility/presentational class fragments (Tailwind-style spacing, sizing,
# color and layout tokens: mt, px, w-, bg-, ...). Classes matching this are
# excluded from the fingerprint even when they also match the structural
# pattern above. NOTE(review): this pattern is case-sensitive while the
# structural one is case-insensitive — confirm the asymmetry is deliberate.
PRESENTATIONAL_CLASS_PATTERN = re.compile(
    r'\b(mt|mb|ml|mr|mx|my|pt|pb|pl|pr|px|py|w-|h-|text-|bg-|'
    r'border|rounded|shadow|flex|grid-cols|gap|p-|m-|font-|'
    r'color|opacity|z-|hidden|block|inline)\b'
)

# Attribute names whose *values* are folded into the attrs fingerprint.
# All other attributes are ignored, except href/src which are tracked as
# presence booleans only (their URL values are too volatile to fingerprint).
STRUCTURAL_ATTRS = {"role", "type", "aria-label", "aria-role", "data-component", "data-type"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class NormalizedSegment:
    """Structural fingerprint of one HTML segment.

    Captures DOM shape (skeleton, depth, per-tag counts), semantic
    attribute values, and structural class names so that structurally
    identical components hash to the same fingerprint.
    """

    skeleton: str                    # tag-name-only tree, e.g. "div>h2+p"
    attrs_fingerprint: str           # "|"-joined per-tag structural attrs
    class_tokens: list[str]          # structural (non-utility) class names
    child_tag_counts: dict[str, int] # tag name -> occurrences in subtree
    dom_depth: int                   # maximum nesting depth
    root_tag: str                    # name of the segment's root element
    text_density_ratio: float        # len(text) / len(html), rounded
    unique_tag_count: int            # number of distinct tag names

    def fingerprint_hash(self) -> str:
        """Return a stable SHA-256 hex digest of the structural fields.

        Class tokens are sorted and JSON keys are serialized with
        sort_keys=True, so the digest does not depend on insertion order.
        text_density_ratio and unique_tag_count intentionally do not
        participate in the hash.
        """
        canonical = json.dumps(
            {
                "skeleton": self.skeleton,
                "attrs": self.attrs_fingerprint,
                "classes": sorted(self.class_tokens),
                "counts": self.child_tag_counts,
                "depth": self.dom_depth,
                "root": self.root_tag,
            },
            sort_keys=True,
        )
        return hashlib.sha256(canonical.encode()).hexdigest()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def normalize_segment(html: str, text_content: str) -> NormalizedSegment:
    """
    Parse HTML and extract structural fingerprint components.

    Returns a sentinel segment (root_tag="unknown", everything empty/zero)
    when the markup has no usable root tag or the root tag is empty.
    """
    root = BeautifulSoup(html, "html.parser").find()
    # NOTE: `not root` is meaningful for bs4 — an empty Tag is falsy —
    # so both halves of the guard are kept.
    if not (root and isinstance(root, Tag)):
        return NormalizedSegment("", "", [], {}, 0, "unknown", 0.0, 0)

    tag_counts = _count_tags(root)
    # max(..., 1) guards against division by zero on empty markup.
    density = len(text_content) / max(len(html), 1)

    return NormalizedSegment(
        skeleton=_extract_skeleton(root),
        attrs_fingerprint=_extract_attrs_fingerprint(root),
        class_tokens=_extract_class_tokens(root),
        child_tag_counts=tag_counts,
        dom_depth=_max_depth(root),
        root_tag=root.name,
        text_density_ratio=round(density, 4),
        unique_tag_count=len(tag_counts),
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _extract_skeleton(tag: Tag, depth: int = 0, max_depth: int = 8) -> str:
    """Recursive tag-name-only skeleton. Siblings joined with '+', children with '>'.

    Recursion stops at max_depth levels; deeper subtrees collapse to just
    their root tag name, bounding skeleton size on pathological DOMs.
    """
    if depth >= max_depth:
        return tag.name

    parts = [
        _extract_skeleton(child, depth + 1, max_depth)
        for child in tag.children
        if isinstance(child, Tag)
    ]
    if not parts:
        return tag.name
    return "{}>{}".format(tag.name, "+".join(parts))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _extract_attrs_fingerprint(tag: Tag) -> str:
    """Walk all tags, keep only STRUCTURAL_ATTRS values and href/src presence booleans.

    Each contributing tag renders as "name[k=v,...]" with entries sorted;
    tags are joined with '|' in document (pre-order) order. Tags with no
    structural attrs and no href/src contribute nothing.
    """
    rendered: list[str] = []

    def visit(node: Tag) -> None:
        entries = []
        for attr in STRUCTURAL_ATTRS:
            value = node.get(attr)
            if value:
                # bs4 returns multi-valued attributes (e.g. rel) as lists.
                if isinstance(value, list):
                    value = " ".join(value)
                entries.append(f"{attr}={value}")

        # Track link/media presence without fingerprinting volatile URLs.
        entries.extend(
            f"has_{name}=true" for name in ("href", "src") if node.has_attr(name)
        )

        if entries:
            rendered.append(f"{node.name}[" + ",".join(sorted(entries)) + "]")

        for child in node.children:
            if isinstance(child, Tag):
                visit(child)

    visit(tag)
    return "|".join(rendered)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _extract_class_tokens(tag: Tag) -> list[str]:
    """Extract class names matching STRUCTURAL_CLASS_PATTERN from all tags.

    Classes that also match PRESENTATIONAL_CLASS_PATTERN (utility classes)
    are excluded. Results are de-duplicated via a set; ordering of the
    returned list is therefore unspecified.
    """
    found: set[str] = set()
    stack = [tag]
    while stack:
        node = stack.pop()
        classes = node.get("class", [])
        if isinstance(classes, str):
            classes = [classes]
        found.update(
            cls
            for cls in classes
            if STRUCTURAL_CLASS_PATTERN.search(cls)
            and not PRESENTATIONAL_CLASS_PATTERN.search(cls)
        )
        # Reversed so pop() visits children in document order (pre-order),
        # matching the original recursive traversal.
        stack.extend(reversed([c for c in node.children if isinstance(c, Tag)]))
    return list(found)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _count_tags(tag: Tag) -> dict[str, int]:
    """Count occurrences of each tag name in the full subtree, root included."""
    counts: dict[str, int] = {}
    stack = [tag]
    while stack:
        node = stack.pop()
        counts[node.name] = counts.get(node.name, 0) + 1
        # Reversed so pop() walks the tree in pre-order, preserving the
        # dict insertion order the original recursive walk produced.
        stack.extend(reversed([c for c in node.children if isinstance(c, Tag)]))
    return counts
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _max_depth(tag: Tag, current: int = 0) -> int:
    """Return the maximum nesting depth of the subtree, measured from *current*.

    A leaf tag yields *current* itself (depth 0 for a bare root).
    """
    deepest = current
    for child in tag.children:
        if isinstance(child, Tag):
            deepest = max(deepest, _max_depth(child, current + 1))
    return deepest
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: segment_classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Async segment classifier library
|
|
5
|
+
Author: Gagandeep Singh
|
|
6
|
+
Author-email: gagan@innerkore.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Requires-Dist: aiofiles (>=23.0,<24.0)
|
|
12
|
+
Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
|
|
13
|
+
Requires-Dist: litellm (>=1.40,<2.0)
|
|
14
|
+
Requires-Dist: lxml (>=5.0,<6.0)
|
|
15
|
+
Requires-Dist: numpy (>=1.26,<2.0)
|
|
16
|
+
Requires-Dist: pydantic (>=2.7,<3.0)
|
|
17
|
+
Requires-Dist: pydantic-settings (>=2.2,<3.0)
|
|
18
|
+
Requires-Dist: scikit-learn (>=1.5,<2.0)
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Segment Classifier
|
|
22
|
+
|
|
23
|
+
An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
|
|
28
|
+
|
|
29
|
+
1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
|
|
30
|
+
2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
|
|
31
|
+
3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
|
|
32
|
+
4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
You can install the package using poetry:
|
|
37
|
+
```bash
|
|
38
|
+
poetry install
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or via pip (once published):
|
|
42
|
+
```bash
|
|
43
|
+
pip install segment-classifier
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Setup
|
|
47
|
+
|
|
48
|
+
The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
|
|
49
|
+
|
|
50
|
+
Required environment variables:
|
|
51
|
+
```env
|
|
52
|
+
CLASSIFIER_LITELLM_API_KEY="your-api-key"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import asyncio
|
|
59
|
+
from segment_classifier import ClassifierPipeline
|
|
60
|
+
from segment_classifier.config import ClassifierSettings
|
|
61
|
+
from segment_classifier.models import InputSegment, SegmentPosition
|
|
62
|
+
|
|
63
|
+
async def main():
|
|
64
|
+
settings = ClassifierSettings()
|
|
65
|
+
pipeline = ClassifierPipeline(settings)
|
|
66
|
+
await pipeline.initialize()
|
|
67
|
+
|
|
68
|
+
segments = [
|
|
69
|
+
InputSegment(
|
|
70
|
+
segment_id="seg_001",
|
|
71
|
+
page_url="https://example.com/products",
|
|
72
|
+
page_slug="products",
|
|
73
|
+
raw_html="<div class='product-card'>...</div>",
|
|
74
|
+
text_content="Product Item",
|
|
75
|
+
position_hint=SegmentPosition.MIDDLE,
|
|
76
|
+
sibling_count=3,
|
|
77
|
+
)
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
result = await pipeline.run(segments)
|
|
81
|
+
await pipeline.shutdown()
|
|
82
|
+
|
|
83
|
+
for seg in result.classified:
|
|
84
|
+
print(seg.component_type)
|
|
85
|
+
|
|
86
|
+
asyncio.run(main())
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Caching
|
|
90
|
+
|
|
91
|
+
Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
|
|
92
|
+
|
|
93
|
+
## Stages Breakdown
|
|
94
|
+
Every returned `ClassifiedSegment` carries a `classification_stage` field indicating which of the 4 pipeline stages produced its classification.
|
|
95
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
segment_classifier/__init__.py,sha256=oMPRBM2ig-EBqXicWyrRabDqRF1sZ9cDcN5TOEgJY1M,136
|
|
2
|
+
segment_classifier/cache/__init__.py,sha256=XAlXVuoePqSVjL_oteuAbkFp038MYFmuXBwPTVOSAIk,126
|
|
3
|
+
segment_classifier/cache/l1_cache.py,sha256=9YWg5-I5RAmUA8pEUwcwDvKYeO2njLAnvF4iy0LO_cQ,2650
|
|
4
|
+
segment_classifier/cache/l2_cache.py,sha256=cbHksUE8dOUhwpycEIz8h9JaiNR9BRAYPfFJcoQhcFc,5729
|
|
5
|
+
segment_classifier/config.py,sha256=LwVhFZ5uJQqt4eXGwLzX7e-2eLy6SpiUTL1KR382q3w,1871
|
|
6
|
+
segment_classifier/models.py,sha256=p3ebmFim1q3G0qYM-AleV4OwHmLnbQVkgsPgf92YgpQ,4122
|
|
7
|
+
segment_classifier/pipeline.py,sha256=4R5IQcSMF1tSK5IkDQ8W8vbbpcqMMCUGGl0fpY43Sb8,7210
|
|
8
|
+
segment_classifier/stages/__init__.py,sha256=5s_AD_v3uTK0VzXqYKChOyhUiQOTotTp5iPOQUB5TyE,282
|
|
9
|
+
segment_classifier/stages/fingerprint.py,sha256=1JgS1WrpnD5bxAU0j1nWX-QH4Gv3ulMmoz7RhRP0IRk,483
|
|
10
|
+
segment_classifier/stages/fuzzy_cluster.py,sha256=uIPo2EgE0oyQqTx_NHlUZL1pYEjRL36JhoVBdttcwbE,4409
|
|
11
|
+
segment_classifier/stages/llm_classifier.py,sha256=lQl-bh0tlRrAqhBJQsKeW1hYLM1WV93kE9xPCzzZS6k,10874
|
|
12
|
+
segment_classifier/stages/rule_based.py,sha256=qTwvaMr_zSi5HHWh2S6PE_2QANstdpgASDAtr84N29U,11664
|
|
13
|
+
segment_classifier/utils/__init__.py,sha256=dvp1CDEiSMVjUIzDAy_DRmYQQolkPWOIbTrUgA7qLnQ,120
|
|
14
|
+
segment_classifier/utils/html_normalizer.py,sha256=VTD49C0QLLOWTKc0le8ta3jlqg4Ln-7tJ1tuSG35J_E,4981
|
|
15
|
+
segment_classifier-0.1.0.dist-info/METADATA,sha256=mM6Rr0nyjf8NMRRy22xQESkRvTK22ehAcZLoQCITY6U,2943
|
|
16
|
+
segment_classifier-0.1.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
|
17
|
+
segment_classifier-0.1.0.dist-info/RECORD,,
|