segment_classifier 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ import hashlib
2
+ import json
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from bs4 import BeautifulSoup, Tag, NavigableString
6
+
7
+
8
# Class-name tokens that imply layout/semantic structure (matched case-insensitively).
STRUCTURAL_CLASS_PATTERN = re.compile(
    r'\b(card|grid|list|item|hero|nav|menu|header|footer|sidebar|'
    r'form|modal|badge|price|rating|carousel|pagination|search|'
    r'feature|testimonial|cta|faq|pricing|article|media|table|'
    r'product|blog|news|collection|section|widget)\b',
    re.IGNORECASE
)

# Utility-style (Tailwind-like) tokens that carry no structural meaning.
# NOTE: deliberately case-sensitive, unlike the structural pattern above.
PRESENTATIONAL_CLASS_PATTERN = re.compile(
    r'\b(mt|mb|ml|mr|mx|my|pt|pb|pl|pr|px|py|w-|h-|text-|bg-|'
    r'border|rounded|shadow|flex|grid-cols|gap|p-|m-|font-|'
    r'color|opacity|z-|hidden|block|inline)\b'
)

# Attribute names whose values are copied verbatim into the structural
# fingerprint (see _extract_attrs_fingerprint); href/src are tracked only
# as presence booleans there.
STRUCTURAL_ATTRS = {"role", "type", "aria-label", "aria-role", "data-component", "data-type"}
23
+
24
+
25
@dataclass
class NormalizedSegment:
    """Structural summary of one HTML segment, used for cache fingerprinting."""

    skeleton: str                    # tag-name-only tree, e.g. "div>h2+ul"
    attrs_fingerprint: str           # serialized structural attributes
    class_tokens: list[str]          # structural class names found in the subtree
    child_tag_counts: dict[str, int] # tag name -> occurrence count (root included)
    dom_depth: int                   # maximum nesting depth below the root
    root_tag: str                    # name of the segment's root element
    text_density_ratio: float        # len(text) / len(html), rounded
    unique_tag_count: int            # number of distinct tag names

    def fingerprint_hash(self) -> str:
        """Return a stable SHA-256 hex digest of the structural fields.

        text_density_ratio and unique_tag_count are intentionally excluded:
        two segments with the same structure but different text amounts
        should collide. Class tokens are sorted and JSON keys are sorted so
        the digest does not depend on insertion order.
        """
        canonical = json.dumps(
            {
                "skeleton": self.skeleton,
                "attrs": self.attrs_fingerprint,
                "classes": sorted(self.class_tokens),
                "counts": self.child_tag_counts,
                "depth": self.dom_depth,
                "root": self.root_tag,
            },
            sort_keys=True,
        )
        return hashlib.sha256(canonical.encode()).hexdigest()
48
+
49
+
50
def normalize_segment(html: str, text_content: str) -> NormalizedSegment:
    """
    Parse HTML and extract structural fingerprint components.

    Parameters:
        html: raw HTML for a single page segment.
        text_content: pre-extracted visible text; used only to compute the
            text-density ratio.

    Returns:
        A populated NormalizedSegment, or a sentinel segment with
        root_tag "unknown" when the markup contains no element at all.
    """
    soup = BeautifulSoup(html, "html.parser")
    root = soup.find()
    # BUG FIX: bs4 Tag truthiness is based on child count (an empty or
    # void element such as <img> or <div></div> is falsy), so the previous
    # `if not root` wrongly discarded childless roots. Compare to None
    # explicitly; the isinstance check guards against non-Tag results.
    if root is None or not isinstance(root, Tag):
        return NormalizedSegment("", "", [], {}, 0, "unknown", 0.0, 0)

    child_counts = _count_tags(root)
    return NormalizedSegment(
        skeleton=_extract_skeleton(root),
        attrs_fingerprint=_extract_attrs_fingerprint(root),
        class_tokens=_extract_class_tokens(root),
        child_tag_counts=child_counts,
        dom_depth=_max_depth(root),
        root_tag=root.name,
        # max(..., 1) guards against division by zero on empty html.
        text_density_ratio=round(len(text_content) / max(len(html), 1), 4),
        unique_tag_count=len(child_counts),
    )
77
+
78
+
79
def _extract_skeleton(tag: Tag, depth: int = 0, max_depth: int = 8) -> str:
    """Build a tag-name-only skeleton of the subtree.

    Children are joined with '>' below their parent and '+' between
    siblings; recursion stops at *max_depth* levels, where a node is
    represented by its bare tag name.
    """
    if depth >= max_depth:
        return tag.name

    nested = [
        _extract_skeleton(node, depth + 1, max_depth)
        for node in tag.children
        if isinstance(node, Tag)
    ]
    # Leaf (no element children): just the tag name, no '>' suffix.
    return f"{tag.name}>" + "+".join(nested) if nested else tag.name
93
+
94
+
95
def _extract_attrs_fingerprint(tag: Tag) -> str:
    """Serialize structural attributes of every tag in the subtree.

    Only attributes listed in STRUCTURAL_ATTRS keep their values; href and
    src contribute presence booleans. Tags with nothing structural are
    omitted. Nodes are visited in pre-order, and each node's entries are
    sorted, so the result is deterministic.
    """
    fragments: list[str] = []
    stack: list[Tag] = [tag]

    while stack:
        node = stack.pop()

        entries: list[str] = []
        for attr in STRUCTURAL_ATTRS:
            value = node.get(attr)
            if value:
                # Multi-valued attributes come back as lists; flatten them.
                if isinstance(value, list):
                    value = " ".join(value)
                entries.append(f"{attr}={value}")

        for flag in ("href", "src"):
            if node.has_attr(flag):
                entries.append(f"has_{flag}=true")

        if entries:
            fragments.append(f"{node.name}[" + ",".join(sorted(entries)) + "]")

        # Push children reversed so the leftmost child is popped first
        # (preserves pre-order traversal).
        stack.extend(reversed([c for c in node.children if isinstance(c, Tag)]))

    return "|".join(fragments)
122
+
123
+
124
def _extract_class_tokens(tag: Tag) -> list[str]:
    """Collect structural class names from every tag in the subtree.

    A class name is kept when it matches STRUCTURAL_CLASS_PATTERN and does
    not also match PRESENTATIONAL_CLASS_PATTERN.

    Returns:
        A sorted, de-duplicated list of class names. Sorting fixes a
        determinism defect: the previous ``list(set)`` ordering depended on
        string-hash randomization, so identical input produced differently
        ordered NormalizedSegment.class_tokens across interpreter runs.
    """
    tokens: set[str] = set()

    def walk(node: Tag) -> None:
        classes = node.get("class", [])
        # The class attribute may come back as a bare string.
        if isinstance(classes, str):
            classes = [classes]
        for name in classes:
            if STRUCTURAL_CLASS_PATTERN.search(name) and not PRESENTATIONAL_CLASS_PATTERN.search(name):
                tokens.add(name)
        for child in node.children:
            if isinstance(child, Tag):
                walk(child)

    walk(tag)
    return sorted(tokens)
141
+
142
+
143
def _count_tags(tag: Tag) -> dict[str, int]:
    """Tally tag-name frequencies across the subtree, root included."""
    tally: dict[str, int] = {}
    pending: list[Tag] = [tag]

    while pending:
        node = pending.pop()
        tally[node.name] = tally.get(node.name, 0) + 1
        # Reversed push keeps pre-order visitation (matches recursive DFS).
        pending.extend(reversed([c for c in node.children if isinstance(c, Tag)]))

    return tally
157
+
158
+
159
def _max_depth(tag: Tag, current: int = 0) -> int:
    """Return the maximum nesting depth of element children below *tag*.

    A leaf contributes *current* itself, so a childless root yields 0.
    """
    deepest = current
    for node in tag.children:
        if isinstance(node, Tag):
            deepest = max(deepest, _max_depth(node, current + 1))
    return deepest
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.3
2
+ Name: segment_classifier
3
+ Version: 0.1.0
4
+ Summary: Async segment classifier library
5
+ Author: Gagandeep Singh
6
+ Author-email: gagan@innerkore.com
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiofiles (>=23.0,<24.0)
12
+ Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
13
+ Requires-Dist: litellm (>=1.40,<2.0)
14
+ Requires-Dist: lxml (>=5.0,<6.0)
15
+ Requires-Dist: numpy (>=1.26,<2.0)
16
+ Requires-Dist: pydantic (>=2.7,<3.0)
17
+ Requires-Dist: pydantic-settings (>=2.2,<3.0)
18
+ Requires-Dist: scikit-learn (>=1.5,<2.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Segment Classifier
22
+
23
+ An asynchronous Python library that classifies HTML segments extracted by a page-segmenter into structured component types.
24
+
25
+ ## Overview
26
+
27
+ The `segment_classifier` implements a 4-stage classification pipeline with progressive fallback to optimize for cost and speed:
28
+
29
+ 1. **Rule-based heuristics** — Zero LLM cost. Uses DOM structure, text density, siblings, and attributes.
30
+ 2. **L1 exact fingerprint cache** — Zero LLM cost. Exact matching on structural DOM fingerprint hashes.
31
+ 3. **L2 fuzzy cluster cache** — Zero LLM cost. TF-IDF and cosine similarity on fingerprint tokens.
32
+ 4. **LLM batch classification** — Batched fallback via LiteLLM with feature-based model routing based on segment complexity.
33
+
34
+ ## Installation
35
+
36
+ You can install the package using poetry:
37
+ ```bash
38
+ poetry install
39
+ ```
40
+
41
+ Or via pip (once published):
42
+ ```bash
43
+ pip install segment-classifier
44
+ ```
45
+
46
+ ## Setup
47
+
48
+ The library uses `pydantic-settings` to manage configuration via a `.env` file or environment variables.
49
+
50
+ Required environment variables:
51
+ ```env
52
+ CLASSIFIER_LITELLM_API_KEY="your-api-key"
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```python
58
+ import asyncio
59
+ from segment_classifier import ClassifierPipeline
60
+ from segment_classifier.config import ClassifierSettings
61
+ from segment_classifier.models import InputSegment, SegmentPosition
62
+
63
+ async def main():
64
+ settings = ClassifierSettings()
65
+ pipeline = ClassifierPipeline(settings)
66
+ await pipeline.initialize()
67
+
68
+ segments = [
69
+ InputSegment(
70
+ segment_id="seg_001",
71
+ page_url="https://example.com/products",
72
+ page_slug="products",
73
+ raw_html="<div class='product-card'>...</div>",
74
+ text_content="Product Item",
75
+ position_hint=SegmentPosition.MIDDLE,
76
+ sibling_count=3,
77
+ )
78
+ ]
79
+
80
+ result = await pipeline.run(segments)
81
+ await pipeline.shutdown()
82
+
83
+ for seg in result.classified:
84
+ print(seg.component_type)
85
+
86
+ asyncio.run(main())
87
+ ```
88
+
89
+ ## Caching
90
+
91
+ Caches are stored by default in `.cache/l1_fingerprints.json` and `.cache/l2_clusters.json` / `.cache/l2_embeddings.npy`.
92
+
93
+ ## Stages Breakdown
94
+ Every returned `ClassifiedSegment` is marked with a `classification_stage` indicating which of the 4 stages resolved that segment.
95
+
@@ -0,0 +1,17 @@
1
+ segment_classifier/__init__.py,sha256=oMPRBM2ig-EBqXicWyrRabDqRF1sZ9cDcN5TOEgJY1M,136
2
+ segment_classifier/cache/__init__.py,sha256=XAlXVuoePqSVjL_oteuAbkFp038MYFmuXBwPTVOSAIk,126
3
+ segment_classifier/cache/l1_cache.py,sha256=9YWg5-I5RAmUA8pEUwcwDvKYeO2njLAnvF4iy0LO_cQ,2650
4
+ segment_classifier/cache/l2_cache.py,sha256=cbHksUE8dOUhwpycEIz8h9JaiNR9BRAYPfFJcoQhcFc,5729
5
+ segment_classifier/config.py,sha256=LwVhFZ5uJQqt4eXGwLzX7e-2eLy6SpiUTL1KR382q3w,1871
6
+ segment_classifier/models.py,sha256=p3ebmFim1q3G0qYM-AleV4OwHmLnbQVkgsPgf92YgpQ,4122
7
+ segment_classifier/pipeline.py,sha256=4R5IQcSMF1tSK5IkDQ8W8vbbpcqMMCUGGl0fpY43Sb8,7210
8
+ segment_classifier/stages/__init__.py,sha256=5s_AD_v3uTK0VzXqYKChOyhUiQOTotTp5iPOQUB5TyE,282
9
+ segment_classifier/stages/fingerprint.py,sha256=1JgS1WrpnD5bxAU0j1nWX-QH4Gv3ulMmoz7RhRP0IRk,483
10
+ segment_classifier/stages/fuzzy_cluster.py,sha256=uIPo2EgE0oyQqTx_NHlUZL1pYEjRL36JhoVBdttcwbE,4409
11
+ segment_classifier/stages/llm_classifier.py,sha256=lQl-bh0tlRrAqhBJQsKeW1hYLM1WV93kE9xPCzzZS6k,10874
12
+ segment_classifier/stages/rule_based.py,sha256=qTwvaMr_zSi5HHWh2S6PE_2QANstdpgASDAtr84N29U,11664
13
+ segment_classifier/utils/__init__.py,sha256=dvp1CDEiSMVjUIzDAy_DRmYQQolkPWOIbTrUgA7qLnQ,120
14
+ segment_classifier/utils/html_normalizer.py,sha256=VTD49C0QLLOWTKc0le8ta3jlqg4Ln-7tJ1tuSG35J_E,4981
15
+ segment_classifier-0.1.0.dist-info/METADATA,sha256=mM6Rr0nyjf8NMRRy22xQESkRvTK22ehAcZLoQCITY6U,2943
16
+ segment_classifier-0.1.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
17
+ segment_classifier-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.2
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any