segment_classifier 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from segment_classifier.cache.l2_cache import L2FuzzyCache
3
+ from segment_classifier.config import CacheConfig
4
+ from segment_classifier.models import (
5
+ InputSegment, ClassifiedSegment, ClassificationStage, ComponentType
6
+ )
7
+ from segment_classifier.utils.html_normalizer import NormalizedSegment
8
+
9
+
10
class FuzzyClusterStage:
    """Stage-2 classifier: fuzzy structural matching against the L2 cache.

    A segment's normalized structure (skeleton + attribute fingerprint +
    class tokens) is vectorized into a fixed 512-dimension char-ngram
    space, and the L2 cache is queried for the nearest existing cluster
    above the configured similarity threshold.
    """

    # Confidence penalty applied to fuzzy (non-exact) cache hits, so a
    # fuzzy match always ranks below the exact-match stage's confidence.
    _FUZZY_PENALTY = 0.05

    def __init__(self, cache: L2FuzzyCache, config: CacheConfig):
        self.cache = cache
        self.config = config

        # A plain TfidfVectorizer is stateful: its vocabulary and idf
        # weights depend on the corpus it was fitted on, which would make
        # vectors produced in different runs incomparable.  A stateless
        # HashingVectorizer with a fixed n_features keeps cached vectors
        # valid across process restarts.
        #
        # NOTE: a previous revision chained a TfidfTransformer fitted on
        # the single document "dummy"; that produced degenerate idf
        # weights (hash buckets colliding with "dummy"'s char n-grams were
        # scaled 1.0, all others ln(2)+1 — an arbitrary, input-dependent
        # bias).  We keep the transformer purely for l2 normalization and
        # disable idf so output is stable term frequency.
        from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
        from sklearn.pipeline import Pipeline
        self.vectorizer = Pipeline([
            ('hash', HashingVectorizer(
                analyzer='char',
                ngram_range=(2, 4),
                n_features=512,
                lowercase=True,
                norm=None,  # normalization handled by the tfidf step
            )),
            ('tfidf', TfidfTransformer(use_idf=False)),
        ])
        # fit() is required by the sklearn Pipeline API before transform();
        # with use_idf=False nothing is learned, so any document works.
        self.vectorizer.fit(["dummy"])
        self._is_fitted = True

    def _build_fingerprint_string(self, normalized: NormalizedSegment) -> str:
        """Concatenate the structural features into one vectorizable string."""
        return f"{normalized.skeleton} {normalized.attrs_fingerprint} {' '.join(normalized.class_tokens)}"

    def _vectorize(self, fingerprint_string: str) -> list[float]:
        """Transform a fingerprint string into a dense 512-float vector."""
        matrix = self.vectorizer.transform([fingerprint_string])
        return matrix.toarray()[0].tolist()

    async def classify(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
        fingerprint_hash: str,
    ) -> ClassifiedSegment | None:
        """Attempt a fuzzy-cache classification for one segment.

        1. Build the fingerprint string from the normalized structure.
        2. Vectorize it.
        3. Look up the nearest cluster in the L2 cache.
        4. Return a ClassifiedSegment (stage=L2_FUZZY_CACHE) on a hit,
           or None so the caller can fall through to the next stage.
        """
        fingerprint_string = self._build_fingerprint_string(normalized)
        vector = self._vectorize(fingerprint_string)

        nearest = await self.cache.find_nearest(vector, self.config.l2_similarity_threshold)
        if nearest is None:
            return None

        # Penalize fuzzy hits relative to the cluster's stored confidence.
        confidence = max(0.0, nearest.confidence - self._FUZZY_PENALTY)

        return ClassifiedSegment(
            segment_id=segment.segment_id,
            page_url=segment.page_url,
            page_slug=segment.page_slug,
            raw_html=segment.raw_html,
            text_content=segment.text_content,
            position_hint=segment.position_hint,
            component_type=nearest.component_type,
            classification_stage=ClassificationStage.L2_FUZZY_CACHE,
            confidence=confidence,
            fingerprint_hash=fingerprint_hash,
            cluster_id=nearest.cluster_id
        )

    async def register(
        self,
        fingerprint_hash: str,
        normalized: NormalizedSegment,
        component_type: ComponentType,
        confidence: float,
    ) -> None:
        """Register an LLM-resolved segment in L2 for future fuzzy lookups.

        If a sufficiently similar cluster already exists, the fingerprint
        joins it; otherwise a new cluster is created with the given
        classification.
        """
        fingerprint_string = self._build_fingerprint_string(normalized)
        vector = self._vectorize(fingerprint_string)

        nearest = await self.cache.find_nearest(vector, self.config.l2_similarity_threshold)
        if nearest:
            await self.cache.add_to_cluster(nearest.cluster_id, fingerprint_hash, vector)
        else:
            await self.cache.create_cluster(fingerprint_hash, vector, component_type, confidence)
@@ -0,0 +1,271 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import litellm
5
+ from typing import Any
6
+ from segment_classifier.models import (
7
+ InputSegment, ClassifiedSegment, LLMClassificationRequest,
8
+ LLMClassificationResult, ClassificationStage, ComponentType
9
+ )
10
+ from segment_classifier.utils.html_normalizer import NormalizedSegment
11
+ from segment_classifier.config import ClassifierSettings, ModelFeatureConfig
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class LLMBatchClassifier:
    """Final classification stage: batched LLM calls via LiteLLM.

    Segments are routed to a model based on structural complexity,
    grouped by model, split into sub-batches, and classified
    concurrently under a semaphore that caps in-flight requests.
    Every segment always receives a result; failures degrade to
    ComponentType.UNKNOWN with confidence 0.0 rather than raising.
    """

    def __init__(self, settings: ClassifierSettings):
        self.settings = settings
        # Caps the number of concurrent LiteLLM batch calls.
        self._semaphore = asyncio.Semaphore(
            settings.litellm_max_concurrent_batches
        )
        # Per-model call counter (failed calls are counted too);
        # exposed read-only via the `model_usage` property.
        self._model_usage: dict[str, int] = {}

        # Set api key globally or per call, litellm supports both
        if settings.litellm_api_key:
            litellm.api_key = settings.litellm_api_key

    def select_model(
        self,
        normalized: NormalizedSegment,
        segment: InputSegment,
    ) -> str:
        """Feature-based model routing.

        Picks one of three configured models by structural complexity:
        deep/varied DOM goes to the high-complexity model, shallow and
        tag-poor DOM to the fast model, everything else to standard.
        """
        cfg = self.settings.model_routing

        # 1. High complexity
        # NOTE(review): sibling_count == 0 (a unique, non-repeated
        # segment) is routed to the high-complexity model — confirm this
        # is intentional and not an inverted condition.
        if (normalized.dom_depth > cfg.high_complexity_dom_depth_threshold or
                normalized.unique_tag_count > cfg.high_complexity_unique_tag_threshold or
                segment.sibling_count == 0):
            return cfg.high_complexity_model

        # 2. Fast
        # NOTE(review): the tag-count bound 3 is hard-coded here while the
        # depth bound comes from config — consider moving it to config too.
        elif (normalized.dom_depth <= cfg.fast_model_max_dom_depth and
                normalized.unique_tag_count <= 3):
            return cfg.fast_model

        # 3. Standard
        else:
            return cfg.standard_model

    def _build_request(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
        fingerprint_hash: str,
    ) -> LLMClassificationRequest:
        """Construct LLMClassificationRequest from segment + normalized data."""
        return LLMClassificationRequest(
            segment_id=segment.segment_id,
            fingerprint_hash=fingerprint_hash,
            normalized_html=normalized.skeleton,
            position_hint=segment.position_hint,
            sibling_count=segment.sibling_count,
            url_hints=segment.url_path_segments,
            dom_depth=normalized.dom_depth,
            child_tag_counts=normalized.child_tag_counts,
            text_density_ratio=normalized.text_density_ratio
        )

    async def _call_litellm(
        self,
        model: str,
        requests: list[LLMClassificationRequest],
    ) -> list[LLMClassificationResult]:
        """
        Make one LiteLLM acompletion call for a batch.
        Parse response. Return list of results.
        On error: return UNKNOWN for all items.

        Guarantees exactly one result per request: items the LLM omits or
        that fail validation are filled in as UNKNOWN/0.0.
        """
        prompt = """You are an expert UI component classifier. Given a list of HTML segment descriptors, classify each into exactly one component type.

Available component types:
"""
        # Enumerate the closed set of valid labels for the model.
        prompt += ", ".join([c.value for c in ComponentType]) + "\n\n"
        prompt += """For each segment respond with valid JSON array only (no markdown, no explanation):
[
  {
    "segment_id": "...",
    "component_type": "...",
    "confidence": 0.0-1.0,
    "reasoning": "one sentence"
  }
]

Rules:
- Use normalized_html structure only, ignore content values
- sibling_count >= 3 strongly suggests a collection item
- position_hint=top/bottom suggests layout components
- url_hints provide page context
- Respond ONLY with the JSON array. No preamble."""

        user_content = f"Classify these {len(requests)} segments:\n"
        user_content += json.dumps([r.model_dump() for r in requests], indent=2)

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_content}
        ]

        try:
            response = await litellm.acompletion(
                model=model,
                messages=messages,
                timeout=self.settings.litellm_timeout_seconds,
            )

            # Record usage
            self._model_usage[model] = self._model_usage.get(model, 0) + 1

            raw_response = response.choices[0].message.content
            # Strip markdown code fences some models wrap JSON in,
            # despite the "no markdown" instruction.
            raw_response = raw_response.strip()
            if raw_response.startswith("```json"):
                raw_response = raw_response[7:]
            elif raw_response.startswith("```"):
                raw_response = raw_response[3:]
            if raw_response.endswith("```"):
                raw_response = raw_response[:-3]

            parsed = json.loads(raw_response.strip())

            results = []
            for item in parsed:
                try:
                    results.append(LLMClassificationResult.model_validate(item))
                except Exception as e:
                    # Item-level validation failure degrades just that
                    # item to UNKNOWN instead of discarding the batch.
                    logger.warning(f"Error validating LLM response item: {e}")
                    results.append(LLMClassificationResult(
                        segment_id=item.get("segment_id", ""),
                        component_type=ComponentType.UNKNOWN,
                        confidence=0.0,
                        reasoning=f"Parse error: {e}"
                    ))

            # Ensure all segments are accounted for
            parsed_ids = {r.segment_id for r in results}
            for req in requests:
                if req.segment_id not in parsed_ids:
                    results.append(LLMClassificationResult(
                        segment_id=req.segment_id,
                        component_type=ComponentType.UNKNOWN,
                        confidence=0.0,
                        reasoning="Missing from LLM response"
                    ))

            return results

        except Exception as e:
            # Any call/parse failure maps the whole sub-batch to UNKNOWN
            # so the pipeline never stalls on a single bad request.
            logger.warning(f"LLM call failed: {e}")
            self._model_usage[model] = self._model_usage.get(model, 0) + 1  # Still count as call
            return [
                LLMClassificationResult(
                    segment_id=r.segment_id,
                    component_type=ComponentType.UNKNOWN,
                    confidence=0.0,
                    reasoning=f"LLM Error: {e}"
                )
                for r in requests
            ]

    async def classify_batch(
        self,
        items: list[tuple[InputSegment, NormalizedSegment, str]],
    ) -> list[ClassifiedSegment]:
        """
        1. Group items by selected model
        2. Split each group into sub-batches of litellm_batch_size
        3. asyncio.gather all sub-batches under semaphore
        4. Return flat list of ClassifiedSegments

        The returned list preserves the order of `items`.  Appends to
        `all_classified` happen from multiple coroutines, which is safe
        here because list.append is atomic and the gather runs on a
        single event loop.
        """
        # Group by model
        grouped: dict[str, list[tuple[InputSegment, NormalizedSegment, str]]] = {}
        for item in items:
            segment, normalized, _ = item
            model = self.select_model(normalized, segment)
            if model not in grouped:
                grouped[model] = []
            grouped[model].append(item)

        all_classified: list[ClassifiedSegment] = []

        async def process_subbatch(model_name: str, subbatch: list[tuple[InputSegment, NormalizedSegment, str]]):
            # One LiteLLM call per sub-batch; the semaphore bounds the
            # number of simultaneous calls across all models.
            async with self._semaphore:
                requests = [self._build_request(s, n, h) for s, n, h in subbatch]
                results = await self._call_litellm(model_name, requests)

                # Map results back
                result_map = {r.segment_id: r for r in results}

                for segment, normalized, fp_hash in subbatch:
                    res = result_map.get(segment.segment_id)
                    if res:
                        all_classified.append(
                            ClassifiedSegment(
                                segment_id=segment.segment_id,
                                page_url=segment.page_url,
                                page_slug=segment.page_slug,
                                raw_html=segment.raw_html,
                                text_content=segment.text_content,
                                position_hint=segment.position_hint,
                                component_type=res.component_type,
                                classification_stage=ClassificationStage.LLM,
                                confidence=res.confidence,
                                fingerprint_hash=fp_hash,
                                llm_model_used=model_name,
                                llm_raw_response=res.reasoning  # Hack: put reasoning here
                            )
                        )
                    else:
                        # Defensive: _call_litellm back-fills missing ids,
                        # so this branch should be unreachable in practice.
                        all_classified.append(
                            ClassifiedSegment(
                                segment_id=segment.segment_id,
                                page_url=segment.page_url,
                                page_slug=segment.page_slug,
                                raw_html=segment.raw_html,
                                text_content=segment.text_content,
                                position_hint=segment.position_hint,
                                component_type=ComponentType.UNKNOWN,
                                classification_stage=ClassificationStage.LLM,
                                confidence=0.0,
                                fingerprint_hash=fp_hash,
                                llm_model_used=model_name
                            )
                        )

        tasks = []
        for model_name, group_items in grouped.items():
            # Split into sub-batches
            for i in range(0, len(group_items), self.settings.litellm_batch_size):
                subbatch = group_items[i:i + self.settings.litellm_batch_size]
                tasks.append(process_subbatch(model_name, subbatch))

        await asyncio.gather(*tasks)

        # Ensure the returned list is in the exact order as the input items
        # NOTE(review): keying by segment_id assumes ids are unique within
        # one batch; duplicate ids would collapse to a single result.
        result_map = {res.segment_id: res for res in all_classified}
        ordered_classified = []
        for segment, _, _ in items:
            if segment.segment_id in result_map:
                ordered_classified.append(result_map[segment.segment_id])
            else:
                # Fallback, though process_subbatch should populate it
                ordered_classified.append(
                    ClassifiedSegment(
                        segment_id=segment.segment_id,
                        page_url=segment.page_url,
                        page_slug=segment.page_slug,
                        raw_html=segment.raw_html,
                        text_content=segment.text_content,
                        position_hint=segment.position_hint,
                        component_type=ComponentType.UNKNOWN,
                        classification_stage=ClassificationStage.LLM,
                        confidence=0.0,
                        fingerprint_hash=""
                    )
                )
        return ordered_classified

    @property
    def model_usage(self) -> dict[str, int]:
        # Return a copy so callers cannot mutate internal counters.
        return dict(self._model_usage)
@@ -0,0 +1,287 @@
1
+ from dataclasses import dataclass
2
+ from typing import Callable
3
+ from segment_classifier.models import (
4
+ InputSegment, ComponentType, ClassificationStage, ClassifiedSegment, SegmentPosition
5
+ )
6
+ from segment_classifier.utils.html_normalizer import NormalizedSegment
7
+
8
+
9
@dataclass
class ClassificationRule:
    """A single declarative classification rule.

    A rule pairs a predicate over (segment, normalized structure) with
    the component type and confidence to assign when it fires.
    """

    # Human-readable identifier, useful for debugging/telemetry.
    name: str
    # Predicate deciding whether this rule applies to a segment.
    condition: Callable[[InputSegment, NormalizedSegment], bool]
    # Component type assigned when the rule fires.
    component_type: ComponentType
    # Confidence attached to the resulting classification (0.0-1.0).
    confidence: float
    # Rules are evaluated highest priority first.
    priority: int = 50
16
+
17
+
18
class RuleBasedClassifier:
    """
    Apply ordered classification rules to a segment.
    Rules evaluated highest priority first.
    Returns ClassifiedSegment with stage=RULE_BASED or None if no match.

    NOTE: with the default threshold of 0.90, rules with lower confidence
    (e.g. the sibling-repetition and URL-hint groups) can never produce a
    result; they only become active if the classifier is constructed with
    a lower threshold.
    """

    def __init__(self, confidence_threshold: float = 0.90):
        # Minimum rule confidence required to emit a classification.
        self.confidence_threshold = confidence_threshold
        self.rules: list[ClassificationRule] = self._build_rules()

    def _build_rules(self) -> list[ClassificationRule]:
        """
        Instantiate all rules described above.
        Sort by priority descending before returning.
        """
        rules = [
            # Position rules
            ClassificationRule(
                name="top_header_nav",
                condition=lambda s, n: s.position_hint == SegmentPosition.TOP and n.root_tag in {"header", "nav"},
                component_type=ComponentType.LAYOUT_HEADER,
                confidence=0.97,
                priority=100
            ),
            ClassificationRule(
                name="bottom_footer",
                condition=lambda s, n: s.position_hint == SegmentPosition.BOTTOM and n.root_tag in {"footer"},
                component_type=ComponentType.LAYOUT_FOOTER,
                confidence=0.97,
                priority=99
            ),
            ClassificationRule(
                name="top_has_nav",
                condition=lambda s, n: s.position_hint == SegmentPosition.TOP and n.child_tag_counts.get("nav", 0) > 0,
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.93,
                priority=98
            ),

            # Tag-based rules
            ClassificationRule(
                name="root_footer",
                condition=lambda s, n: n.root_tag == "footer",
                component_type=ComponentType.LAYOUT_FOOTER,
                confidence=0.99,
                priority=90
            ),
            ClassificationRule(
                name="root_header",
                condition=lambda s, n: n.root_tag == "header",
                component_type=ComponentType.LAYOUT_HEADER,
                confidence=0.99,
                priority=90
            ),
            ClassificationRule(
                name="nav_element",
                condition=lambda s, n: n.root_tag == "nav" or (n.child_tag_counts.get("nav", 0) > 0 and n.child_tag_counts.get("a", 0) > 3),
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.95,
                priority=85
            ),
            ClassificationRule(
                name="form_element",
                condition=lambda s, n: n.root_tag == "form" or n.child_tag_counts.get("input", 0) > 1,
                component_type=ComponentType.UI_FORM,
                confidence=0.93,
                priority=84
            ),
            ClassificationRule(
                name="table_element",
                condition=lambda s, n: n.child_tag_counts.get("table", 0) > 0,
                component_type=ComponentType.UI_TABLE,
                confidence=0.90,
                priority=83
            ),
            ClassificationRule(
                name="modal_element",
                condition=lambda s, n: n.child_tag_counts.get("dialog", 0) > 0 or "modal" in n.class_tokens,
                component_type=ComponentType.UI_MODAL,
                confidence=0.91,
                priority=82
            ),

            # Sibling repetition rules
            ClassificationRule(
                name="product_card_repetition",
                condition=lambda s, n: s.sibling_count >= 3 and "card" in n.class_tokens,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.88,
                priority=70
            ),
            ClassificationRule(
                name="product_card_img_header",
                condition=lambda s, n: s.sibling_count >= 3 and n.child_tag_counts.get("img", 0) > 0 and (n.child_tag_counts.get("h2", 0) > 0 or n.child_tag_counts.get("h3", 0) > 0),
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.85,
                priority=69
            ),
            ClassificationRule(
                name="blog_card_repetition",
                condition=lambda s, n: s.sibling_count >= 3 and "article" in n.class_tokens,
                component_type=ComponentType.COLLECTION_BLOG_CARD,
                confidence=0.87,
                priority=68
            ),
            ClassificationRule(
                name="nav_list_repetition",
                condition=lambda s, n: s.sibling_count >= 5 and n.child_tag_counts.get("li", 0) > 4,
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.86,
                priority=67
            ),

            # URL hint rules
            ClassificationRule(
                name="url_product_card",
                condition=lambda s, n: any(x in s.url_path_segments for x in ["product", "shop", "store"]) and s.sibling_count >= 2,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.84,
                priority=60
            ),
            ClassificationRule(
                name="url_blog_card",
                condition=lambda s, n: any(x in s.url_path_segments for x in ["blog", "post", "article"]),
                component_type=ComponentType.COLLECTION_BLOG_CARD,
                confidence=0.83,
                priority=59
            ),
            ClassificationRule(
                name="url_news_item",
                condition=lambda s, n: "news" in s.url_path_segments,
                component_type=ComponentType.COLLECTION_NEWS_ITEM,
                confidence=0.83,
                priority=58
            ),

            # Class token rules
            ClassificationRule(
                name="class_product_price",
                condition=lambda s, n: "price" in n.class_tokens and n.child_tag_counts.get("img", 0) > 0,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.89,
                priority=50
            ),
            ClassificationRule(
                name="class_hero",
                condition=lambda s, n: "hero" in n.class_tokens and n.dom_depth <= 4,
                component_type=ComponentType.SECTION_HERO,
                confidence=0.91,
                priority=49
            ),
            ClassificationRule(
                name="class_testimonial",
                condition=lambda s, n: "testimonial" in n.class_tokens,
                component_type=ComponentType.SECTION_TESTIMONIAL,
                confidence=0.90,
                priority=48
            ),
            ClassificationRule(
                name="class_faq",
                condition=lambda s, n: "faq" in n.class_tokens,
                component_type=ComponentType.SECTION_FAQ,
                confidence=0.90,
                priority=47
            ),
            ClassificationRule(
                name="class_pricing",
                condition=lambda s, n: "pricing" in n.class_tokens,
                component_type=ComponentType.SECTION_PRICING,
                confidence=0.91,
                priority=46
            ),
            ClassificationRule(
                name="class_breadcrumb",
                condition=lambda s, n: "breadcrumb" in n.class_tokens or "breadcrumb" in n.attrs_fingerprint,
                component_type=ComponentType.LAYOUT_BREADCRUMB,
                confidence=0.95,
                priority=45
            ),
            ClassificationRule(
                name="class_carousel",
                condition=lambda s, n: "carousel" in n.class_tokens or n.child_tag_counts.get("swiper-slide", 0) > 0,
                component_type=ComponentType.UI_CAROUSEL,
                confidence=0.89,
                priority=44
            ),
            ClassificationRule(
                name="class_pagination",
                condition=lambda s, n: "pagination" in n.class_tokens,
                component_type=ComponentType.UI_PAGINATION,
                confidence=0.94,
                priority=43
            ),
            ClassificationRule(
                name="class_search",
                condition=lambda s, n: "search" in n.class_tokens and n.child_tag_counts.get("input", 0) > 0,
                component_type=ComponentType.UI_SEARCH,
                confidence=0.93,
                priority=42
            ),
            ClassificationRule(
                name="class_cta",
                condition=lambda s, n: "cta" in n.class_tokens,
                component_type=ComponentType.SECTION_CTA,
                confidence=0.88,
                priority=41
            ),

            # Text density rules
            ClassificationRule(
                name="text_article",
                condition=lambda s, n: n.text_density_ratio > 0.6 and n.root_tag in {"article", "main"},
                component_type=ComponentType.CONTENT_ARTICLE,
                confidence=0.85,
                priority=30
            ),
            ClassificationRule(
                name="text_rich_text",
                condition=lambda s, n: n.text_density_ratio > 0.5 and n.dom_depth <= 3,
                component_type=ComponentType.CONTENT_RICH_TEXT,
                confidence=0.80,
                priority=29
            ),

            # Media rules
            ClassificationRule(
                name="media_img",
                condition=lambda s, n: n.child_tag_counts.get("img", 0) > 3 and n.text_density_ratio < 0.1,
                component_type=ComponentType.CONTENT_MEDIA,
                confidence=0.87,
                priority=20
            ),
            ClassificationRule(
                name="media_video",
                condition=lambda s, n: n.child_tag_counts.get("video", 0) > 0,
                component_type=ComponentType.CONTENT_MEDIA,
                confidence=0.92,
                priority=19
            ),
        ]

        rules.sort(key=lambda r: r.priority, reverse=True)
        return rules

    def classify(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
    ) -> ClassifiedSegment | None:
        """
        Try each rule in order. Return first match above confidence_threshold.
        Return None if no rule fires.

        The confidence check runs *before* the rule's predicate: a rule
        below the threshold can never produce a result, so evaluating its
        condition would be wasted work.  Since conditions are pure, this
        is behaviorally identical to checking after the match.
        """
        for rule in self.rules:
            # Cheap check first: skip rules that could never win.
            if rule.confidence < self.confidence_threshold:
                continue
            if rule.condition(segment, normalized):
                return ClassifiedSegment(
                    segment_id=segment.segment_id,
                    page_url=segment.page_url,
                    page_slug=segment.page_slug,
                    raw_html=segment.raw_html,
                    text_content=segment.text_content,
                    position_hint=segment.position_hint,
                    component_type=rule.component_type,
                    classification_stage=ClassificationStage.RULE_BASED,
                    confidence=rule.confidence,
                    fingerprint_hash=normalized.fingerprint_hash()
                )
        return None
@@ -0,0 +1,3 @@
1
+ from .html_normalizer import normalize_segment, NormalizedSegment
2
+
3
+ __all__ = ["normalize_segment", "NormalizedSegment"]