segment_classifier 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- segment_classifier/__init__.py +4 -0
- segment_classifier/cache/__init__.py +4 -0
- segment_classifier/cache/l1_cache.py +71 -0
- segment_classifier/cache/l2_cache.py +157 -0
- segment_classifier/config.py +53 -0
- segment_classifier/models.py +142 -0
- segment_classifier/pipeline.py +173 -0
- segment_classifier/stages/__init__.py +6 -0
- segment_classifier/stages/fingerprint.py +10 -0
- segment_classifier/stages/fuzzy_cluster.py +101 -0
- segment_classifier/stages/llm_classifier.py +271 -0
- segment_classifier/stages/rule_based.py +287 -0
- segment_classifier/utils/__init__.py +3 -0
- segment_classifier/utils/html_normalizer.py +165 -0
- segment_classifier-0.1.0.dist-info/METADATA +95 -0
- segment_classifier-0.1.0.dist-info/RECORD +17 -0
- segment_classifier-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
2
|
+
from segment_classifier.cache.l2_cache import L2FuzzyCache
|
|
3
|
+
from segment_classifier.config import CacheConfig
|
|
4
|
+
from segment_classifier.models import (
|
|
5
|
+
InputSegment, ClassifiedSegment, ClassificationStage, ComponentType
|
|
6
|
+
)
|
|
7
|
+
from segment_classifier.utils.html_normalizer import NormalizedSegment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FuzzyClusterStage:
    """L2 classification stage.

    Looks up a segment's structural fingerprint in an :class:`L2FuzzyCache`
    of previously-classified clusters using cosine-style nearest-neighbour
    search over fixed-dimension TF-IDF vectors. A hit short-circuits the
    (expensive) LLM stage; a miss returns ``None`` so the pipeline falls
    through.
    """

    def __init__(self, cache: L2FuzzyCache, config: CacheConfig):
        """
        Args:
            cache: cluster store exposing ``find_nearest``, ``add_to_cluster``
                and ``create_cluster``.
            config: provides ``l2_similarity_threshold`` used for matching.
        """
        self.cache = cache
        self.config = config

        # A HashingVectorizer is stateless: character n-grams are hashed into
        # a fixed 512-dim space, so vectors are reproducible across processes
        # and restarts without persisting a fitted vocabulary. The
        # TfidfTransformer stacked on top supplies tf-idf scaling and L2
        # normalization. Imported locally so sklearn stays a soft dependency
        # of this stage only.
        from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
        from sklearn.pipeline import Pipeline
        self.vectorizer = Pipeline([
            ('hash', HashingVectorizer(
                analyzer='char',
                ngram_range=(2, 4),
                n_features=512,
                lowercase=True,
                norm=None  # Let TfidfTransformer handle normalization
            )),
            ('tfidf', TfidfTransformer())
        ])
        # Fit once on a single dummy document so transform() is usable
        # immediately. NOTE(review): idf learned from one document is trivial
        # (uniform after smoothing), so this is effectively tf-only scaling
        # plus normalization — confirm that is the intended behaviour.
        self.vectorizer.fit(["dummy"])
        self._is_fitted = True

    def _build_fingerprint_string(self, normalized: NormalizedSegment) -> str:
        """Concatenate the structural features that define a segment's identity."""
        return f"{normalized.skeleton} {normalized.attrs_fingerprint} {' '.join(normalized.class_tokens)}"

    def _vectorize(self, fingerprint_string: str) -> list[float]:
        """Map a fingerprint string to a dense 512-dim TF-IDF vector."""
        matrix = self.vectorizer.transform([fingerprint_string])
        return matrix.toarray()[0].tolist()

    async def classify(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
        fingerprint_hash: str,
    ) -> ClassifiedSegment | None:
        """Attempt an L2 fuzzy-cache classification.

        1. Build fingerprint string
        2. Vectorize
        3. find_nearest in L2Cache
        4. Return ClassifiedSegment (stage=L2_FUZZY_CACHE) or None

        The matched cluster's confidence is reduced by a flat 0.05 penalty
        (floored at 0.0) because a fuzzy match is weaker evidence than an
        exact one.
        """
        fingerprint_string = self._build_fingerprint_string(normalized)
        vector = self._vectorize(fingerprint_string)

        nearest = await self.cache.find_nearest(vector, self.config.l2_similarity_threshold)
        if nearest:
            # Penalty of 0.05 for fuzzy
            confidence = max(0.0, nearest.confidence - 0.05)

            return ClassifiedSegment(
                segment_id=segment.segment_id,
                page_url=segment.page_url,
                page_slug=segment.page_slug,
                raw_html=segment.raw_html,
                text_content=segment.text_content,
                position_hint=segment.position_hint,
                component_type=nearest.component_type,
                classification_stage=ClassificationStage.L2_FUZZY_CACHE,
                confidence=confidence,
                fingerprint_hash=fingerprint_hash,
                cluster_id=nearest.cluster_id
            )

        return None

    async def register(
        self,
        fingerprint_hash: str,
        normalized: NormalizedSegment,
        component_type: ComponentType,
        confidence: float,
    ) -> None:
        """
        Called after LLM resolves a segment to register it in L2 for future lookup.

        Joins an existing cluster when one is within the similarity threshold,
        otherwise seeds a new cluster with the LLM's verdict.
        """
        fingerprint_string = self._build_fingerprint_string(normalized)
        vector = self._vectorize(fingerprint_string)

        nearest = await self.cache.find_nearest(vector, self.config.l2_similarity_threshold)
        if nearest:
            await self.cache.add_to_cluster(nearest.cluster_id, fingerprint_hash, vector)
        else:
            await self.cache.create_cluster(fingerprint_hash, vector, component_type, confidence)
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import litellm
|
|
5
|
+
from typing import Any
|
|
6
|
+
from segment_classifier.models import (
|
|
7
|
+
InputSegment, ClassifiedSegment, LLMClassificationRequest,
|
|
8
|
+
LLMClassificationResult, ClassificationStage, ComponentType
|
|
9
|
+
)
|
|
10
|
+
from segment_classifier.utils.html_normalizer import NormalizedSegment
|
|
11
|
+
from segment_classifier.config import ClassifierSettings, ModelFeatureConfig
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LLMBatchClassifier:
    """Final classification stage.

    Batches unresolved segments to an LLM through LiteLLM, routing each
    segment to a model tier based on structural complexity, running
    sub-batches concurrently under a semaphore, and always returning one
    :class:`ClassifiedSegment` per input (UNKNOWN on any failure).
    """

    def __init__(self, settings: ClassifierSettings):
        self.settings = settings
        # Bounds the number of in-flight LiteLLM calls across all sub-batches.
        self._semaphore = asyncio.Semaphore(
            settings.litellm_max_concurrent_batches
        )
        # Per-model count of attempted calls (failures included).
        self._model_usage: dict[str, int] = {}

        # Set api key globally or per call, litellm supports both
        if settings.litellm_api_key:
            litellm.api_key = settings.litellm_api_key

    def select_model(
        self,
        normalized: NormalizedSegment,
        segment: InputSegment,
    ) -> str:
        """Feature-based model routing.

        Routes to the high-complexity model for deep/varied DOM trees or
        singleton segments (sibling_count == 0), the fast model for shallow
        simple ones, and the standard model otherwise.
        """
        cfg = self.settings.model_routing

        # 1. High complexity: deep or tag-diverse markup, or a segment with
        #    no repeated siblings (no repetition signal to lean on).
        if (normalized.dom_depth > cfg.high_complexity_dom_depth_threshold or
                normalized.unique_tag_count > cfg.high_complexity_unique_tag_threshold or
                segment.sibling_count == 0):
            return cfg.high_complexity_model

        # 2. Fast: shallow markup with very few distinct tags.
        elif (normalized.dom_depth <= cfg.fast_model_max_dom_depth and
                normalized.unique_tag_count <= 3):
            return cfg.fast_model

        # 3. Standard
        else:
            return cfg.standard_model

    def _build_request(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
        fingerprint_hash: str,
    ) -> LLMClassificationRequest:
        """Construct LLMClassificationRequest from segment + normalized data."""
        return LLMClassificationRequest(
            segment_id=segment.segment_id,
            fingerprint_hash=fingerprint_hash,
            normalized_html=normalized.skeleton,
            position_hint=segment.position_hint,
            sibling_count=segment.sibling_count,
            url_hints=segment.url_path_segments,
            dom_depth=normalized.dom_depth,
            child_tag_counts=normalized.child_tag_counts,
            text_density_ratio=normalized.text_density_ratio
        )

    async def _call_litellm(
        self,
        model: str,
        requests: list[LLMClassificationRequest],
    ) -> list[LLMClassificationResult]:
        """
        Make one LiteLLM acompletion call for a batch.
        Parse response. Return list of results.
        On error: return UNKNOWN for all items.

        Guarantees exactly one result per request: items the LLM dropped or
        that fail validation are back-filled with UNKNOWN/0.0 results.
        """
        prompt = """You are an expert UI component classifier. Given a list of HTML segment descriptors, classify each into exactly one component type.

Available component types:
"""
        prompt += ", ".join([c.value for c in ComponentType]) + "\n\n"
        prompt += """For each segment respond with valid JSON array only (no markdown, no explanation):
[
  {
    "segment_id": "...",
    "component_type": "...",
    "confidence": 0.0-1.0,
    "reasoning": "one sentence"
  }
]

Rules:
- Use normalized_html structure only, ignore content values
- sibling_count >= 3 strongly suggests a collection item
- position_hint=top/bottom suggests layout components
- url_hints provide page context
- Respond ONLY with the JSON array. No preamble."""

        user_content = f"Classify these {len(requests)} segments:\n"
        user_content += json.dumps([r.model_dump() for r in requests], indent=2)

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_content}
        ]

        try:
            response = await litellm.acompletion(
                model=model,
                messages=messages,
                timeout=self.settings.litellm_timeout_seconds,
            )

            # Record usage
            self._model_usage[model] = self._model_usage.get(model, 0) + 1

            raw_response = response.choices[0].message.content
            # Strip markdown code fences some models wrap JSON in despite
            # instructions.
            raw_response = raw_response.strip()
            if raw_response.startswith("```json"):
                raw_response = raw_response[7:]
            elif raw_response.startswith("```"):
                raw_response = raw_response[3:]
            if raw_response.endswith("```"):
                raw_response = raw_response[:-3]

            parsed = json.loads(raw_response.strip())

            results = []
            for item in parsed:
                try:
                    results.append(LLMClassificationResult.model_validate(item))
                except Exception as e:
                    # Keep the batch alive: one malformed item degrades to
                    # UNKNOWN rather than failing the whole call.
                    logger.warning(f"Error validating LLM response item: {e}")
                    results.append(LLMClassificationResult(
                        segment_id=item.get("segment_id", ""),
                        component_type=ComponentType.UNKNOWN,
                        confidence=0.0,
                        reasoning=f"Parse error: {e}"
                    ))

            # Ensure all segments are accounted for
            parsed_ids = {r.segment_id for r in results}
            for req in requests:
                if req.segment_id not in parsed_ids:
                    results.append(LLMClassificationResult(
                        segment_id=req.segment_id,
                        component_type=ComponentType.UNKNOWN,
                        confidence=0.0,
                        reasoning="Missing from LLM response"
                    ))

            return results

        except Exception as e:
            logger.warning(f"LLM call failed: {e}")
            self._model_usage[model] = self._model_usage.get(model, 0) + 1  # Still count as call
            return [
                LLMClassificationResult(
                    segment_id=r.segment_id,
                    component_type=ComponentType.UNKNOWN,
                    confidence=0.0,
                    reasoning=f"LLM Error: {e}"
                )
                for r in requests
            ]

    async def classify_batch(
        self,
        items: list[tuple[InputSegment, NormalizedSegment, str]],
    ) -> list[ClassifiedSegment]:
        """
        1. Group items by selected model
        2. Split each group into sub-batches of litellm_batch_size
        3. asyncio.gather all sub-batches under semaphore
        4. Return flat list of ClassifiedSegments, in the same order as
           ``items``, one result per input.
        """
        # Group by model
        grouped: dict[str, list[tuple[InputSegment, NormalizedSegment, str]]] = {}
        for item in items:
            segment, normalized, _ = item
            model = self.select_model(normalized, segment)
            grouped.setdefault(model, []).append(item)

        all_classified: list[ClassifiedSegment] = []

        async def process_subbatch(model_name: str, subbatch: list[tuple[InputSegment, NormalizedSegment, str]]):
            # Appends to all_classified; safe under asyncio's single-threaded
            # scheduling (list.append is atomic and no await splits the loop).
            async with self._semaphore:
                requests = [self._build_request(s, n, h) for s, n, h in subbatch]
                results = await self._call_litellm(model_name, requests)

                # Map results back
                result_map = {r.segment_id: r for r in results}

                for segment, normalized, fp_hash in subbatch:
                    res = result_map.get(segment.segment_id)
                    if res:
                        all_classified.append(
                            ClassifiedSegment(
                                segment_id=segment.segment_id,
                                page_url=segment.page_url,
                                page_slug=segment.page_slug,
                                raw_html=segment.raw_html,
                                text_content=segment.text_content,
                                position_hint=segment.position_hint,
                                component_type=res.component_type,
                                classification_stage=ClassificationStage.LLM,
                                confidence=res.confidence,
                                fingerprint_hash=fp_hash,
                                llm_model_used=model_name,
                                llm_raw_response=res.reasoning  # Hack: put reasoning here
                            )
                        )
                    else:
                        all_classified.append(
                            ClassifiedSegment(
                                segment_id=segment.segment_id,
                                page_url=segment.page_url,
                                page_slug=segment.page_slug,
                                raw_html=segment.raw_html,
                                text_content=segment.text_content,
                                position_hint=segment.position_hint,
                                component_type=ComponentType.UNKNOWN,
                                classification_stage=ClassificationStage.LLM,
                                confidence=0.0,
                                fingerprint_hash=fp_hash,
                                llm_model_used=model_name
                            )
                        )

        tasks = []
        for model_name, group_items in grouped.items():
            # Split into sub-batches
            for i in range(0, len(group_items), self.settings.litellm_batch_size):
                subbatch = group_items[i:i + self.settings.litellm_batch_size]
                tasks.append(process_subbatch(model_name, subbatch))

        await asyncio.gather(*tasks)

        # Ensure the returned list is in the exact order as the input items
        result_map = {res.segment_id: res for res in all_classified}
        ordered_classified = []
        for segment, _, fp_hash in items:
            if segment.segment_id in result_map:
                ordered_classified.append(result_map[segment.segment_id])
            else:
                # Fallback, though process_subbatch should populate it.
                # Fix: carry the real fingerprint hash from the input tuple
                # instead of dropping it and emitting "".
                ordered_classified.append(
                    ClassifiedSegment(
                        segment_id=segment.segment_id,
                        page_url=segment.page_url,
                        page_slug=segment.page_slug,
                        raw_html=segment.raw_html,
                        text_content=segment.text_content,
                        position_hint=segment.position_hint,
                        component_type=ComponentType.UNKNOWN,
                        classification_stage=ClassificationStage.LLM,
                        confidence=0.0,
                        fingerprint_hash=fp_hash
                    )
                )
        return ordered_classified

    @property
    def model_usage(self) -> dict[str, int]:
        """Copy of the per-model attempted-call counters."""
        return dict(self._model_usage)
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Callable
|
|
3
|
+
from segment_classifier.models import (
|
|
4
|
+
InputSegment, ComponentType, ClassificationStage, ClassifiedSegment, SegmentPosition
|
|
5
|
+
)
|
|
6
|
+
from segment_classifier.utils.html_normalizer import NormalizedSegment
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class ClassificationRule:
    """A single deterministic classification rule.

    Rules are evaluated in descending ``priority`` order by
    ``RuleBasedClassifier``; the first matching rule above the classifier's
    confidence threshold wins.
    """
    # Human-readable identifier for the rule (used for debugging/telemetry).
    name: str
    # Predicate over (segment, normalized) deciding whether the rule fires.
    condition: Callable[[InputSegment, NormalizedSegment], bool]
    # Component type assigned when the rule fires.
    component_type: ComponentType
    # Static confidence assigned to a match (compared against the
    # classifier's confidence_threshold).
    confidence: float
    # Evaluation order: higher priority rules are tried first.
    priority: int = 50
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RuleBasedClassifier:
    """
    Apply ordered classification rules to a segment.
    Rules evaluated highest priority first.
    Returns ClassifiedSegment with stage=RULE_BASED or None if no match.
    """

    def __init__(self, confidence_threshold: float = 0.90):
        """
        Args:
            confidence_threshold: a matching rule only produces a result when
                its static confidence is at or above this value; lower-
                confidence matches are skipped (lower-priority rules may
                still fire).
        """
        self.confidence_threshold = confidence_threshold
        self.rules: list[ClassificationRule] = self._build_rules()

    def _build_rules(self) -> list[ClassificationRule]:
        """
        Instantiate the full rule set: position, tag, sibling-repetition,
        URL-hint, class-token, text-density and media rules.
        Sort by priority descending before returning.
        """
        rules = [
            # Position rules
            ClassificationRule(
                name="top_header_nav",
                condition=lambda s, n: s.position_hint == SegmentPosition.TOP and n.root_tag in {"header", "nav"},
                component_type=ComponentType.LAYOUT_HEADER,
                confidence=0.97,
                priority=100
            ),
            ClassificationRule(
                name="bottom_footer",
                condition=lambda s, n: s.position_hint == SegmentPosition.BOTTOM and n.root_tag == "footer",
                component_type=ComponentType.LAYOUT_FOOTER,
                confidence=0.97,
                priority=99
            ),
            ClassificationRule(
                name="top_has_nav",
                condition=lambda s, n: s.position_hint == SegmentPosition.TOP and n.child_tag_counts.get("nav", 0) > 0,
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.93,
                priority=98
            ),

            # Tag-based rules
            ClassificationRule(
                name="root_footer",
                condition=lambda s, n: n.root_tag == "footer",
                component_type=ComponentType.LAYOUT_FOOTER,
                confidence=0.99,
                priority=90
            ),
            ClassificationRule(
                name="root_header",
                condition=lambda s, n: n.root_tag == "header",
                component_type=ComponentType.LAYOUT_HEADER,
                confidence=0.99,
                priority=90
            ),
            ClassificationRule(
                name="nav_element",
                condition=lambda s, n: n.root_tag == "nav" or (n.child_tag_counts.get("nav", 0) > 0 and n.child_tag_counts.get("a", 0) > 3),
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.95,
                priority=85
            ),
            ClassificationRule(
                name="form_element",
                condition=lambda s, n: n.root_tag == "form" or n.child_tag_counts.get("input", 0) > 1,
                component_type=ComponentType.UI_FORM,
                confidence=0.93,
                priority=84
            ),
            ClassificationRule(
                name="table_element",
                condition=lambda s, n: n.child_tag_counts.get("table", 0) > 0,
                component_type=ComponentType.UI_TABLE,
                confidence=0.90,
                priority=83
            ),
            ClassificationRule(
                name="modal_element",
                condition=lambda s, n: n.child_tag_counts.get("dialog", 0) > 0 or "modal" in n.class_tokens,
                component_type=ComponentType.UI_MODAL,
                confidence=0.91,
                priority=82
            ),

            # Sibling repetition rules
            ClassificationRule(
                name="product_card_repetition",
                condition=lambda s, n: s.sibling_count >= 3 and "card" in n.class_tokens,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.88,
                priority=70
            ),
            ClassificationRule(
                name="product_card_img_header",
                condition=lambda s, n: s.sibling_count >= 3 and n.child_tag_counts.get("img", 0) > 0 and (n.child_tag_counts.get("h2", 0) > 0 or n.child_tag_counts.get("h3", 0) > 0),
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.85,
                priority=69
            ),
            ClassificationRule(
                name="blog_card_repetition",
                condition=lambda s, n: s.sibling_count >= 3 and "article" in n.class_tokens,
                component_type=ComponentType.COLLECTION_BLOG_CARD,
                confidence=0.87,
                priority=68
            ),
            ClassificationRule(
                name="nav_list_repetition",
                condition=lambda s, n: s.sibling_count >= 5 and n.child_tag_counts.get("li", 0) > 4,
                component_type=ComponentType.LAYOUT_NAV,
                confidence=0.86,
                priority=67
            ),

            # URL hint rules
            ClassificationRule(
                name="url_product_card",
                condition=lambda s, n: any(x in s.url_path_segments for x in ["product", "shop", "store"]) and s.sibling_count >= 2,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.84,
                priority=60
            ),
            ClassificationRule(
                name="url_blog_card",
                condition=lambda s, n: any(x in s.url_path_segments for x in ["blog", "post", "article"]),
                component_type=ComponentType.COLLECTION_BLOG_CARD,
                confidence=0.83,
                priority=59
            ),
            ClassificationRule(
                name="url_news_item",
                condition=lambda s, n: "news" in s.url_path_segments,
                component_type=ComponentType.COLLECTION_NEWS_ITEM,
                confidence=0.83,
                priority=58
            ),

            # Class token rules
            ClassificationRule(
                name="class_product_price",
                condition=lambda s, n: "price" in n.class_tokens and n.child_tag_counts.get("img", 0) > 0,
                component_type=ComponentType.COLLECTION_PRODUCT_CARD,
                confidence=0.89,
                priority=50
            ),
            ClassificationRule(
                name="class_hero",
                condition=lambda s, n: "hero" in n.class_tokens and n.dom_depth <= 4,
                component_type=ComponentType.SECTION_HERO,
                confidence=0.91,
                priority=49
            ),
            ClassificationRule(
                name="class_testimonial",
                condition=lambda s, n: "testimonial" in n.class_tokens,
                component_type=ComponentType.SECTION_TESTIMONIAL,
                confidence=0.90,
                priority=48
            ),
            ClassificationRule(
                name="class_faq",
                condition=lambda s, n: "faq" in n.class_tokens,
                component_type=ComponentType.SECTION_FAQ,
                confidence=0.90,
                priority=47
            ),
            ClassificationRule(
                name="class_pricing",
                condition=lambda s, n: "pricing" in n.class_tokens,
                component_type=ComponentType.SECTION_PRICING,
                confidence=0.91,
                priority=46
            ),
            ClassificationRule(
                name="class_breadcrumb",
                condition=lambda s, n: "breadcrumb" in n.class_tokens or "breadcrumb" in n.attrs_fingerprint,
                component_type=ComponentType.LAYOUT_BREADCRUMB,
                confidence=0.95,
                priority=45
            ),
            ClassificationRule(
                name="class_carousel",
                condition=lambda s, n: "carousel" in n.class_tokens or n.child_tag_counts.get("swiper-slide", 0) > 0,
                component_type=ComponentType.UI_CAROUSEL,
                confidence=0.89,
                priority=44
            ),
            ClassificationRule(
                name="class_pagination",
                condition=lambda s, n: "pagination" in n.class_tokens,
                component_type=ComponentType.UI_PAGINATION,
                confidence=0.94,
                priority=43
            ),
            ClassificationRule(
                name="class_search",
                condition=lambda s, n: "search" in n.class_tokens and n.child_tag_counts.get("input", 0) > 0,
                component_type=ComponentType.UI_SEARCH,
                confidence=0.93,
                priority=42
            ),
            ClassificationRule(
                name="class_cta",
                condition=lambda s, n: "cta" in n.class_tokens,
                component_type=ComponentType.SECTION_CTA,
                confidence=0.88,
                priority=41
            ),

            # Text density rules
            ClassificationRule(
                name="text_article",
                condition=lambda s, n: n.text_density_ratio > 0.6 and n.root_tag in {"article", "main"},
                component_type=ComponentType.CONTENT_ARTICLE,
                confidence=0.85,
                priority=30
            ),
            ClassificationRule(
                name="text_rich_text",
                condition=lambda s, n: n.text_density_ratio > 0.5 and n.dom_depth <= 3,
                component_type=ComponentType.CONTENT_RICH_TEXT,
                confidence=0.80,
                priority=29
            ),

            # Media rules
            ClassificationRule(
                name="media_img",
                condition=lambda s, n: n.child_tag_counts.get("img", 0) > 3 and n.text_density_ratio < 0.1,
                component_type=ComponentType.CONTENT_MEDIA,
                confidence=0.87,
                priority=20
            ),
            ClassificationRule(
                name="media_video",
                condition=lambda s, n: n.child_tag_counts.get("video", 0) > 0,
                component_type=ComponentType.CONTENT_MEDIA,
                confidence=0.92,
                priority=19
            ),
        ]

        rules.sort(key=lambda r: r.priority, reverse=True)
        return rules

    def classify(
        self,
        segment: InputSegment,
        normalized: NormalizedSegment,
    ) -> ClassifiedSegment | None:
        """
        Try each rule in priority order. Return first match above
        confidence_threshold. Return None if no rule fires.
        """
        for rule in self.rules:
            # Check the static confidence gate first: it is free, and it
            # avoids evaluating conditions of rules that could never produce
            # a result at the current threshold. Conditions are pure reads,
            # so skipping them does not change behaviour.
            if rule.confidence < self.confidence_threshold:
                continue
            if rule.condition(segment, normalized):
                return ClassifiedSegment(
                    segment_id=segment.segment_id,
                    page_url=segment.page_url,
                    page_slug=segment.page_slug,
                    raw_html=segment.raw_html,
                    text_content=segment.text_content,
                    position_hint=segment.position_hint,
                    component_type=rule.component_type,
                    classification_stage=ClassificationStage.RULE_BASED,
                    confidence=rule.confidence,
                    fingerprint_hash=normalized.fingerprint_hash()
                )
        return None
|