component-mapper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- component_mapper/__init__.py +4 -0
- component_mapper/cache/__init__.py +0 -0
- component_mapper/cache/mapping_cache.py +72 -0
- component_mapper/config.py +247 -0
- component_mapper/mcp/__init__.py +0 -0
- component_mapper/mcp/official_client.py +182 -0
- component_mapper/mcp/registry_fetcher.py +214 -0
- component_mapper/models.py +159 -0
- component_mapper/pipeline.py +182 -0
- component_mapper/registry/__init__.py +0 -0
- component_mapper/registry/astro_generator.py +390 -0
- component_mapper/registry/custom_registry.py +127 -0
- component_mapper/registry/prop_mapper.py +370 -0
- component_mapper/registry/signature_index.py +694 -0
- component_mapper/stages/__init__.py +0 -0
- component_mapper/stages/astro_stage.py +122 -0
- component_mapper/stages/cache_lookup.py +93 -0
- component_mapper/stages/llm_mapper.py +509 -0
- component_mapper/stages/structural_match.py +145 -0
- component_mapper/utils/__init__.py +0 -0
- component_mapper/utils/similarity.py +69 -0
- component_mapper/utils/source_parser.py +292 -0
- component_mapper-0.1.0.dist-info/METADATA +16 -0
- component_mapper-0.1.0.dist-info/RECORD +25 -0
- component_mapper-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import litellm
|
|
6
|
+
from component_mapper.config import MapperSettings
|
|
7
|
+
from component_mapper.models import (
|
|
8
|
+
MappedComponent,
|
|
9
|
+
MappingStage,
|
|
10
|
+
MappingCacheRecord,
|
|
11
|
+
RankedCandidate,
|
|
12
|
+
PropMapping,
|
|
13
|
+
CustomComponentDefinition,
|
|
14
|
+
InteractivityMode,
|
|
15
|
+
PropDefinition,
|
|
16
|
+
)
|
|
17
|
+
from component_mapper.cache.mapping_cache import MappingCache
|
|
18
|
+
from component_mapper.registry.signature_index import SignatureIndex
|
|
19
|
+
from component_mapper.registry.custom_registry import CustomRegistry
|
|
20
|
+
from component_mapper.registry.prop_mapper import (
|
|
21
|
+
infer_prop_mapping,
|
|
22
|
+
_extract_content_nodes,
|
|
23
|
+
)
|
|
24
|
+
from component_mapper.registry.astro_generator import generate_astro_component
|
|
25
|
+
from segment_classifier.models import ClassifiedSegment, ComponentType
|
|
26
|
+
from segment_classifier.utils.html_normalizer import normalize_segment
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LLMMapperStage:
    """LLM-backed mapping stage for segments that were not resolved earlier.

    ``process`` takes two inputs: *ambiguous* segments, each paired with its
    ranked candidate components, and *novel* segments with no candidates.
    For ambiguous segments the model is asked to pick a candidate and map
    props; for novel segments it is asked to define a new custom component,
    which is then registered in the custom registry and the signature index.

    All model calls go through LiteLLM and are gated by a semaphore sized from
    ``settings.litellm.max_concurrent_batches``. Successful mappings are
    written to the mapping cache in background tasks that ``process`` waits
    for before returning.
    """

    # Prop mappings with a confidence below this value are flagged ambiguous.
    _AMBIGUOUS_BELOW = 0.70
    # At most this many extracted content nodes are included in a prompt.
    _MAX_PROMPT_NODES = 10

    def __init__(
        self,
        settings: MapperSettings,
        index: SignatureIndex,
        custom_registry: CustomRegistry,
        cache: MappingCache,
    ):
        """Store collaborators, initialise counters and configure LiteLLM."""
        self._settings = settings
        self._index = index
        self._custom_registry = custom_registry
        self._cache = cache
        self.calls_made = 0
        self._model_usage: dict[str, int] = {}
        self._sem = asyncio.Semaphore(settings.litellm.max_concurrent_batches)
        self._llm_defaults: dict = {}
        # Strong references to in-flight cache writes. The event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._cache_tasks: set[asyncio.Task] = set()
        self._setup_litellm()

    def _setup_litellm(self) -> None:
        """Apply the API key from the environment and load optional defaults."""
        cfg = self._settings.litellm

        # Universal API key: read once, set on litellm module so all providers can use it
        api_key = os.environ.get(cfg.api_key_env, "")
        if api_key:
            litellm.api_key = api_key
            logger.debug("LiteLLM api_key set from env %s", cfg.api_key_env)
        else:
            logger.debug(
                "Env %s not set — relying on provider-specific env vars",
                cfg.api_key_env,
            )

        # Load config file defaults (JSON, optional)
        if cfg.config_path:
            try:
                self._llm_defaults = litellm.read_config_args(cfg.config_path)
                logger.info(
                    "Loaded LiteLLM config from %s: %s",
                    cfg.config_path,
                    list(self._llm_defaults),
                )
            except Exception as exc:
                # Best effort: a broken config file must not prevent start-up.
                logger.warning(
                    "Failed to load LiteLLM config %s: %s", cfg.config_path, exc
                )

    def get_model_usage(self) -> dict[str, int]:
        """Return a copy of the per-model call counters."""
        return dict(self._model_usage)

    def _select_model(
        self,
        candidates: list[RankedCandidate],
        is_novel: bool,
        max_prop_count: int,
    ) -> str:
        """Pick the routing model for a request.

        Novel segments always use the standard model. Otherwise the complex
        model is used when there are more candidates than
        ``complex_candidate_threshold``, the fast model when both the candidate
        count and the largest prop count are within the fast limits, and the
        standard model in every other case.
        """
        cfg = self._settings.model_routing
        if is_novel:
            return cfg.standard_model
        if len(candidates) > cfg.complex_candidate_threshold:
            return cfg.complex_model
        if (
            len(candidates) <= cfg.fast_max_candidates
            and max_prop_count <= cfg.fast_max_props
        ):
            return cfg.fast_model
        return cfg.standard_model

    def _resolve_model(self, model: str) -> str:
        """Return the model that is actually called for a routed *model*.

        A ``model`` entry in the LiteLLM config file takes priority; the
        routing model is only the fallback.
        """
        return self._llm_defaults.get("model", model)

    async def process(
        self,
        ambiguous: list[tuple[ClassifiedSegment, list[RankedCandidate]]],
        novel: list[ClassifiedSegment],
    ) -> tuple[list[MappedComponent], list[ClassifiedSegment]]:
        """Map ambiguous and novel segments with the LLM.

        Segments are processed in batches of ``settings.litellm.batch_size``;
        within a batch the calls run concurrently. A segment whose call raised,
        was cancelled or produced no mapping is returned as unresolved.

        Returns:
            ``(mapped, still_unresolved)``.
        """
        mapped: list[MappedComponent] = []
        unresolved: list[ClassifiedSegment] = []
        # Guard against a zero/negative batch size, which would make range() fail.
        batch_size = max(1, self._settings.litellm.batch_size)

        # Process ambiguous
        for start in range(0, len(ambiguous), batch_size):
            batch = ambiguous[start : start + batch_size]
            results = await asyncio.gather(
                *(self._process_ambiguous(seg, cands) for seg, cands in batch),
                return_exceptions=True,
            )
            self._collect_results(
                [seg for seg, _ in batch],
                results,
                "LLM mapping failed for %s: %s",
                mapped,
                unresolved,
            )

        # Process novel
        for start in range(0, len(novel), batch_size):
            batch = novel[start : start + batch_size]
            results = await asyncio.gather(
                *(self._process_novel(seg) for seg in batch),
                return_exceptions=True,
            )
            self._collect_results(
                batch,
                results,
                "Novel LLM mapping failed for %s: %s",
                mapped,
                unresolved,
            )

        # Let background cache writes finish so they are not cancelled if the
        # event loop shuts down right after this stage.
        await self._drain_cache_writes()

        logger.info(
            "LLM stage: %d mapped, %d unresolved, %d LLM calls",
            len(mapped),
            len(unresolved),
            self.calls_made,
        )
        return mapped, unresolved

    @staticmethod
    def _collect_results(
        segments: list,
        results: list,
        failure_message: str,
        mapped: list,
        unresolved: list,
    ) -> None:
        """Sort gathered *results* into *mapped* and *unresolved* in place."""
        for seg, result in zip(segments, results):
            # BaseException (not Exception) so that a CancelledError returned
            # by gather() is treated as a failure instead of a mapping.
            if result is None or isinstance(result, BaseException):
                logger.warning(failure_message, seg.segment_id, result)
                unresolved.append(seg)
            else:
                mapped.append(result)

    def _describe_segment(self, seg: ClassifiedSegment) -> tuple[dict, list[dict]]:
        """Build the prompt fields shared by ambiguous and novel requests.

        Returns:
            ``(base, nodes)`` where *base* holds the segment id, component
            type, DOM skeleton, class tokens and sibling count, and *nodes* is
            the trimmed list of content-node summaries.
        """
        normalized = normalize_segment(seg.raw_html, seg.text_content)

        try:
            content_nodes = _extract_content_nodes(seg.raw_html)
        except Exception as exc:
            # Content nodes are optional prompt context; continue without them.
            logger.debug(
                "Content node extraction failed for %s: %s", seg.segment_id, exc
            )
            content_nodes = []

        base = {
            "segment_id": seg.segment_id,
            "component_type": seg.component_type.value,
            "dom_skeleton": normalized.skeleton,
            "class_tokens": normalized.class_tokens,
            "sibling_count": getattr(seg, "sibling_count", 0),
        }
        nodes = [
            {"tag": n["tag"], "selector": n["selector"], "type": n["type"]}
            for n in content_nodes[: self._MAX_PROMPT_NODES]
        ]
        return base, nodes

    async def _process_ambiguous(
        self,
        seg: ClassifiedSegment,
        candidates: list[RankedCandidate],
    ) -> MappedComponent | None:
        """Ask the LLM to choose among *candidates* for *seg*.

        Returns the mapped component, or ``None`` when the call failed or the
        response could not be turned into a mapping.
        """
        base, nodes = self._describe_segment(seg)

        # Find ambiguous prop mappings from best candidate
        ambiguous_mappings = []
        if candidates:
            inferred = infer_prop_mapping(seg.raw_html, candidates[0].signature.props)
            ambiguous_mappings = [
                {
                    "segment_field": m["segment_field"],
                    "candidates": [m["component_prop"]],
                }
                for m in inferred.mappings
                if m.get("ambiguous")
            ]

        max_prop_count = max((len(c.signature.props) for c in candidates), default=0)
        # Resolve up front so the recorded model is the one really called.
        model = self._resolve_model(
            self._select_model(candidates, False, max_prop_count)
        )

        prompt_user = json.dumps(
            {
                **base,
                "candidates": [
                    {
                        "name": c.component_name,
                        "score": c.composite_score,
                        "props": [p.model_dump() for p in c.signature.props],
                        "dom_skeleton": c.signature.dom_skeleton,
                    }
                    for c in candidates
                ],
                "content_nodes": nodes,
                "ambiguous_mappings": ambiguous_mappings,
            },
            indent=2,
        )

        response = await self._call_llm(
            model=model,
            system=(
                "You are a UI component mapper. Given an HTML segment and candidate "
                "Shadcn components, select the best match and provide prop mappings. "
                "Output ONLY valid JSON. No markdown, no explanation."
            ),
            user=prompt_user,
        )

        if response is None:
            return None

        return self._build_mapped_ambiguous(seg, response, candidates, model)

    async def _process_novel(self, seg: ClassifiedSegment) -> MappedComponent | None:
        """Ask the LLM to define a new custom component for *seg*.

        Returns the mapped component, or ``None`` when the call failed or the
        response contained no usable component definition.
        """
        base, nodes = self._describe_segment(seg)

        # Resolve up front so the recorded model is the one really called.
        model = self._resolve_model(self._settings.model_routing.standard_model)

        prompt_user = json.dumps(
            {
                **base,
                "content_nodes": nodes,
                "task": (
                    "Define a new custom component for this segment. "
                    "Return a JSON object with keys: segment_id, custom_component "
                    "(name, dom_skeleton, structural_class_tokens, compatible_component_types, "
                    "props, interactivity, description), prop_mapping, confidence, reasoning."
                ),
            },
            indent=2,
        )

        response = await self._call_llm(
            model=model,
            system=(
                "You are a UI component designer. Given an HTML segment, define a new "
                "custom component and its prop mapping. "
                "Output ONLY valid JSON. No markdown, no explanation."
            ),
            user=prompt_user,
        )

        if response is None:
            return None

        return self._build_mapped_novel(seg, response, model)

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """Remove a surrounding Markdown code fence from *raw*, if present.

        Handles a fence line with a language tag (```` ```json ````), a closing
        fence that shares a line with the payload, and single-line fenced text.
        """
        text = raw.strip()
        if text.startswith("```"):
            first_line, newline, rest = text.partition("\n")
            # Multi-line: drop the whole opening fence line (incl. language tag).
            # Single-line: only the three backticks can be dropped.
            text = rest if newline else first_line[3:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    @staticmethod
    def _as_float(value, default: float) -> float:
        """Convert *value* to ``float``; fall back to *default* if impossible."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @classmethod
    def _normalize_prop_mappings(cls, raw, default_confidence: float) -> list[dict]:
        """Turn the LLM's ``prop_mapping`` payload into mapping dicts.

        Non-list payloads yield an empty list and non-dict entries are skipped.
        A missing or unparsable confidence becomes *default_confidence*.
        """
        if not isinstance(raw, list):
            return []
        rows = []
        for entry in raw:
            if not isinstance(entry, dict):
                continue
            confidence = cls._as_float(entry.get("confidence"), default_confidence)
            rows.append(
                {
                    "segment_field": entry.get("segment_field", ""),
                    "component_prop": entry.get("component_prop", ""),
                    "type": "text",
                    "confidence": confidence,
                    "ambiguous": confidence < cls._AMBIGUOUS_BELOW,
                }
            )
        return rows

    async def _call_llm(self, model: str, system: str, user: str) -> dict | None:
        """Send one chat completion and parse the reply as a JSON object.

        Args:
            model: Routing model; overridden by a ``model`` entry in the
                LiteLLM config defaults.
            system: System prompt.
            user: User prompt (a JSON document).

        Returns:
            The decoded JSON object, or ``None`` on timeout, transport error,
            invalid JSON, or a reply that is not a JSON object.
        """
        # Config file model takes priority; routing model is the fallback
        resolved_model = self._resolve_model(model)
        async with self._sem:
            try:
                self.calls_made += 1
                self._model_usage[resolved_model] = (
                    self._model_usage.get(resolved_model, 0) + 1
                )

                # Config defaults override temperature/max_tokens, but never
                # the resolved model or the messages.
                call_kwargs: dict = {
                    "temperature": 0.1,
                    "max_tokens": 1024,
                    **self._llm_defaults,
                    "model": resolved_model,
                    "messages": [
                        {"role": "system", "content": system},
                        {"role": "user", "content": user},
                    ],
                }

                resp = await asyncio.wait_for(
                    litellm.acompletion(**call_kwargs),
                    timeout=self._settings.litellm.timeout_seconds,
                )

                content = resp.choices[0].message.content or ""
                parsed = json.loads(self._strip_code_fence(content))

            except asyncio.TimeoutError:
                logger.warning("LLM call timed out for model %s", resolved_model)
                return None
            except json.JSONDecodeError as exc:
                logger.warning(
                    "LLM returned invalid JSON from %s: %s", resolved_model, exc
                )
                return None
            except Exception as exc:
                logger.warning("LLM call failed (%s): %s", resolved_model, exc)
                return None

        # Callers use dict.get(); a JSON array or scalar is not a usable reply.
        if not isinstance(parsed, dict):
            logger.warning(
                "LLM returned non-object JSON from %s: %s",
                resolved_model,
                type(parsed).__name__,
            )
            return None
        return parsed

    def _render_astro(self, seg, sig, prop_mapping, component_name: str, kind: str):
        """Generate the Astro component, falling back to a minimal stub."""
        try:
            return generate_astro_component(seg, sig, prop_mapping, component_name)
        except Exception as exc:
            logger.debug("Astro gen failed for %s %s: %s", kind, seg.segment_id, exc)
            # Imported lazily, as in the original code.
            from component_mapper.stages.cache_lookup import _minimal_astro

            return _minimal_astro(component_name)

    def _schedule_cache_write(self, fingerprint_hash, record) -> None:
        """Write *record* to the cache in a tracked background task."""
        task = asyncio.create_task(self._cache.set(fingerprint_hash, record))
        self._cache_tasks.add(task)
        task.add_done_callback(self._on_cache_write_done)

    def _on_cache_write_done(self, task: asyncio.Task) -> None:
        """Forget a finished cache write and log its failure, if any."""
        self._cache_tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.warning("Background cache write failed: %s", exc)

    async def _drain_cache_writes(self) -> None:
        """Wait for all cache writes scheduled so far; never raises for them."""
        pending = tuple(self._cache_tasks)
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)

    def _build_mapped_ambiguous(
        self,
        seg: ClassifiedSegment,
        response: dict,
        candidates: list[RankedCandidate],
        model: str,
    ) -> MappedComponent | None:
        """Build a ``MappedComponent`` from the LLM's candidate selection.

        If the selected component is unknown to the index, the top-ranked
        candidate is used instead. Returns ``None`` when no signature can be
        determined. Must run inside a running event loop because the cache
        write is scheduled as a task.
        """
        selected = response.get("selected_component", "")
        confidence = self._as_float(response.get("confidence"), 0.5)
        reasoning = response.get("reasoning", "")

        # Find signature
        sig = self._index.get_signature(selected)
        if sig is None and candidates:
            sig = candidates[0].signature
            selected = candidates[0].component_name

        if sig is None:
            return None

        # Build PropMapping from LLM response
        mappings = self._normalize_prop_mappings(response.get("prop_mapping"), 0.7)
        prop_mapping = PropMapping(
            mappings=mappings,
            has_ambiguous=any(m["ambiguous"] for m in mappings),
        )

        astro = self._render_astro(seg, sig, prop_mapping, selected, "LLM-mapped")

        # Cache result
        self._schedule_cache_write(
            seg.fingerprint_hash,
            MappingCacheRecord(
                fingerprint_hash=seg.fingerprint_hash,
                component_name=selected,
                registry_source=sig.registry_source,
                prop_mapping=prop_mapping,
                mapping_stage=MappingStage.LLM_MAPPED,
                confidence=confidence,
            ),
        )

        return MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=selected,
            registry_source=sig.registry_source,
            mapping_stage=MappingStage.LLM_MAPPED,
            mapping_confidence=confidence,
            prop_mapping=prop_mapping,
            astro_component=astro,
            llm_model_used=model,
            llm_reasoning=reasoning,
        )

    def _build_mapped_novel(
        self,
        seg: ClassifiedSegment,
        response: dict,
        model: str,
    ) -> MappedComponent | None:
        """Register the LLM-defined custom component and map *seg* to it.

        Returns ``None`` when the response has no usable ``custom_component``
        object. Must run inside a running event loop because the cache write
        is scheduled as a task.
        """
        custom_raw = response.get("custom_component", {})
        if not custom_raw or not isinstance(custom_raw, dict):
            return None

        confidence = self._as_float(response.get("confidence"), 0.6)
        reasoning = response.get("reasoning", "")

        # One name for both the definition and its import path, so the two
        # cannot disagree when the LLM omits the name.
        name = custom_raw.get("name") or f"custom-{seg.segment_id[:8]}"

        # Build props
        props = [
            PropDefinition(
                name=p.get("name", "prop"),
                type=p.get("type", "string"),
                required=p.get("required", False),
                default_value=p.get("default_value"),
            )
            for p in (custom_raw.get("props") or [])
            if isinstance(p, dict)
        ]

        # Parse compatible types
        compat_types = []
        for ct_str in custom_raw.get("compatible_component_types") or []:
            try:
                compat_types.append(ComponentType(ct_str))
            except ValueError:
                logger.debug(
                    "Ignoring unknown component type %r for %s",
                    ct_str,
                    seg.segment_id,
                )

        try:
            interactivity = InteractivityMode(custom_raw.get("interactivity", "static"))
        except ValueError:
            # Unknown mode from the LLM: treat the component as static.
            interactivity = InteractivityMode.STATIC

        defn = CustomComponentDefinition(
            name=name,
            dom_skeleton=custom_raw.get("dom_skeleton", "div"),
            structural_class_tokens=custom_raw.get("structural_class_tokens", []),
            compatible_component_types=compat_types,
            props=props,
            astro_import=f"@/components/custom/{name}.astro",
            interactivity=interactivity,
            description=custom_raw.get("description", ""),
            source="llm_generated",
            confidence=confidence,
        )

        sig = self._custom_registry.register(defn)
        self._index.register_custom(defn)

        # Build PropMapping
        mappings = self._normalize_prop_mappings(response.get("prop_mapping"), 0.6)
        prop_mapping = PropMapping(
            mappings=mappings,
            has_ambiguous=any(m["ambiguous"] for m in mappings),
        )

        astro = self._render_astro(seg, sig, prop_mapping, defn.name, "novel")

        # Cache
        self._schedule_cache_write(
            seg.fingerprint_hash,
            MappingCacheRecord(
                fingerprint_hash=seg.fingerprint_hash,
                component_name=defn.name,
                registry_source=sig.registry_source,
                prop_mapping=prop_mapping,
                mapping_stage=MappingStage.LLM_NOVEL,
                confidence=confidence,
            ),
        )

        return MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=defn.name,
            registry_source=sig.registry_source,
            mapping_stage=MappingStage.LLM_NOVEL,
            mapping_confidence=confidence,
            prop_mapping=prop_mapping,
            astro_component=astro,
            llm_model_used=model,
            llm_reasoning=reasoning,
        )
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from component_mapper.models import (
|
|
4
|
+
MappedComponent,
|
|
5
|
+
MappingStage,
|
|
6
|
+
MappingCacheRecord,
|
|
7
|
+
RankedCandidate,
|
|
8
|
+
)
|
|
9
|
+
from component_mapper.cache.mapping_cache import MappingCache
|
|
10
|
+
from component_mapper.registry.signature_index import SignatureIndex
|
|
11
|
+
from component_mapper.registry.prop_mapper import infer_prop_mapping
|
|
12
|
+
from component_mapper.registry.astro_generator import generate_astro_component
|
|
13
|
+
from segment_classifier.models import ClassifiedSegment
|
|
14
|
+
from segment_classifier.utils.html_normalizer import normalize_segment
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class StructuralMatchStage:
    """Mapping stage that matches segments against the signature index.

    Each segment ends up in one of three buckets: a direct match (mapped and
    cached here), ambiguous (returned with its ranked candidates) or novel
    (no candidate scored high enough).
    """

    def __init__(self, index: SignatureIndex, cache: MappingCache):
        """Store the signature index and the mapping cache."""
        self._index = index
        self._cache = cache

    async def process(
        self,
        segments: list[ClassifiedSegment],
    ) -> tuple[
        list[MappedComponent],
        list[tuple[ClassifiedSegment, list[RankedCandidate]]],
        list[ClassifiedSegment],
    ]:
        """Classify *segments* by structural similarity.

        Candidates for all segments are fetched from the index in one batch
        query keyed by fingerprint hash; the per-segment classification then
        runs concurrently. A segment whose classification raised or was
        cancelled is treated as novel.

        Returns:
            ``(direct_matches, ambiguous_with_candidates, novel)``.
        """
        if not segments:
            return [], [], []

        # NOTE(review): reaches into the index's private settings object.
        cfg = self._index._settings.signature_index

        # Build batch items for index query
        items = [
            (
                seg.component_type,
                normalize_segment(seg.raw_html, seg.text_content),
                seg.fingerprint_hash,
            )
            for seg in segments
        ]
        candidates_map = self._index.batch_get_candidates(items)

        results = await asyncio.gather(
            *(
                self._classify_segment(
                    seg, candidates_map.get(seg.fingerprint_hash, []), cfg
                )
                for seg in segments
            ),
            return_exceptions=True,
        )

        direct: list[MappedComponent] = []
        ambiguous: list[tuple[ClassifiedSegment, list[RankedCandidate]]] = []
        novel: list[ClassifiedSegment] = []

        for seg, result in zip(segments, results):
            # BaseException (not Exception) so that a CancelledError returned
            # by gather() is handled here instead of breaking the unpacking.
            if isinstance(result, BaseException):
                logger.warning(
                    "Structural match error for %s: %s", seg.segment_id, result
                )
                novel.append(seg)
                continue

            outcome, data = result
            if outcome == "direct":
                direct.append(data)
            elif outcome == "ambiguous":
                ambiguous.append(data)
            else:
                novel.append(seg)

        logger.info(
            "Structural match: %d direct, %d ambiguous, %d novel",
            len(direct),
            len(ambiguous),
            len(novel),
        )
        return direct, ambiguous, novel

    async def _classify_segment(
        self,
        seg: ClassifiedSegment,
        candidates: list[RankedCandidate],
        cfg,
    ) -> tuple[str, object]:
        """Decide how a single segment is handled.

        Args:
            seg: The segment to classify.
            candidates: Ranked candidates for the segment; the first entry is
                used as the top candidate.
            cfg: Settings object providing ``direct_match_threshold`` and
                ``candidate_min_threshold``.

        Returns:
            One of ``("direct", MappedComponent)``,
            ``("ambiguous", (seg, candidates))`` or ``("novel", None)``.
        """
        if not candidates:
            return "novel", None

        top = candidates[0]

        if top.composite_score < cfg.direct_match_threshold:
            if top.composite_score >= cfg.candidate_min_threshold:
                return "ambiguous", (seg, candidates)
            return "novel", None

        # Try clean direct match
        sig = top.signature
        prop_mapping = infer_prop_mapping(seg.raw_html, sig.props)

        if prop_mapping.has_ambiguous:
            return "ambiguous", (seg, candidates)

        # Clean match — generate Astro and cache
        try:
            astro = generate_astro_component(seg, sig, prop_mapping, top.component_name)
        except Exception as exc:
            logger.debug(
                "Astro gen failed for direct match %s: %s",
                seg.segment_id,
                exc,
            )
            return "ambiguous", (seg, candidates)

        mapped = MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=top.component_name,
            registry_source=top.registry_source,
            mapping_stage=MappingStage.STRUCTURAL_MATCH,
            mapping_confidence=top.composite_score,
            prop_mapping=prop_mapping,
            astro_component=astro,
        )

        try:
            await self._cache.set(
                seg.fingerprint_hash,
                MappingCacheRecord(
                    fingerprint_hash=seg.fingerprint_hash,
                    component_name=top.component_name,
                    registry_source=top.registry_source,
                    prop_mapping=prop_mapping,
                    mapping_stage=MappingStage.STRUCTURAL_MATCH,
                    confidence=top.composite_score,
                ),
            )
        except Exception as exc:
            # The cache is an optimisation: a failed write must not throw
            # away a mapping that was already computed successfully.
            logger.warning(
                "Cache write failed for direct match %s: %s", seg.segment_id, exc
            )
        return "direct", mapped
|
|
File without changes
|