component-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,509 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import litellm
6
+ from component_mapper.config import MapperSettings
7
+ from component_mapper.models import (
8
+ MappedComponent,
9
+ MappingStage,
10
+ MappingCacheRecord,
11
+ RankedCandidate,
12
+ PropMapping,
13
+ CustomComponentDefinition,
14
+ InteractivityMode,
15
+ PropDefinition,
16
+ )
17
+ from component_mapper.cache.mapping_cache import MappingCache
18
+ from component_mapper.registry.signature_index import SignatureIndex
19
+ from component_mapper.registry.custom_registry import CustomRegistry
20
+ from component_mapper.registry.prop_mapper import (
21
+ infer_prop_mapping,
22
+ _extract_content_nodes,
23
+ )
24
+ from component_mapper.registry.astro_generator import generate_astro_component
25
+ from segment_classifier.models import ClassifiedSegment, ComponentType
26
+ from segment_classifier.utils.html_normalizer import normalize_segment
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class LLMMapperStage:
    """Mapping stage that asks an LLM (through LiteLLM) to resolve segments.

    Two kinds of input are handled:

    * *ambiguous* segments, which arrive with ranked candidate components; the
      LLM is asked to pick one and to supply prop mappings.
    * *novel* segments, for which the LLM is asked to define a new custom
      component. That definition is registered in the custom registry and in
      the signature index.

    Successful results are written to the mapping cache by background tasks.
    ``calls_made`` counts every attempted LLM call, including failed ones.
    """

    # Prop mappings whose confidence is below this value are flagged ambiguous.
    _AMBIGUOUS_BELOW = 0.70
    # At most this many extracted content nodes are included in a prompt.
    _MAX_PROMPT_NODES = 10

    def __init__(
        self,
        settings: MapperSettings,
        index: SignatureIndex,
        custom_registry: CustomRegistry,
        cache: MappingCache,
    ):
        """Store collaborators, create the concurrency limiter, configure LiteLLM."""
        self._settings = settings
        self._index = index
        self._custom_registry = custom_registry
        self._cache = cache
        self.calls_made = 0
        self._model_usage: dict[str, int] = {}
        self._sem = asyncio.Semaphore(settings.litellm.max_concurrent_batches)
        self._llm_defaults: dict = {}
        # Strong references to in-flight cache writes. The event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._background_tasks: set[asyncio.Task] = set()
        self._setup_litellm()

    def _setup_litellm(self) -> None:
        """Apply the API key and the optional config-file defaults for LiteLLM."""
        cfg = self._settings.litellm

        # Universal API key: read once, set on litellm module so all providers can use it
        api_key = os.environ.get(cfg.api_key_env, "")
        if api_key:
            litellm.api_key = api_key
            logger.debug("LiteLLM api_key set from env %s", cfg.api_key_env)
        else:
            logger.debug(
                "Env %s not set — relying on provider-specific env vars",
                cfg.api_key_env,
            )

        # Load config file defaults (JSON, optional)
        if not cfg.config_path:
            return
        try:
            loaded = litellm.read_config_args(cfg.config_path)
            # Copy into a plain dict so an empty/None result can never leave
            # ``_llm_defaults`` in a state that breaks ``.get`` / ``**`` later.
            self._llm_defaults = dict(loaded or {})
        except Exception as exc:
            # Best effort: a broken config file must not prevent start-up.
            logger.warning(
                "Failed to load LiteLLM config %s: %s", cfg.config_path, exc
            )
            return
        logger.info(
            "Loaded LiteLLM config from %s: %s",
            cfg.config_path,
            list(self._llm_defaults),
        )

    def get_model_usage(self) -> dict[str, int]:
        """Return a copy of the per-model call counters."""
        return dict(self._model_usage)

    def _select_model(
        self,
        candidates: list[RankedCandidate],
        is_novel: bool,
        max_prop_count: int,
    ) -> str:
        """Pick the routing model for a request.

        Novel segments always use the standard model. Otherwise the complex
        model is used when there are more candidates than
        ``complex_candidate_threshold``, the fast model when both the candidate
        count and the largest prop count are within the ``fast_*`` limits, and
        the standard model in every remaining case.
        """
        cfg = self._settings.model_routing
        if is_novel:
            return cfg.standard_model
        if len(candidates) > cfg.complex_candidate_threshold:
            return cfg.complex_model
        if (
            len(candidates) <= cfg.fast_max_candidates
            and max_prop_count <= cfg.fast_max_props
        ):
            return cfg.fast_model
        return cfg.standard_model

    def _resolve_model(self, model: str) -> str:
        """Return the model actually sent to LiteLLM.

        Config file model takes priority; routing model is the fallback.
        """
        return self._llm_defaults.get("model", model)

    async def process(
        self,
        ambiguous: list[tuple[ClassifiedSegment, list[RankedCandidate]]],
        novel: list[ClassifiedSegment],
    ) -> tuple[list[MappedComponent], list[ClassifiedSegment]]:
        """Returns (mapped, still_unresolved).

        Ambiguous segments are processed first, then novel ones. Each group is
        split into batches of ``settings.litellm.batch_size``; batches run one
        after another and the items of a batch run concurrently.
        """
        mapped: list[MappedComponent] = []
        unresolved: list[ClassifiedSegment] = []

        await self._run_in_batches(
            self._process_ambiguous,
            ambiguous,
            "LLM mapping failed for %s: %s",
            mapped,
            unresolved,
        )
        await self._run_in_batches(
            self._process_novel,
            [(seg,) for seg in novel],
            "Novel LLM mapping failed for %s: %s",
            mapped,
            unresolved,
        )

        logger.info(
            "LLM stage: %d mapped, %d unresolved, %d LLM calls",
            len(mapped),
            len(unresolved),
            self.calls_made,
        )
        return mapped, unresolved

    async def _run_in_batches(
        self,
        worker,
        jobs: list[tuple],
        failure_message: str,
        mapped: list[MappedComponent],
        unresolved: list[ClassifiedSegment],
    ) -> None:
        """Run ``worker(*job)`` for every job, batch by batch.

        The first element of each job tuple must be the segment. Results are
        appended to ``mapped``; segments whose worker returned ``None`` or
        failed are logged with ``failure_message`` and appended to
        ``unresolved``.
        """
        if not jobs:
            return
        # Guard against a zero/negative setting, which would make range() fail.
        batch_size = max(1, self._settings.litellm.batch_size)
        for start in range(0, len(jobs), batch_size):
            batch = jobs[start : start + batch_size]
            results = await asyncio.gather(
                *(worker(*job) for job in batch),
                return_exceptions=True,
            )
            for job, result in zip(batch, results):
                seg = job[0]
                # BaseException (not Exception): with return_exceptions=True a
                # cancelled child shows up as a CancelledError result.
                if result is None or isinstance(result, BaseException):
                    logger.warning(failure_message, seg.segment_id, result)
                    unresolved.append(seg)
                else:
                    mapped.append(result)

    def _segment_prompt_fields(
        self, seg: ClassifiedSegment
    ) -> tuple[dict, list[dict]]:
        """Build the prompt fields shared by the ambiguous and novel prompts.

        Returns the leading fields (segment id, component type, normalized DOM
        skeleton, class tokens, sibling count) and the trimmed list of content
        nodes. They are returned separately because the two prompts place
        other keys between them.
        """
        normalized = normalize_segment(seg.raw_html, seg.text_content)
        try:
            content_nodes = _extract_content_nodes(seg.raw_html)
        except Exception as exc:
            # Content nodes are optional context; continue without them.
            logger.debug(
                "Content node extraction failed for %s: %s", seg.segment_id, exc
            )
            content_nodes = []

        leading = {
            "segment_id": seg.segment_id,
            "component_type": seg.component_type.value,
            "dom_skeleton": normalized.skeleton,
            "class_tokens": normalized.class_tokens,
            "sibling_count": getattr(seg, "sibling_count", 0),
        }
        nodes = [
            {"tag": n["tag"], "selector": n["selector"], "type": n["type"]}
            for n in content_nodes[: self._MAX_PROMPT_NODES]
        ]
        return leading, nodes

    async def _process_ambiguous(
        self,
        seg: ClassifiedSegment,
        candidates: list[RankedCandidate],
    ) -> MappedComponent | None:
        """Ask the LLM to choose among ``candidates`` for ``seg``.

        Returns ``None`` when the LLM call fails or no signature can be
        determined for the answer.
        """
        leading, nodes = self._segment_prompt_fields(seg)

        # Find ambiguous prop mappings from best candidate
        ambiguous_mappings = []
        if candidates:
            best_sig = candidates[0].signature
            pm = infer_prop_mapping(seg.raw_html, best_sig.props)
            ambiguous_mappings = [
                {
                    "segment_field": m["segment_field"],
                    "candidates": [m["component_prop"]],
                }
                for m in pm.mappings
                if m.get("ambiguous")
            ]

        max_prop_count = max((len(c.signature.props) for c in candidates), default=0)
        model = self._select_model(candidates, False, max_prop_count)

        prompt_user = json.dumps(
            {
                **leading,
                "candidates": [
                    {
                        "name": c.component_name,
                        "score": c.composite_score,
                        "props": [p.model_dump() for p in c.signature.props],
                        "dom_skeleton": c.signature.dom_skeleton,
                    }
                    for c in candidates
                ],
                "content_nodes": nodes,
                "ambiguous_mappings": ambiguous_mappings,
            },
            indent=2,
        )

        response = await self._call_llm(
            model=model,
            system=(
                "You are a UI component mapper. Given an HTML segment and candidate "
                "Shadcn components, select the best match and provide prop mappings. "
                "Output ONLY valid JSON. No markdown, no explanation."
            ),
            user=prompt_user,
        )
        if response is None:
            return None

        # Record the model that was really called (config file may override).
        return self._build_mapped_ambiguous(
            seg, response, candidates, self._resolve_model(model)
        )

    async def _process_novel(self, seg: ClassifiedSegment) -> MappedComponent | None:
        """Ask the LLM to define a new custom component for ``seg``.

        Returns ``None`` when the LLM call fails or the answer carries no
        usable ``custom_component`` object.
        """
        leading, nodes = self._segment_prompt_fields(seg)
        model = self._settings.model_routing.standard_model

        prompt_user = json.dumps(
            {
                **leading,
                "content_nodes": nodes,
                "task": (
                    "Define a new custom component for this segment. "
                    "Return a JSON object with keys: segment_id, custom_component "
                    "(name, dom_skeleton, structural_class_tokens, compatible_component_types, "
                    "props, interactivity, description), prop_mapping, confidence, reasoning."
                ),
            },
            indent=2,
        )

        response = await self._call_llm(
            model=model,
            system=(
                "You are a UI component designer. Given an HTML segment, define a new "
                "custom component and its prop mapping. "
                "Output ONLY valid JSON. No markdown, no explanation."
            ),
            user=prompt_user,
        )
        if response is None:
            return None

        return self._build_mapped_novel(seg, response, self._resolve_model(model))

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Remove a surrounding Markdown code fence from an LLM answer.

        Handles the usual multi-line form (an opening fence line with an
        optional language tag, and a closing fence) as well as answers where
        the fence shares a line with the JSON itself.
        """
        text = content.strip()
        if text.startswith("```"):
            header, newline, remainder = text.partition("\n")
            # Multi-line: the whole first line is fence + optional language tag.
            # Single-line: only the backticks (and a "json" tag) can be dropped.
            text = remainder if newline else header[3:].removeprefix("json")
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    async def _call_llm(self, model: str, system: str, user: str) -> dict | None:
        """Send one chat completion and return the parsed JSON object.

        Concurrency is bounded by the stage semaphore and the call is limited
        to ``settings.litellm.timeout_seconds``. Returns ``None`` on timeout,
        on invalid JSON, on a JSON value that is not an object, or on any
        other failure; every such case is logged as a warning.
        """
        resolved_model = self._resolve_model(model)
        async with self._sem:
            try:
                self.calls_made += 1
                self._model_usage[resolved_model] = (
                    self._model_usage.get(resolved_model, 0) + 1
                )

                # Built-in defaults < config file defaults < per-call values.
                call_kwargs: dict = {
                    "temperature": 0.1,
                    "max_tokens": 1024,
                    **self._llm_defaults,
                    "model": resolved_model,
                    "messages": [
                        {"role": "system", "content": system},
                        {"role": "user", "content": user},
                    ],
                }

                resp = await asyncio.wait_for(
                    litellm.acompletion(**call_kwargs),
                    timeout=self._settings.litellm.timeout_seconds,
                )

                content = self._strip_code_fence(
                    resp.choices[0].message.content or ""
                )
                parsed = json.loads(content)

            except asyncio.TimeoutError:
                logger.warning("LLM call timed out for model %s", resolved_model)
                return None
            except json.JSONDecodeError as exc:
                logger.warning(
                    "LLM returned invalid JSON from %s: %s", resolved_model, exc
                )
                return None
            except Exception as exc:
                logger.warning("LLM call failed (%s): %s", resolved_model, exc)
                return None

        if not isinstance(parsed, dict):
            # Callers use ``.get``; a JSON list/string/number is unusable.
            logger.warning(
                "LLM returned invalid JSON from %s: %s",
                resolved_model,
                f"expected an object, got {type(parsed).__name__}",
            )
            return None
        return parsed

    @staticmethod
    def _coerce_confidence(value, default: float) -> float:
        """Convert an LLM-supplied confidence to ``float``, else ``default``."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _parse_prop_mappings(raw, default_confidence: float) -> list[dict]:
        """Normalize the ``prop_mapping`` list of an LLM answer.

        Entries that are not JSON objects are skipped. Missing or non-numeric
        confidences fall back to ``default_confidence``. Every mapping is
        typed ``"text"`` and flagged ambiguous when its confidence is below
        the class threshold.
        """
        if not isinstance(raw, list):
            return []
        parsed = []
        for entry in raw:
            if not isinstance(entry, dict):
                continue
            confidence = LLMMapperStage._coerce_confidence(
                entry.get("confidence"), default_confidence
            )
            parsed.append(
                {
                    "segment_field": entry.get("segment_field", ""),
                    "component_prop": entry.get("component_prop", ""),
                    "type": "text",
                    "confidence": confidence,
                    "ambiguous": confidence < LLMMapperStage._AMBIGUOUS_BELOW,
                }
            )
        return parsed

    def _build_prop_mapping(self, raw, default_confidence: float) -> PropMapping:
        """Build a ``PropMapping`` from the raw LLM ``prop_mapping`` value."""
        mappings = self._parse_prop_mappings(raw, default_confidence)
        return PropMapping(
            mappings=mappings,
            has_ambiguous=any(m["ambiguous"] for m in mappings),
        )

    def _schedule_cache_write(
        self, fingerprint_hash, record: MappingCacheRecord
    ) -> None:
        """Write ``record`` to the mapping cache without blocking the caller."""
        task = asyncio.create_task(self._cache.set(fingerprint_hash, record))
        self._background_tasks.add(task)
        task.add_done_callback(self._on_cache_write_done)

    def _on_cache_write_done(self, task: asyncio.Task) -> None:
        """Release the finished cache-write task and report its failure, if any."""
        self._background_tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.warning("Mapping cache write failed: %s", exc)

    def _build_mapped_ambiguous(
        self,
        seg: ClassifiedSegment,
        response: dict,
        candidates: list[RankedCandidate],
        model: str,
    ) -> MappedComponent | None:
        """Turn the LLM answer for an ambiguous segment into a ``MappedComponent``.

        When the selected component is unknown to the index, the top candidate
        is used instead. Returns ``None`` if no signature is available at all.
        """
        selected = response.get("selected_component", "")
        confidence = self._coerce_confidence(response.get("confidence"), 0.5)
        reasoning = response.get("reasoning", "")

        # Find signature
        sig = self._index.get_signature(selected)
        if sig is None and candidates:
            sig = candidates[0].signature
            selected = candidates[0].component_name
        if sig is None:
            return None

        # Build PropMapping from LLM response
        prop_mapping = self._build_prop_mapping(response.get("prop_mapping"), 0.7)

        try:
            astro = generate_astro_component(seg, sig, prop_mapping, selected)
        except Exception as exc:
            logger.debug("Astro gen failed for LLM-mapped %s: %s", seg.segment_id, exc)
            # Imported lazily, as in the original code.
            from component_mapper.stages.cache_lookup import _minimal_astro

            astro = _minimal_astro(selected)

        # Cache result
        self._schedule_cache_write(
            seg.fingerprint_hash,
            MappingCacheRecord(
                fingerprint_hash=seg.fingerprint_hash,
                component_name=selected,
                registry_source=sig.registry_source,
                prop_mapping=prop_mapping,
                mapping_stage=MappingStage.LLM_MAPPED,
                confidence=confidence,
            ),
        )

        return MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=selected,
            registry_source=sig.registry_source,
            mapping_stage=MappingStage.LLM_MAPPED,
            mapping_confidence=confidence,
            prop_mapping=prop_mapping,
            astro_component=astro,
            llm_model_used=model,
            llm_reasoning=reasoning,
        )

    def _build_mapped_novel(
        self,
        seg: ClassifiedSegment,
        response: dict,
        model: str,
    ) -> MappedComponent | None:
        """Register the LLM-defined custom component and map ``seg`` to it.

        Returns ``None`` when the answer has no ``custom_component`` object.
        """
        custom_raw = response.get("custom_component", {})
        if not custom_raw or not isinstance(custom_raw, dict):
            return None

        confidence = self._coerce_confidence(response.get("confidence"), 0.6)
        reasoning = response.get("reasoning", "")

        # One name for both the definition and its import path, so the two
        # cannot disagree when the LLM omits (or blanks) the name.
        component_name = custom_raw.get("name") or f"custom-{seg.segment_id[:8]}"

        # Build props
        props = [
            PropDefinition(
                name=p.get("name", "prop"),
                type=p.get("type", "string"),
                required=p.get("required", False),
                default_value=p.get("default_value"),
            )
            for p in custom_raw.get("props") or []
            if isinstance(p, dict)
        ]

        # Parse compatible types, ignoring values the enum does not know.
        compat_types = []
        for ct_str in custom_raw.get("compatible_component_types") or []:
            try:
                compat_types.append(ComponentType(ct_str))
            except ValueError:
                logger.debug(
                    "Ignoring unknown component type %r for %s",
                    ct_str,
                    seg.segment_id,
                )

        try:
            interactivity = InteractivityMode(custom_raw.get("interactivity", "static"))
        except ValueError:
            interactivity = InteractivityMode.STATIC

        defn = CustomComponentDefinition(
            name=component_name,
            dom_skeleton=custom_raw.get("dom_skeleton", "div"),
            structural_class_tokens=custom_raw.get("structural_class_tokens", []),
            compatible_component_types=compat_types,
            props=props,
            astro_import=f"@/components/custom/{component_name}.astro",
            interactivity=interactivity,
            description=custom_raw.get("description", ""),
            source="llm_generated",
            confidence=confidence,
        )

        # Build the PropMapping before registering, so a malformed answer
        # cannot fail after the registries have already been modified.
        prop_mapping = self._build_prop_mapping(response.get("prop_mapping"), 0.6)

        sig = self._custom_registry.register(defn)
        self._index.register_custom(defn)

        try:
            astro = generate_astro_component(seg, sig, prop_mapping, defn.name)
        except Exception as exc:
            logger.debug("Astro gen failed for novel %s: %s", seg.segment_id, exc)
            # Imported lazily, as in the original code.
            from component_mapper.stages.cache_lookup import _minimal_astro

            astro = _minimal_astro(defn.name)

        # Cache
        self._schedule_cache_write(
            seg.fingerprint_hash,
            MappingCacheRecord(
                fingerprint_hash=seg.fingerprint_hash,
                component_name=defn.name,
                registry_source=sig.registry_source,
                prop_mapping=prop_mapping,
                mapping_stage=MappingStage.LLM_NOVEL,
                confidence=confidence,
            ),
        )

        return MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=defn.name,
            registry_source=sig.registry_source,
            mapping_stage=MappingStage.LLM_NOVEL,
            mapping_confidence=confidence,
            prop_mapping=prop_mapping,
            astro_component=astro,
            llm_model_used=model,
            llm_reasoning=reasoning,
        )
@@ -0,0 +1,145 @@
1
+ import asyncio
2
+ import logging
3
+ from component_mapper.models import (
4
+ MappedComponent,
5
+ MappingStage,
6
+ MappingCacheRecord,
7
+ RankedCandidate,
8
+ )
9
+ from component_mapper.cache.mapping_cache import MappingCache
10
+ from component_mapper.registry.signature_index import SignatureIndex
11
+ from component_mapper.registry.prop_mapper import infer_prop_mapping
12
+ from component_mapper.registry.astro_generator import generate_astro_component
13
+ from segment_classifier.models import ClassifiedSegment
14
+ from segment_classifier.utils.html_normalizer import normalize_segment
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class StructuralMatchStage:
    """Mapping stage that matches segments against the signature index.

    Each segment is sorted into one of three outcomes: a *direct* match
    (mapped and cached here), *ambiguous* (passed on together with its ranked
    candidates) or *novel* (no usable candidate).
    """

    def __init__(self, index: SignatureIndex, cache: MappingCache):
        """Store the signature index to query and the cache to write matches to."""
        self._index = index
        self._cache = cache

    async def process(
        self,
        segments: list[ClassifiedSegment],
    ) -> tuple[
        list[MappedComponent],
        list[tuple[ClassifiedSegment, list[RankedCandidate]]],
        list[ClassifiedSegment],
    ]:
        """Returns (direct_matches, ambiguous_with_candidates, novel).

        Candidates for all segments are fetched with one batch query keyed by
        fingerprint hash; the segments are then classified concurrently. A
        segment whose classification fails is logged and treated as novel.
        """
        if not segments:
            return [], [], []

        cfg = self._index._settings.signature_index

        # Build batch items for index query
        items = [
            (
                seg.component_type,
                normalize_segment(seg.raw_html, seg.text_content),
                seg.fingerprint_hash,
            )
            for seg in segments
        ]
        candidates_map = self._index.batch_get_candidates(items)

        results = await asyncio.gather(
            *(
                self._classify_segment(
                    seg, candidates_map.get(seg.fingerprint_hash, []), cfg
                )
                for seg in segments
            ),
            return_exceptions=True,
        )

        direct: list[MappedComponent] = []
        ambiguous: list[tuple[ClassifiedSegment, list[RankedCandidate]]] = []
        novel: list[ClassifiedSegment] = []

        for seg, result in zip(segments, results):
            # BaseException (not Exception): with return_exceptions=True a
            # cancelled child shows up as a CancelledError result, which must
            # not reach the tuple unpacking below.
            if isinstance(result, BaseException):
                logger.warning(
                    "Structural match error for %s: %s", seg.segment_id, result
                )
                novel.append(seg)
                continue

            outcome, data = result
            if outcome == "direct":
                direct.append(data)
            elif outcome == "ambiguous":
                ambiguous.append(data)
            else:
                novel.append(seg)

        logger.info(
            "Structural match: %d direct, %d ambiguous, %d novel",
            len(direct),
            len(ambiguous),
            len(novel),
        )
        return direct, ambiguous, novel

    async def _classify_segment(
        self,
        seg: ClassifiedSegment,
        candidates: list[RankedCandidate],
        cfg,
    ) -> tuple[
        str,
        MappedComponent | tuple[ClassifiedSegment, list[RankedCandidate]] | None,
    ]:
        """Classify one segment from its ranked candidates.

        Returns one of:

        * ``("direct", MappedComponent)`` — the top score reaches
          ``cfg.direct_match_threshold``, the inferred prop mapping has no
          ambiguous entries and Astro generation succeeded; the mapping is
          also written to the cache.
        * ``("ambiguous", (seg, candidates))`` — the top score reaches
          ``cfg.candidate_min_threshold``, or a would-be direct match had an
          ambiguous prop mapping or failed Astro generation.
        * ``("novel", None)`` — there are no candidates or the top score is
          below ``cfg.candidate_min_threshold``.
        """
        if not candidates:
            return "novel", None

        top = candidates[0]

        if top.composite_score < cfg.direct_match_threshold:
            if top.composite_score >= cfg.candidate_min_threshold:
                return "ambiguous", (seg, candidates)
            return "novel", None

        # Try clean direct match
        sig = top.signature
        prop_mapping = infer_prop_mapping(seg.raw_html, sig.props)
        if prop_mapping.has_ambiguous:
            return "ambiguous", (seg, candidates)

        # Clean match — generate Astro and cache
        try:
            astro = generate_astro_component(
                seg, sig, prop_mapping, top.component_name
            )
        except Exception as exc:
            logger.debug(
                "Astro gen failed for direct match %s: %s",
                seg.segment_id,
                exc,
            )
            # Hand the segment on as ambiguous rather than dropping it.
            return "ambiguous", (seg, candidates)

        mapped = MappedComponent(
            segment_id=seg.segment_id,
            page_url=seg.page_url,
            component_type=seg.component_type,
            classification_stage=seg.classification_stage,
            component_name=top.component_name,
            registry_source=top.registry_source,
            mapping_stage=MappingStage.STRUCTURAL_MATCH,
            mapping_confidence=top.composite_score,
            prop_mapping=prop_mapping,
            astro_component=astro,
        )

        await self._cache.set(
            seg.fingerprint_hash,
            MappingCacheRecord(
                fingerprint_hash=seg.fingerprint_hash,
                component_name=top.component_name,
                registry_source=top.registry_source,
                prop_mapping=prop_mapping,
                mapping_stage=MappingStage.STRUCTURAL_MATCH,
                confidence=top.composite_score,
            ),
        )
        return "direct", mapped
File without changes