component-mapper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- component_mapper/__init__.py +4 -0
- component_mapper/cache/__init__.py +0 -0
- component_mapper/cache/mapping_cache.py +72 -0
- component_mapper/config.py +247 -0
- component_mapper/mcp/__init__.py +0 -0
- component_mapper/mcp/official_client.py +182 -0
- component_mapper/mcp/registry_fetcher.py +214 -0
- component_mapper/models.py +159 -0
- component_mapper/pipeline.py +182 -0
- component_mapper/registry/__init__.py +0 -0
- component_mapper/registry/astro_generator.py +390 -0
- component_mapper/registry/custom_registry.py +127 -0
- component_mapper/registry/prop_mapper.py +370 -0
- component_mapper/registry/signature_index.py +694 -0
- component_mapper/stages/__init__.py +0 -0
- component_mapper/stages/astro_stage.py +122 -0
- component_mapper/stages/cache_lookup.py +93 -0
- component_mapper/stages/llm_mapper.py +509 -0
- component_mapper/stages/structural_match.py +145 -0
- component_mapper/utils/__init__.py +0 -0
- component_mapper/utils/similarity.py +69 -0
- component_mapper/utils/source_parser.py +292 -0
- component_mapper-0.1.0.dist-info/METADATA +16 -0
- component_mapper-0.1.0.dist-info/RECORD +25 -0
- component_mapper-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from bs4 import BeautifulSoup
|
|
3
|
+
from component_mapper.models import PropDefinition, PropMapping
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)

# Score table keyed by lower-cased prop name, then by HTML tag name.
# _score_assignment() looks a prop name up here and uses the value stored for
# the candidate node's tag as the starting score for that (node, prop) pair.
PROP_NAME_TO_TAG_SCORES: dict[str, dict[str, float]] = {
    "title": {
        "h1": 0.95,
        "h2": 0.90,
        "h3": 0.85,
        "h4": 0.75,
        "p": 0.40,
    },
    "description": {
        "p": 0.90,
        "span": 0.70,
        "div": 0.50,
        "h4": 0.55,
    },
    "image": {
        "img": 0.99,
        "picture": 0.95,
    },
    "src": {
        "img": 0.99,
    },
    "href": {
        "a": 0.99,
    },
    "label": {
        "span": 0.85,
        "p": 0.70,
        "div": 0.60,
    },
    "price": {
        "span": 0.80,
        "p": 0.75,
        "div": 0.60,
    },
    "children": {
        "div": 0.70,
        "section": 0.70,
        "p": 0.65,
    },
    "footer": {
        "button": 0.80,
        "div": 0.60,
        "a": 0.70,
    },
    "action": {
        "button": 0.95,
        "a": 0.85,
    },
    "badge": {
        "span": 0.80,
    },
    "items": {
        "ul": 0.90,
        "ol": 0.85,
    },
}
|
|
21
|
+
|
|
22
|
+
# Declared prop type -> the content-node "type" values that may be bound to
# it.  _score_assignment() consults this table before scoring a pair; prop
# types that are not listed here fall back to accepting every node type there.
PROP_TYPE_COMPAT: dict[str, set[str]] = {
    "string": {
        "text",
        "image_url",
        "link",
    },
    "number": {
        "text",
    },
    # no content node type maps naturally
    "boolean": set(),
    "ReactNode": {
        "text",
        "image_url",
        "link",
        "action",
        "list",
    },
    "any": {
        "text",
        "image_url",
        "link",
        "action",
        "list",
    },
}
|
|
29
|
+
|
|
30
|
+
# Lower-cased prop name -> {substring to look for in a CSS class token ->
# score}.  _score_assignment() raises a pair's score to the listed value when
# one of the node's class tokens contains the substring.
CLASS_HINT_SCORES: dict[str, dict[str, float]] = {
    "price": {
        "price": 0.90,
        "cost": 0.85,
        "amount": 0.80,
    },
    "badge": {
        "badge": 0.90,
        "tag": 0.80,
        "label": 0.75,
    },
    "description": {
        "desc": 0.85,
        "summary": 0.80,
        "excerpt": 0.80,
    },
    "title": {
        "title": 0.85,
        "heading": 0.80,
        "name": 0.80,
    },
    "footer": {
        "footer": 0.85,
        "actions": 0.80,
    },
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def infer_prop_mapping(
    segment_html: str,
    props: list[PropDefinition],
) -> PropMapping:
    """Work out which piece of a segment's HTML feeds which component prop.

    Every (prop, content node) pair is scored with ``_score_assignment`` and
    the scores are handed to ``_greedy_assign`` to pick at most one node per
    prop.  An assignment whose score is below 0.10 is discarded; one below
    0.70 is kept but flagged ``ambiguous``.

    Args:
        segment_html: HTML markup of the segment to analyse.
        props: Prop definitions of the target component.

    Returns:
        A ``PropMapping``.  It is empty when ``props`` is empty.  When the
        HTML cannot be parsed or yields no content nodes, it only lists the
        required props as unmapped.  Otherwise it carries one mapping entry
        per accepted assignment, the names of required props that received no
        acceptable node, and whether any entry is ambiguous.
    """
    if not props:
        return PropMapping()

    def nothing_mapped() -> PropMapping:
        # Result used when there is no content to assign at all.
        return PropMapping(unmapped_props=[p.name for p in props if p.required])

    try:
        content_nodes = _extract_content_nodes(segment_html)
    except Exception as exc:
        logger.warning("Failed to parse segment HTML for prop mapping: %s", exc)
        return nothing_mapped()

    if not content_nodes:
        return nothing_mapped()

    # score_matrix[prop index][node index]; higher means a better match.
    score_matrix = [
        [_score_assignment(candidate, prop) for candidate in content_nodes]
        for prop in props
    ]
    assignments = _greedy_assign(score_matrix, len(props), len(content_nodes))

    mappings = []
    unmapped = []
    has_ambiguous = False

    for prop_idx, prop in enumerate(props):
        node_idx = assignments.get(prop_idx)
        confidence = None if node_idx is None else score_matrix[prop_idx][node_idx]

        # No node at all, or one that barely scored: treat the prop as unmapped.
        if confidence is None or confidence < 0.10:
            if prop.required:
                unmapped.append(prop.name)
            continue

        node = content_nodes[node_idx]
        ambiguous = confidence < 0.70
        has_ambiguous = has_ambiguous or ambiguous
        content_type = _infer_content_type(node, prop)

        entry = {
            "segment_field": node["selector"],
            "component_prop": prop.name,
            "type": content_type,
            "confidence": round(confidence, 4),
            "ambiguous": ambiguous,
        }
        mappings.append(entry)

    return PropMapping(
        mappings=mappings,
        has_ambiguous=has_ambiguous,
        unmapped_props=unmapped,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _extract_content_nodes(html: str) -> list[dict]:
    """Extract candidate content nodes from an HTML fragment.

    The fragment is parsed with BeautifulSoup's ``html.parser`` and scanned in
    a fixed order: images, headings (h1..h6), paragraphs and spans, links,
    buttons, then lists.  Each element gets a selector string; an element
    whose selector was already emitted is skipped, so the first element wins.
    Paragraphs and spans without text are skipped as well.

    Args:
        html: HTML markup to scan.

    Returns:
        Node descriptors in scan order.  Every descriptor has the keys
        ``"tag"``, ``"selector"``, ``"type"`` (one of ``"image_url"``,
        ``"text"``, ``"link"``, ``"action"``, ``"list"``), ``"class_tokens"``
        and ``"content_preview"``; image and link descriptors additionally
        carry ``"attrs"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    nodes: list[dict] = []
    seen_selectors: set[str] = set()

    # A class containing one of these substrings is considered descriptive
    # enough to serve as the selector on its own.
    semantic_keywords = (
        "price",
        "title",
        "desc",
        "badge",
        "name",
        "label",
        "summary",
        "heading",
        "action",
        "cta",
        "footer",
        "content",
    )

    def make_selector(el, idx: int) -> str:
        """Build the selector for ``el``; ``idx`` is its index among same-tag hits."""
        tag = el.name
        for cls in el.get("class", []):
            lowered = cls.lower()
            if any(kw in lowered for kw in semantic_keywords):
                return f"{tag}.{cls}"

        # Use position-based selector
        parent = el.parent
        if parent and parent.name:
            siblings = [
                s for s in parent.children if hasattr(s, "name") and s.name == tag
            ]
            if len(siblings) > 1:
                # BeautifulSoup compares tags structurally with ``==``, so
                # ``siblings.index(el)`` would return the position of the
                # first sibling with identical markup.  Locate ``el`` by
                # identity so identical siblings get distinct positions.
                pos = next(
                    n for n, sib in enumerate(siblings, start=1) if sib is el
                )
                return f"{tag}:nth-of-type({pos})"
        return f"{tag}:{idx}"

    def add_node(el, tag_name, sel, node_type, preview, attrs=None) -> None:
        """Append one descriptor and remember its selector as used."""
        descriptor = {
            "tag": tag_name,
            "selector": sel,
            "type": node_type,
            "class_tokens": _get_class_tokens(el),
            "content_preview": preview,
        }
        if attrs is not None:
            descriptor["attrs"] = attrs
        nodes.append(descriptor)
        seen_selectors.add(sel)

    # Images.  NOTE: every image that has a src shares the selector
    # "img[src]", so only the first such image is emitted.
    for i, img in enumerate(soup.find_all("img")):
        sel = "img[src]" if img.get("src") else f"img:{i}"
        if sel in seen_selectors:
            continue
        add_node(
            img,
            "img",
            sel,
            "image_url",
            img.get("src", "") or img.get("alt", ""),
            attrs={"src": img.get("src", ""), "alt": img.get("alt", "")},
        )

    # Headings (by priority)
    for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        for i, el in enumerate(soup.find_all(tag)):
            sel = make_selector(el, i)
            if sel in seen_selectors:
                continue
            add_node(el, tag, sel, "text", el.get_text(strip=True)[:80])

    # Paragraphs and spans (only those that contain text)
    for tag in ["p", "span"]:
        for i, el in enumerate(soup.find_all(tag)):
            sel = make_selector(el, i)
            if sel in seen_selectors:
                continue
            text = el.get_text(strip=True)
            if text:
                add_node(el, tag, sel, "text", text[:80])

    # Links
    for i, a in enumerate(soup.find_all("a")):
        sel = make_selector(a, i)
        if sel in seen_selectors:
            continue
        href = a.get("href", "")
        add_node(a, "a", sel, "link", href, attrs={"href": a.get("href", "")})

    # Buttons
    for i, btn in enumerate(soup.find_all("button")):
        sel = make_selector(btn, i)
        if sel in seen_selectors:
            continue
        add_node(btn, "button", sel, "action", btn.get_text(strip=True)[:80])

    # Lists: the preview shows the text of the first three items.
    for i, ul in enumerate(soup.find_all(["ul", "ol"])):
        sel = make_selector(ul, i)
        if sel in seen_selectors:
            continue
        items = [li.get_text(strip=True) for li in ul.find_all("li")]
        add_node(ul, ul.name, sel, "list", str(items[:3]))

    return nodes
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _get_class_tokens(el) -> list[str]:
    """Return the CSS class names of ``el``, lower-cased.

    Args:
        el: Any object with a mapping-style ``get`` method; in this module a
            BeautifulSoup tag.

    Returns:
        The lower-cased class names in their original order.  A missing or
        ``None`` ``class`` attribute yields an empty list.
    """
    classes = el.get("class") or []
    # The attribute can be a plain string rather than a list (for example
    # when the parser does not treat ``class`` as multi-valued); iterating a
    # string would yield single characters, so split it on whitespace first.
    if isinstance(classes, str):
        classes = classes.split()
    # Drop empty tokens: callers do substring checks, and "" is a substring
    # of every string.
    return [c.lower() for c in classes if c]
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _score_assignment(content_node: dict, prop: PropDefinition) -> float:
    """Score how well a content node matches a prop. Returns 0.0-1.0.

    The score is the maximum of the applicable heuristics:

    * the ``PROP_NAME_TO_TAG_SCORES`` entry for the prop name and node tag;
    * 0.80 when the prop name and a class token contain one another;
    * the ``CLASS_HINT_SCORES`` value of any hint found in a class token.

    If none of these fire, a ``ReactNode`` prop gets 0.65, and any other prop
    gets 0.50 when the prop name and the tag name contain one another.

    Args:
        content_node: Node descriptor; ``"tag"`` and ``"type"`` are required,
            ``"class_tokens"`` is optional.
        prop: Prop definition; its ``name`` and ``type`` are read.

    Returns:
        0.0 when the node type is not compatible with the prop type,
        otherwise the heuristic score capped at 1.0.
    """
    tag = content_node["tag"]
    node_type = content_node["type"]
    # Ignore empty tokens: "" is a substring of every prop name and would
    # otherwise count as a class match.
    class_tokens = [tok for tok in content_node.get("class_tokens", []) if tok]
    prop_name_lower = prop.name.lower()

    # Type compatibility gate.  Unknown prop types accept every node type.
    # The check is unconditional so that a prop type mapped to an empty set
    # (``boolean``) rejects every node instead of skipping the gate.
    prop_type = prop.type.strip()
    compat_types = PROP_TYPE_COMPAT.get(
        prop_type, {"text", "image_url", "link", "action", "list"}
    )
    if node_type not in compat_types:
        return 0.0

    score = 0.0

    # Direct prop name -> tag lookup
    tag_scores = PROP_NAME_TO_TAG_SCORES.get(prop_name_lower, {})
    if tag in tag_scores:
        score = max(score, tag_scores[tag])

    # Prop name substring match in class tokens (either direction)
    if any(
        prop_name_lower in tok or tok in prop_name_lower for tok in class_tokens
    ):
        score = max(score, 0.80)

    # Class hint scoring
    hint_map = CLASS_HINT_SCORES.get(prop_name_lower, {})
    for hint, hint_score in hint_map.items():
        if any(hint in tok for tok in class_tokens):
            score = max(score, hint_score)

    # ReactNode props accept anything at medium confidence
    if prop_type == "ReactNode" and score == 0.0:
        score = 0.65

    # Partial prop name match against tag
    if score == 0.0 and (prop_name_lower in tag or tag in prop_name_lower):
        score = 0.50

    return min(1.0, score)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _infer_content_type(node: dict, prop: PropDefinition) -> str:
    """Pick the ``"type"`` string recorded on a mapping entry.

    Args:
        node: Content-node descriptor; only ``node["type"]`` is read.
        prop: Prop definition; only ``prop.type`` is read.

    Returns:
        ``"image_url"`` for image nodes; ``"node"`` for action and list
        nodes and for any non-image node bound to a ``ReactNode`` prop;
        ``"text"`` for everything else, links included.
    """
    kind = node["type"]
    declared = prop.type.strip()

    if kind == "image_url":
        return "image_url"

    renders_as_node = kind in ("action", "list") or declared == "ReactNode"
    return "node" if renders_as_node else "text"
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _greedy_assign(
    score_matrix: list[list[float]],
    n_props: int,
    n_nodes: int,
) -> dict[int, int]:
    """Assign props to content nodes one-to-one, favouring high scores.

    When numpy and scipy can be imported, the assignment is computed with
    ``scipy.optimize.linear_sum_assignment`` on the negated scores.  Otherwise
    a greedy pass takes the remaining pairs in descending score order.  The
    two strategies can pick different pairings for the same input.  In both
    cases a pair is only kept when its score is greater than 0.05.

    Args:
        score_matrix: ``score_matrix[i][j]`` is the score of prop ``i``
            against node ``j``.
        n_props: Number of props (rows) to consider.
        n_nodes: Number of nodes (columns) to consider.

    Returns:
        Mapping of prop index to node index, both plain ``int``.  Props
        without an acceptable node are absent.  Empty when there are no
        props or no nodes.
    """
    if n_props <= 0 or n_nodes <= 0:
        return {}

    # Try scipy first
    try:
        import numpy as np
        from scipy.optimize import linear_sum_assignment
    except ImportError:
        np = None

    if np is not None:
        # Pad with zero-score columns so there are at least as many columns
        # as props; a prop matched to a padding column is left unassigned.
        mat = np.zeros((n_props, max(n_props, n_nodes)))
        for i in range(n_props):
            for j in range(n_nodes):
                mat[i, j] = score_matrix[i][j]

        # The solver minimises cost, so negate the scores to maximise them.
        row_ind, col_ind = linear_sum_assignment(-mat)
        # Cast to int so callers get plain Python integers, not numpy scalars.
        return {
            int(r): int(c)
            for r, c in zip(row_ind, col_ind)
            if c < n_nodes and mat[r, c] > 0.05
        }

    # Greedy fallback: only pairs above the threshold can ever be assigned.
    pairs = [
        (score_matrix[i][j], i, j)
        for i in range(n_props)
        for j in range(n_nodes)
        if score_matrix[i][j] > 0.05
    ]
    # Descending by score; ties resolve towards the higher prop/node index.
    pairs.sort(reverse=True)

    used_props: set[int] = set()
    used_nodes: set[int] = set()
    result: dict[int, int] = {}

    for _score, prop_idx, node_idx in pairs:
        if prop_idx in used_props or node_idx in used_nodes:
            continue
        result[prop_idx] = node_idx
        used_props.add(prop_idx)
        used_nodes.add(node_idx)

    return result
|