component-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,370 @@
1
+ import logging
2
+ from bs4 import BeautifulSoup
3
+ from component_mapper.models import PropDefinition, PropMapping
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
# Tag affinity per prop: lower-cased prop name -> {HTML tag: score in 0..1}.
PROP_NAME_TO_TAG_SCORES: dict[str, dict[str, float]] = {
    "title": {
        "h1": 0.95,
        "h2": 0.90,
        "h3": 0.85,
        "h4": 0.75,
        "p": 0.40,
    },
    "description": {
        "p": 0.90,
        "span": 0.70,
        "div": 0.50,
        "h4": 0.55,
    },
    "image": {
        "img": 0.99,
        "picture": 0.95,
    },
    "src": {
        "img": 0.99,
    },
    "href": {
        "a": 0.99,
    },
    "label": {
        "span": 0.85,
        "p": 0.70,
        "div": 0.60,
    },
    "price": {
        "span": 0.80,
        "p": 0.75,
        "div": 0.60,
    },
    "children": {
        "div": 0.70,
        "section": 0.70,
        "p": 0.65,
    },
    "footer": {
        "button": 0.80,
        "div": 0.60,
        "a": 0.70,
    },
    "action": {
        "button": 0.95,
        "a": 0.85,
    },
    "badge": {
        "span": 0.80,
    },
    "items": {
        "ul": 0.90,
        "ol": 0.85,
    },
}

# Every content-node type used in the compatibility table below.
_ALL_NODE_TYPES: frozenset[str] = frozenset(
    ("text", "image_url", "link", "action", "list")
)

# Declared prop type -> content-node types that may be assigned to it.
# Each row gets its own mutable set so the rows stay independent.
PROP_TYPE_COMPAT: dict[str, set[str]] = {
    "string": {"text", "image_url", "link"},
    "number": {"text"},
    "boolean": set(),  # no content node type maps naturally
    "ReactNode": set(_ALL_NODE_TYPES),
    "any": set(_ALL_NODE_TYPES),
}

# Class-name hints per prop: lower-cased prop name -> {substring: score}.
CLASS_HINT_SCORES: dict[str, dict[str, float]] = {
    "price": {
        "price": 0.90,
        "cost": 0.85,
        "amount": 0.80,
    },
    "badge": {
        "badge": 0.90,
        "tag": 0.80,
        "label": 0.75,
    },
    "description": {
        "desc": 0.85,
        "summary": 0.80,
        "excerpt": 0.80,
    },
    "title": {
        "title": 0.85,
        "heading": 0.80,
        "name": 0.80,
    },
    "footer": {
        "footer": 0.85,
        "actions": 0.80,
    },
}
37
+
38
+
39
def infer_prop_mapping(
    segment_html: str,
    props: list[PropDefinition],
) -> PropMapping:
    """Map the content of an HTML segment onto a component's props.

    Content nodes are extracted from ``segment_html``, every (prop, node)
    pair is scored, and a 1:1 assignment is selected.  Assigned pairs that
    score below 0.10 are dropped; pairs below 0.70 are kept but flagged as
    ambiguous.  Required props that end up without a node are reported in
    ``unmapped_props``; optional ones are silently omitted.

    An empty ``props`` list yields an empty ``PropMapping``.  If extraction
    fails or finds nothing, every required prop is reported as unmapped.
    """
    if not props:
        return PropMapping()

    def required_names() -> list[str]:
        return [p.name for p in props if p.required]

    try:
        nodes = _extract_content_nodes(segment_html)
    except Exception as exc:
        # Best effort: a parse failure degrades to "nothing could be mapped".
        logger.warning("Failed to parse segment HTML for prop mapping: %s", exc)
        return PropMapping(unmapped_props=required_names())

    if not nodes:
        return PropMapping(unmapped_props=required_names())

    # scores[p][n] is the match quality of prop p against node n.
    scores = [[_score_assignment(node, prop) for node in nodes] for prop in props]
    chosen = _greedy_assign(scores, len(props), len(nodes))

    entries = []
    missing = []
    any_ambiguous = False

    for idx, prop in enumerate(props):
        target = chosen.get(idx)
        confidence = None if target is None else scores[idx][target]

        # No node at all, or a match too weak to be worth reporting.
        if confidence is None or confidence < 0.10:
            if prop.required:
                missing.append(prop.name)
            continue

        node = nodes[target]
        is_ambiguous = confidence < 0.70
        any_ambiguous = any_ambiguous or is_ambiguous
        content_type = _infer_content_type(node, prop)

        entries.append(
            {
                "segment_field": node["selector"],
                "component_prop": prop.name,
                "type": content_type,
                "confidence": round(confidence, 4),
                "ambiguous": is_ambiguous,
            }
        )

    return PropMapping(
        mappings=entries,
        has_ambiguous=any_ambiguous,
        unmapped_props=missing,
    )
111
+
112
+
113
def _extract_content_nodes(html: str) -> list[dict]:
    """Extract mappable content nodes from an HTML fragment.

    The fragment is parsed with BeautifulSoup's ``html.parser`` and scanned
    in a fixed order: images, headings (h1..h6), paragraphs and spans,
    links, buttons, then lists.  Each element is given a selector string
    and only the first element seen for a given selector is kept.
    Paragraphs and spans without text are skipped.

    Returns:
        A list of descriptor dicts with the keys ``tag``, ``selector``,
        ``type`` (one of ``image_url``, ``text``, ``link``, ``action``,
        ``list``), ``class_tokens`` and ``content_preview``; image and
        link descriptors additionally carry ``attrs``.
    """
    soup = BeautifulSoup(html, "html.parser")
    nodes: list[dict] = []
    seen_selectors: set[str] = set()

    semantic_keywords = (
        "price",
        "title",
        "desc",
        "badge",
        "name",
        "label",
        "summary",
        "heading",
        "action",
        "cta",
        "footer",
        "content",
    )

    def make_selector(el, idx: int) -> str:
        """Build a selector: semantic class first, then sibling position."""
        tag = el.name
        for cls in el.get("class", []):
            lowered = cls.lower()
            if any(kw in lowered for kw in semantic_keywords):
                return f"{tag}.{cls}"

        # Use position-based selector
        parent = el.parent
        if parent and parent.name:
            siblings = [
                s for s in parent.children if getattr(s, "name", None) == tag
            ]
            if len(siblings) > 1:
                # Locate the element by identity.  bs4 tags compare equal
                # when their markup is equal, so list.index() would report
                # the first look-alike sibling and give identical siblings
                # the same position.
                pos = next(
                    (n for n, s in enumerate(siblings, start=1) if s is el),
                    None,
                )
                if pos is not None:
                    return f"{tag}:nth-of-type({pos})"
        return f"{tag}:{idx}"

    def add_node(el, selector: str, node_type: str, preview: str, attrs=None) -> None:
        """Record a descriptor for ``el`` and mark its selector as used."""
        node = {
            "tag": el.name,
            "selector": selector,
            "type": node_type,
            "class_tokens": _get_class_tokens(el),
            "content_preview": preview,
        }
        if attrs is not None:
            node["attrs"] = attrs
        nodes.append(node)
        seen_selectors.add(selector)

    # Images
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src", "")
        alt = img.get("alt", "")
        # NOTE(review): every image with a src shares the selector
        # "img[src]", so only the first such image is kept -- confirm that
        # this is intended.
        selector = "img[src]" if src else f"img:{i}"
        if selector not in seen_selectors:
            add_node(img, selector, "image_url", src or alt, {"src": src, "alt": alt})

    # Headings (by priority)
    for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        for i, el in enumerate(soup.find_all(tag)):
            selector = make_selector(el, i)
            if selector not in seen_selectors:
                add_node(el, selector, "text", el.get_text(strip=True)[:80])

    # Paragraphs and spans
    for tag in ["p", "span"]:
        for i, el in enumerate(soup.find_all(tag)):
            selector = make_selector(el, i)
            if selector in seen_selectors:
                continue
            text = el.get_text(strip=True)
            # Empty elements are skipped without reserving their selector.
            if text:
                add_node(el, selector, "text", text[:80])

    # Links
    for i, a in enumerate(soup.find_all("a")):
        selector = make_selector(a, i)
        if selector not in seen_selectors:
            href = a.get("href", "")
            add_node(a, selector, "link", href, {"href": href})

    # Buttons
    for i, btn in enumerate(soup.find_all("button")):
        selector = make_selector(btn, i)
        if selector not in seen_selectors:
            add_node(btn, selector, "action", btn.get_text(strip=True)[:80])

    # Lists
    for i, ul in enumerate(soup.find_all(["ul", "ol"])):
        selector = make_selector(ul, i)
        if selector not in seen_selectors:
            items = [li.get_text(strip=True) for li in ul.find_all("li")]
            add_node(ul, selector, "list", str(items[:3]))

    return nodes
255
+
256
+
257
+ def _get_class_tokens(el) -> list[str]:
258
+ classes = el.get("class", [])
259
+ return [c.lower() for c in classes]
260
+
261
+
262
def _score_assignment(content_node: dict, prop: PropDefinition) -> float:
    """Score how well a content node matches a prop. Returns 0.0-1.0.

    The node must first pass a type gate: its ``type`` has to be listed in
    ``PROP_TYPE_COMPAT`` for the prop's declared type (unknown prop types
    accept every node type).  A prop type whose compatible set is empty,
    such as ``boolean``, therefore never matches.

    For nodes that pass, the score is the best of: the prop-name/tag table
    lookup, a 0.80 substring match between prop name and a class token, and
    the class-hint table.  If nothing matched, ``ReactNode`` props fall back
    to 0.65, and any other prop gets 0.50 when prop name and tag name
    contain one another.
    """
    tag = content_node["tag"]
    node_type = content_node["type"]
    prop_name_lower = prop.name.lower()
    # Drop empty tokens: "" is a substring of every prop name and would
    # otherwise award 0.80 to any element carrying an empty class token.
    class_tokens = [tok for tok in content_node.get("class_tokens", []) if tok]

    # Type compatibility gate
    prop_type = prop.type.strip()
    compat_types = PROP_TYPE_COMPAT.get(
        prop_type, {"text", "image_url", "link", "action", "list"}
    )
    # The membership test must also run for an empty set; guarding it with
    # the set's truthiness would let "compatible with nothing" through.
    if node_type not in compat_types:
        return 0.0

    score = 0.0

    # Direct prop name -> tag lookup
    tag_scores = PROP_NAME_TO_TAG_SCORES.get(prop_name_lower, {})
    if tag in tag_scores:
        score = max(score, tag_scores[tag])

    # Prop name substring match in class tokens
    if any(
        prop_name_lower in tok or tok in prop_name_lower for tok in class_tokens
    ):
        score = max(score, 0.80)

    # Class hint scoring
    hint_map = CLASS_HINT_SCORES.get(prop_name_lower, {})
    for hint, hint_score in hint_map.items():
        if any(hint in tok for tok in class_tokens):
            score = max(score, hint_score)

    # ReactNode props accept anything at medium confidence
    if prop_type == "ReactNode" and score == 0.0:
        score = 0.65

    # Partial prop name match against tag
    if score == 0.0 and (prop_name_lower in tag or tag in prop_name_lower):
        score = 0.50

    return min(1.0, score)
306
+
307
+
308
+ def _infer_content_type(node: dict, prop: PropDefinition) -> str:
309
+ """Infer content type string for the mapping entry."""
310
+ node_type = node["type"]
311
+ prop_type = prop.type.strip()
312
+
313
+ if node_type == "image_url":
314
+ return "image_url"
315
+ if node_type == "action":
316
+ return "node"
317
+ if prop_type == "ReactNode":
318
+ return "node"
319
+ if node_type == "list":
320
+ return "node"
321
+ if node_type == "link":
322
+ return "text"
323
+ return "text"
324
+
325
+
326
+ def _greedy_assign(
327
+ score_matrix: list[list[float]],
328
+ n_props: int,
329
+ n_nodes: int,
330
+ ) -> dict[int, int]:
331
+ """Greedy 1:1 assignment: highest score first."""
332
+ # Try scipy first
333
+ try:
334
+ import numpy as np
335
+ from scipy.optimize import linear_sum_assignment
336
+
337
+ mat = np.zeros((n_props, max(n_props, n_nodes)))
338
+ for i in range(n_props):
339
+ for j in range(n_nodes):
340
+ mat[i, j] = score_matrix[i][j]
341
+
342
+ row_ind, col_ind = linear_sum_assignment(-mat)
343
+ result = {}
344
+ for r, c in zip(row_ind, col_ind):
345
+ if c < n_nodes and mat[r, c] > 0.05:
346
+ result[r] = c
347
+ return result
348
+ except ImportError:
349
+ pass
350
+
351
+ # Greedy fallback
352
+ pairs = []
353
+ for i in range(n_props):
354
+ for j in range(n_nodes):
355
+ pairs.append((score_matrix[i][j], i, j))
356
+ pairs.sort(reverse=True)
357
+
358
+ used_props: set[int] = set()
359
+ used_nodes: set[int] = set()
360
+ result: dict[int, int] = {}
361
+
362
+ for score, prop_idx, node_idx in pairs:
363
+ if prop_idx in used_props or node_idx in used_nodes:
364
+ continue
365
+ if score > 0.05:
366
+ result[prop_idx] = node_idx
367
+ used_props.add(prop_idx)
368
+ used_nodes.add(node_idx)
369
+
370
+ return result