wordlift-sdk 2.9.1__py3-none-any.whl → 2.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. wordlift_sdk/__init__.py +1 -1
  2. wordlift_sdk/render/__init__.py +30 -0
  3. wordlift_sdk/render/browser.py +132 -0
  4. wordlift_sdk/render/cleanup_options.py +24 -0
  5. wordlift_sdk/render/html_renderer.py +86 -0
  6. wordlift_sdk/render/render_options.py +21 -0
  7. wordlift_sdk/render/rendered_page.py +13 -0
  8. wordlift_sdk/render/xhtml_cleaner.py +126 -0
  9. wordlift_sdk/structured_data/__init__.py +27 -0
  10. wordlift_sdk/structured_data/agent.py +49 -0
  11. wordlift_sdk/structured_data/agent_generator.py +12 -0
  12. wordlift_sdk/structured_data/batch.py +220 -0
  13. wordlift_sdk/structured_data/constants.py +1 -0
  14. wordlift_sdk/structured_data/dataset_resolver.py +32 -0
  15. wordlift_sdk/structured_data/debug.py +23 -0
  16. wordlift_sdk/structured_data/engine.py +2875 -0
  17. wordlift_sdk/structured_data/inputs.py +58 -0
  18. wordlift_sdk/structured_data/io.py +44 -0
  19. wordlift_sdk/structured_data/materialization.py +70 -0
  20. wordlift_sdk/structured_data/models.py +48 -0
  21. wordlift_sdk/structured_data/orchestrator.py +194 -0
  22. wordlift_sdk/structured_data/rendering.py +43 -0
  23. wordlift_sdk/structured_data/schema_guide.py +17 -0
  24. wordlift_sdk/structured_data/structured_data_engine.py +58 -0
  25. wordlift_sdk/structured_data/validation.py +31 -0
  26. wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
  27. wordlift_sdk/url_source/__init__.py +7 -2
  28. wordlift_sdk/validation/__init__.py +7 -0
  29. wordlift_sdk/validation/generator.py +446 -0
  30. wordlift_sdk/validation/shacl.py +205 -0
  31. wordlift_sdk/validation/shacls/__init__.py +1 -0
  32. wordlift_sdk/validation/shacls/google-article.ttl +148 -0
  33. wordlift_sdk/validation/shacls/google-book.ttl +660 -0
  34. wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
  35. wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
  36. wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
  37. wordlift_sdk/validation/shacls/google-course.ttl +43 -0
  38. wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
  39. wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
  40. wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
  41. wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
  42. wordlift_sdk/validation/shacls/google-event.ttl +46 -0
  43. wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
  44. wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
  45. wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
  46. wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
  47. wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
  48. wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
  49. wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
  50. wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
  51. wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
  52. wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
  53. wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
  54. wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
  55. wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
  56. wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
  57. wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
  58. wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
  59. wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
  60. wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
  61. wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
  62. wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
  63. wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
  64. wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
  65. wordlift_sdk/validation/shacls/google-video.ttl +149 -0
  66. wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
  67. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/METADATA +3 -1
  68. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/RECORD +69 -5
  69. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/WHEEL +0 -0
wordlift_sdk/structured_data/engine.py
@@ -0,0 +1,2875 @@
+ """Generate structured data from a rendered web page."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import hashlib
+ import json
+ import logging
+ import re
+ import shutil
+ import subprocess
+ from dataclasses import dataclass
+ from importlib import resources
+ from pathlib import Path
+ from typing import Any, Callable
+ from urllib.parse import urlparse
+
+ import wordlift_client
+ from wordlift_client import ApiClient, Configuration
+ from wordlift_client import AgentApi
+ from wordlift_client.models.ask_request import AskRequest
+ from rdflib import Graph, Namespace, RDF
+ from rdflib.term import BNode, Identifier, Literal, URIRef
+
+ from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
+ from wordlift_sdk.validation.shacl import ValidationResult, validate_file
+
+
+ _SCHEMA_BASE = "https://schema.org"
+ _SCHEMA_HTTP = "http://schema.org/"
+ _AGENT_BASE_URL = "https://api.wordlift.io/agent"
+ _AGENT_MODEL = "gpt-5.1"
+ _RR = Namespace("http://www.w3.org/ns/r2rml#")
+ _RML = Namespace("http://w3id.org/rml/")
+ _RML_LEGACY = Namespace("http://semweb.mmlab.be/ns/rml#")
+ _QL = Namespace("http://semweb.mmlab.be/ns/ql#")
+ _SH = Namespace("http://www.w3.org/ns/shacl#")
+ _REVIEW_OPTIONAL_EXTRAS = {
+     "description",
+     "positiveNotes",
+     "negativeNotes",
+     "reviewBody",
+     "image",
+     "inLanguage",
+     "publisher",
+     "datePublished",
+ }
+
+
+ @dataclass
+ class StructuredDataOptions:
+     url: str
+     target_type: str | None
+     dataset_uri: str
+     headless: bool = True
+     timeout_ms: int = 30000
+     wait_until: str = "networkidle"
+     max_retries: int = 2
+     max_xhtml_chars: int = 40000
+     max_text_node_chars: int = 400
+     max_nesting_depth: int = 2
+     verbose: bool = True
+
+
+ @dataclass
+ class StructuredDataResult:
+     jsonld: dict[str, Any]
+     yarrml: str
+     jsonld_filename: str
+     yarrml_filename: str
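For orientation, a minimal sketch of how the options dataclass might be filled in; the URL, target type, and dataset URI below are hypothetical placeholders:

    from wordlift_sdk.structured_data.engine import StructuredDataOptions

    options = StructuredDataOptions(
        url="https://example.com/reviews/acme-widget",  # hypothetical page
        target_type="Review",                           # or None for AUTO detection
        dataset_uri="https://data.example.org/",        # hypothetical dataset
    )
    # The remaining fields keep the defaults above: headless=True,
    # timeout_ms=30000, wait_until="networkidle", max_retries=2, ...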
+ def _build_client(api_key: str, base_url: str) -> ApiClient:
+     config = Configuration(host=base_url)
+     config.api_key["ApiKey"] = api_key
+     config.api_key_prefix["ApiKey"] = "Key"
+     return ApiClient(config)
+
+
+ def _build_agent_client(api_key: str) -> ApiClient:
+     config = Configuration(host=_AGENT_BASE_URL)
+     config.api_key["ApiKey"] = api_key
+     config.api_key_prefix["ApiKey"] = "Key"
+     return ApiClient(config)
+
+
+ async def get_dataset_uri_async(api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
+     async with _build_client(api_key, base_url) as api_client:
+         api = wordlift_client.AccountApi(api_client)
+         account = await api.get_me()
+         dataset_uri = getattr(account, "dataset_uri", None)
+         if not dataset_uri:
+             raise RuntimeError("Failed to resolve dataset_uri from account get_me.")
+         return dataset_uri
+
+
+ def get_dataset_uri(api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
+     return asyncio.run(get_dataset_uri_async(api_key, base_url))
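A sketch of resolving the account's dataset URI through the synchronous wrapper; the API key is a placeholder, and DEFAULT_BASE_URL applies unless overridden:

    from wordlift_sdk.structured_data.engine import get_dataset_uri

    dataset_uri = get_dataset_uri(api_key="wl_XXXX")  # hypothetical key
    # Raises RuntimeError if the account response carries no dataset_uri.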
+ def normalize_type(value: str) -> str:
+     value = value.strip()
+     if value.startswith("schema:"):
+         return value.split(":", 1)[1]
+     if value.startswith("http://schema.org/"):
+         return value.split("/", 3)[-1]
+     if value.startswith("https://schema.org/"):
+         return value.split("/", 3)[-1]
+     return value
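The normalizer accepts the CURIE and both IRI spellings, so all of the following hold:

    assert normalize_type("schema:Review") == "Review"
    assert normalize_type("http://schema.org/Review") == "Review"
    assert normalize_type("https://schema.org/Review") == "Review"
    assert normalize_type("  Review ") == "Review"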
+ _GOOGLE_SHAPES_CACHE: Graph | None = None
+ _SCHEMA_SHAPES_CACHE: Graph | None = None
+ _SCHEMA_PROP_CACHE: set[str] | None = None
+ _SCHEMA_RANGE_CACHE: dict[str, dict[str, set[str]]] | None = None
+
+
+ def _load_google_shapes() -> Graph:
+     global _GOOGLE_SHAPES_CACHE
+     if _GOOGLE_SHAPES_CACHE is not None:
+         return _GOOGLE_SHAPES_CACHE
+     graph = Graph()
+     shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+     for entry in shapes_dir.iterdir():
+         if not entry.is_file() or not entry.name.endswith(".ttl"):
+             continue
+         if entry.name.startswith("google-") or entry.name == "review-snippet.ttl":
+             graph.parse(data=entry.read_text(encoding="utf-8"), format="turtle")
+     _GOOGLE_SHAPES_CACHE = graph
+     return graph
+
+
+ def _load_schema_shapes() -> Graph:
+     global _SCHEMA_SHAPES_CACHE
+     if _SCHEMA_SHAPES_CACHE is not None:
+         return _SCHEMA_SHAPES_CACHE
+     graph = Graph()
+     shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+     schema_path = shapes_dir.joinpath("schemaorg-grammar.ttl")
+     if not schema_path.is_file():
+         raise RuntimeError(
+             "schemaorg-grammar.ttl not found. Regenerate with scripts/generate_schema_shacls.py."
+         )
+     graph.parse(data=schema_path.read_text(encoding="utf-8"), format="turtle")
+     _SCHEMA_SHAPES_CACHE = graph
+     return graph
+
+
+ def _schema_property_set() -> set[str]:
+     global _SCHEMA_PROP_CACHE
+     if _SCHEMA_PROP_CACHE is not None:
+         return _SCHEMA_PROP_CACHE
+     graph = _load_schema_shapes()
+     props: set[str] = set()
+     for prop in graph.objects(None, _SH.path):
+         name = _path_to_string(graph, prop)
+         if not name:
+             continue
+         props.add(name)
+         props.add(name.split(".", 1)[0])
+     _SCHEMA_PROP_CACHE = props
+     return props
+
+
+ def _rdf_list_items(graph: Graph, head: Identifier) -> list[Identifier]:
+     items: list[Identifier] = []
+     current: Identifier | None = head
+     while current and current != RDF.nil:
+         first = graph.value(current, RDF.first)
+         if first is None:
+             break
+         items.append(first)
+         current = graph.value(current, RDF.rest)
+     return items
+
+
+ def _schema_property_ranges() -> dict[str, dict[str, set[str]]]:
+     global _SCHEMA_RANGE_CACHE
+     if _SCHEMA_RANGE_CACHE is not None:
+         return _SCHEMA_RANGE_CACHE
+     graph = _load_schema_shapes()
+     ranges: dict[str, dict[str, set[str]]] = {}
+     for shape in graph.subjects(_SH.targetClass, None):
+         target_class = graph.value(shape, _SH.targetClass)
+         type_name = _short_schema_name(target_class)
+         if not type_name:
+             continue
+         for prop in graph.objects(shape, _SH.property):
+             path = graph.value(prop, _SH.path)
+             if path is None:
+                 continue
+             prop_name = _path_to_string(graph, path)
+             if not prop_name:
+                 continue
+             or_list = graph.value(prop, _SH["or"])
+             if or_list is None:
+                 continue
+             for item in _rdf_list_items(graph, or_list):
+                 class_node = graph.value(item, _SH["class"])
+                 class_name = _short_schema_name(class_node)
+                 if not class_name:
+                     continue
+                 ranges.setdefault(type_name, {}).setdefault(prop_name, set()).add(
+                     class_name
+                 )
+     _SCHEMA_RANGE_CACHE = ranges
+     return ranges
+
+
+ def _short_schema_name(value: Identifier) -> str | None:
+     if not isinstance(value, URIRef):
+         return None
+     text = str(value)
+     if text.startswith(_SCHEMA_BASE):
+         return text[len(_SCHEMA_BASE) + 1 :]
+     if text.startswith(_SCHEMA_HTTP):
+         return text[len(_SCHEMA_HTTP) :]
+     return None
+
+
+ def _path_to_string(graph: Graph, path: Identifier) -> str | None:
+     if isinstance(path, URIRef):
+         return _short_schema_name(path)
+     if isinstance(path, BNode):
+         parts: list[str] = []
+         current: Identifier | None = path
+         while current and current != RDF.nil:
+             first = graph.value(current, RDF.first)
+             if first is None:
+                 break
+             name = _short_schema_name(first)
+             if not name:
+                 break
+             parts.append(name)
+             current = graph.value(current, RDF.rest)
+         if parts:
+             return ".".join(parts)
+     return None
+
+
+ def _property_guide_for_type(type_name: str) -> dict[str, list[str]]:
+     type_name = normalize_type(type_name)
+     targets = {
+         URIRef(f"{_SCHEMA_BASE}/{type_name}"),
+         URIRef(f"{_SCHEMA_HTTP}{type_name}"),
+     }
+
+     required: set[str] = set()
+     recommended: set[str] = set()
+     google_graph = _load_google_shapes()
+     for target in targets:
+         for shape in google_graph.subjects(_SH.targetClass, target):
+             for prop in google_graph.objects(shape, _SH.property):
+                 path = google_graph.value(prop, _SH.path)
+                 if path is None:
+                     continue
+                 min_count = google_graph.value(prop, _SH.minCount)
+                 if isinstance(min_count, Literal):
+                     try:
+                         if int(min_count) < 1:
+                             continue
+                     except Exception:
+                         continue
+                 else:
+                     continue
+                 prop_name = _path_to_string(google_graph, path)
+                 if not prop_name:
+                     continue
+                 severity = google_graph.value(prop, _SH.severity)
+                 if severity == _SH.Warning:
+                     recommended.add(prop_name)
+                 else:
+                     required.add(prop_name)
+
+     schema_props: set[str] = set()
+     schema_graph = _load_schema_shapes()
+     for target in targets:
+         for shape in schema_graph.subjects(_SH.targetClass, target):
+             for prop in schema_graph.objects(shape, _SH.property):
+                 path = schema_graph.value(prop, _SH.path)
+                 if path is None:
+                     continue
+                 prop_name = _path_to_string(schema_graph, path)
+                 if not prop_name:
+                     continue
+                 schema_props.add(prop_name)
+
+     optional = sorted(schema_props.difference(required).difference(recommended))
+
+     return {
+         "required": sorted(required),
+         "recommended": sorted(recommended),
+         "optional": optional,
+         "schema": sorted(schema_props),
+     }
+
+
+ def _related_types_for_type(
+     type_name: str,
+     property_guide: dict[str, list[str]],
+     ranges: dict[str, dict[str, set[str]]],
+ ) -> list[str]:
+     related: set[str] = set()
+     prop_ranges = ranges.get(type_name, {})
+     prop_candidates = property_guide.get("required", []) + property_guide.get(
+         "recommended", []
+     )
+     if not prop_candidates:
+         prop_candidates = property_guide.get("schema", [])
+     for prop in prop_candidates:
+         base = prop.split(".", 1)[0]
+         for range_type in prop_ranges.get(base, set()):
+             if range_type == "Thing":
+                 continue
+             related.add(range_type)
+     return sorted(related)
+
+
+ def property_guides_with_related(
+     type_name: str,
+     max_depth: int = 2,
+ ) -> dict[str, dict[str, list[str]]]:
+     type_name = normalize_type(type_name)
+     ranges = _schema_property_ranges()
+     guides: dict[str, dict[str, list[str]]] = {}
+     queue: list[tuple[str, int]] = [(type_name, 0)]
+     seen: set[str] = set()
+
+     while queue:
+         current, depth = queue.pop(0)
+         if current in seen:
+             continue
+         seen.add(current)
+         guide = _property_guide_for_type(current)
+         guides[current] = guide
+         if depth >= max_depth:
+             continue
+         for related in _related_types_for_type(current, guide, ranges):
+             if related not in seen:
+                 queue.append((related, depth + 1))
+
+     return guides
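A sketch of the guide walk for a single type; "Recipe" is an arbitrary example, and the keys mirror what _property_guide_for_type returns:

    guides = property_guides_with_related("schema:Recipe", max_depth=1)
    recipe = guides["Recipe"]
    recipe["required"]     # Google shapes with minCount >= 1 and severity Violation
    recipe["recommended"]  # Google shapes with severity Warning
    recipe["optional"]     # Schema.org-only properties outside the two lists above
    # Types reachable through the properties' sh:or ranges (except Thing)
    # are visited up to max_depth and get guides of their own.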
+ def shape_specs_for_type(type_name: str) -> list[str]:
+     return all_shape_specs()
+
+
+ def shape_specs_for_types(type_names: list[str]) -> list[str]:
+     return all_shape_specs()
+
+
+ def all_shape_specs() -> list[str]:
+     shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+     shape_specs: list[str] = []
+     for entry in shapes_dir.iterdir():
+         if not entry.is_file() or not entry.name.endswith(".ttl"):
+             continue
+         if entry.name not in shape_specs:
+             shape_specs.append(entry.name)
+     if "schemaorg-grammar.ttl" not in shape_specs:
+         shape_specs.append("schemaorg-grammar.ttl")
+     return shape_specs
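Note that both type-scoped helpers currently delegate to all_shape_specs(), so any call returns the complete bundled list, e.g.:

    specs = shape_specs_for_type("Review")
    assert "schemaorg-grammar.ttl" in specs
    assert "google-review-snippet.ttl" in specs  # present in the file list above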
+ def _slugify(value: str, default: str) -> str:
+     cleaned = re.sub(r"[^A-Za-z0-9]+", "-", value.strip().lower()).strip("-")
+     return cleaned or default
+
+
+ def _dash_type(value: str) -> str:
+     value = re.sub(r"[^A-Za-z0-9]+", "-", value.strip())
+     value = re.sub(r"(?<!^)(?=[A-Z])", "-", value)
+     return re.sub(r"-+", "-", value).strip("-").lower()
+
+
+ def _pluralize(value: str) -> str:
+     if value.endswith("y") and len(value) > 1 and value[-2] not in "aeiou":
+         return value[:-1] + "ies"
+     if value.endswith(("s", "x", "z", "ch", "sh")):
+         return value + "es"
+     return value + "s"
+
+
+ def _hash_url(url: str, length: int = 12) -> str:
+     return hashlib.sha256(url.encode("utf-8")).hexdigest()[:length]
+
+
+ def build_output_basename(url: str, default: str = "page") -> str:
+     parsed = urlparse(url)
+     base = f"{parsed.netloc}{parsed.path}".strip("/")
+     slug = _slugify(base or url, default=default)
+     return f"{slug}--{_hash_url(url)}"
+
+
+ def build_id(
+     dataset_uri: str,
+     type_name: str,
+     name: str,
+     url: str | None,
+     index: int,
+ ) -> str:
+     return build_id_base(dataset_uri, type_name, name, url, index)
+
+
+ def build_id_base(
+     base_uri: str,
+     type_name: str,
+     name: str,
+     url: str | None,
+     index: int,
+ ) -> str:
+     base = base_uri.rstrip("/")
+     dashed_type = _dash_type(type_name)
+     plural_type = _pluralize(dashed_type)
+     name_slug = _slugify(name, default=_dash_type(type_name))
+     if url:
+         suffix = _hash_url(url)
+     else:
+         suffix = str(index)
+     return f"{base}/{plural_type}/{name_slug}-{suffix}"
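A sketch of the identifier shape these helpers produce; the dataset URI and entity values are hypothetical, and the suffix is the first 12 hex characters of the SHA-256 of the page URL:

    node_id = build_id(
        "https://data.example.org/",   # hypothetical dataset URI
        "LocalBusiness",
        "Acme Widget Co",
        "https://example.com/about",
        0,
    )
    # -> "https://data.example.org/local-businesses/acme-widget-co-<12 hex>"
    # CamelCase types are dashed and pluralized; with url=None the index
    # is used as the suffix instead of the hash.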
+ def _format_prop_list(items: list[str]) -> str:
+     if not items:
+         return "none"
+     return ", ".join(items)
+
+
+ def _agent_prompt(
+     url: str,
+     html: str,
+     target_type: str | None,
+     property_guides: dict[str, dict[str, list[str]]] | None = None,
+     missing_required: list[str] | None = None,
+     missing_recommended: list[str] | None = None,
+     previous_yarrml: str | None = None,
+     validation_errors: list[str] | None = None,
+     validation_report: list[str] | None = None,
+     xpath_warnings: list[str] | None = None,
+     allow_properties: dict[str, list[str]] | None = None,
+     quality_feedback: list[str] | None = None,
+ ) -> str:
+     target = target_type or "AUTO"
+     guide_lines: list[str] = []
+     if property_guides:
+         guide_lines.append(
+             "Property guide by type (Google required/recommended + Schema.org grammar):"
+         )
+         for type_name, guide in property_guides.items():
+             guide_lines.append(f"- {type_name}:")
+             guide_lines.append(
+                 f"  - Required (Google): {_format_prop_list(guide.get('required', []))}"
+             )
+             guide_lines.append(
+                 f"  - Recommended (Google): {_format_prop_list(guide.get('recommended', []))}"
+             )
+             guide_lines.append(
+                 "  - Optional (Schema.org, excluding required/recommended): "
+                 f"{_format_prop_list(guide.get('optional', []))}"
+             )
+             guide_lines.append(
+                 f"  - All Schema.org properties for {type_name}: {_format_prop_list(guide.get('schema', []))}"
+             )
+         guide_lines.append("")
+         guide_lines.append(
+             "If a required property is not present on the page, omit it (do not fabricate)."
+         )
+         guide_lines.append("Only use properties listed in the guide for each type.")
+         guide_lines.append("")
+     if allow_properties:
+         guide_lines.append("Allowed properties (Google only):")
+         for type_name, props in allow_properties.items():
+             guide_lines.append(f"- {type_name}: {_format_prop_list(props)}")
+         guide_lines.append("")
+
+     refine_lines: list[str] = []
+     if missing_required:
+         refine_lines.append("Missing required properties in the previous mapping:")
+         refine_lines.append(", ".join(missing_required))
+         refine_lines.append(
+             "Update the mapping to add these properties if the data exists on the page."
+         )
+         refine_lines.append("Keep existing correct mappings and selectors.")
+         refine_lines.append("")
+     if missing_recommended:
+         refine_lines.append("Missing recommended properties in the previous mapping:")
+         refine_lines.append(", ".join(missing_recommended))
+         refine_lines.append("Add these properties if the data exists on the page.")
+         refine_lines.append("")
+     if validation_errors:
+         refine_lines.append("Validation errors from the previous mapping:")
+         refine_lines.extend(validation_errors)
+         refine_lines.append("Fix these issues without fabricating data.")
+         refine_lines.append("")
+     if validation_report:
+         refine_lines.append("Validation report from the previous mapping:")
+         refine_lines.extend(validation_report)
+         refine_lines.append(
+             "Use the report to fix the mapping without fabricating data."
+         )
+         refine_lines.append("")
+     if xpath_warnings:
+         refine_lines.append("XPath evaluation warnings from the previous mapping:")
+         refine_lines.extend(xpath_warnings)
+         refine_lines.append("Fix the XPath selectors that returned no results.")
+         refine_lines.append("")
+     if quality_feedback:
+         refine_lines.append("Quality feedback from the previous mapping:")
+         refine_lines.extend(quality_feedback)
+         refine_lines.append(
+             "Improve the mapping to raise the quality score while only using data present in XHTML."
+         )
+         refine_lines.append("")
+     if previous_yarrml:
+         refine_lines.append("Previous mapping:")
+         refine_lines.append(previous_yarrml.strip())
+         refine_lines.append("")
+
+     guide_text = "\n".join(guide_lines) if guide_lines else ""
+     refine_text = "\n".join(refine_lines) if refine_lines else ""
+     return (
+         f"analyze the entities on this webpage: {url}\n"
+         "\n"
+         "You are a structured data extraction agent.\n"
+         "Goal: produce a YARRRML mapping using XPath only.\n"
+         "Use the provided XHTML source instead of fetching the URL.\n"
+         "Do NOT parse any existing structured data (JSON-LD, RDFa, Microdata).\n"
+         "Do NOT emit @id values. IDs will be assigned locally.\n"
+         "Output ONLY the YARRRML mapping (no prose, no code fences).\n"
+         "\n"
+         f"Target Schema.org type: {target}\n"
+         "\n"
+         "Requirements:\n"
+         "- Use XPath in all selectors.\n"
+         "- Use $(xpath) for XPath references (not {xpath}).\n"
+         '- Do NOT wrap XPath expressions in quotes inside $(...). Use $(/path), not $("/path").\n'
+         '- Always quote attribute values in XPath (e.g., @id="..."). Do NOT use @id=foo.\n'
+         "- The main mapping must include schema:url with the exact URL.\n"
+         "- Always include schema:name for every mapped node.\n"
+         "- Include schema:description for Review if available.\n"
+         "- Include schema:image if available (prefer og:image). \n"
+         "- Include schema:inLanguage if available (html/@lang). \n"
+         "- Include schema:publisher if available (prefer og:site_name as Organization). \n"
+         "- Include schema:reviewBody for Review if available (main article text). Prefer the paragraph immediately following the H1\n"
+         "  (e.g., following-sibling::p[1]) and only use class-based selectors if necessary.\n"
+         '- Include schema:datePublished for Review if available (time/@datetime or meta[property="article:published_time"],\n'
+         "  otherwise use the first byline date).\n"
+         "- Include positiveNotes/negativeNotes for Review if available.\n"
+         "- Include relevant properties for the main type.\n"
+         "- If Target Schema.org type is AUTO, infer the best type and use it.\n"
+         "- Define dependent nodes as separate mappings and link them from the main mapping.\n"
+         "- Prefer reusable XPath selectors that generalize across pages using the same template.\n"
+         "- Avoid brittle selectors that depend on full class names, IDs, or numeric suffixes unless there is no alternative.\n"
+         "- Prefer structural paths (head/meta, main/h1, time[@datetime], link[@rel], figure/img) and stable attributes.\n"
+         "- If you must use classes or IDs, prefer contains(@class, 'stable-token') over exact matches and avoid numeric IDs.\n"
+         "- NEVER use table IDs with numeric suffixes (e.g., tablepress-12345). Instead, locate tables by header text\n"
+         "  (e.g., th contains 'APR'/'rating') and then select the adjacent cell by position or data-th.\n"
+         '- Do NOT key selectors off a specific person name or URL slug; use byline labels like "Written by" or metadata instead.\n'
+         "- For author, prefer metadata or rel links first (meta[name=author], meta[property=article:author], link[rel=author]) before class-based selectors.\n"
+         '- If the page shows a byline label (e.g., "Written by"), select the author link or text immediately following that label.\n'
+         "- For positiveNotes/negativeNotes (Review/Product and subclasses only), anchor on semantic headings (Pros/Cons, Advantages/Disadvantages,\n"
+         "  What we like/What we don't like). Prefer heading text matches (contains(., 'Pros')) over IDs/classes,\n"
+         "  and select the closest following list items (li) from ul/ol, rows from tables, or terms/defs from dl.\n"
+         "  Detect the page language and include localized heading variants with English as a fallback. Avoid site-specific classes/IDs unless there is no alternative.\n"
+         "- Only include reviewRating if the page explicitly provides a rating score (stars or numeric rating). Do NOT infer ratings from APR/fee tables or unrelated metrics.\n"
+         "- Do NOT use hard-coded literal values. All values must come from XPath except schema:url.\n"
+         "- ratingValue must be a literal extracted from XPath (not an IRI).\n"
+         "- reviewRating must point to a Rating node.\n"
+         "- author must be a Person or Organization node.\n"
+         "\n"
+         f"{guide_text}"
+         f"{refine_text}"
+         "XHTML:\n"
+         f"{html}\n"
+     )
+
+
+ def _quality_prompt(
+     url: str,
+     xhtml: str,
+     jsonld: dict[str, Any] | list[Any],
+     property_guides: dict[str, dict[str, list[str]]] | None,
+     target_type: str | None,
+ ) -> str:
+     guide_lines: list[str] = []
+     if property_guides:
+         guide_lines.append(
+             "Property guide by type (Google required/recommended + Schema.org grammar):"
+         )
+         for type_name, guide in property_guides.items():
+             guide_lines.append(f"- {type_name}:")
+             guide_lines.append(
+                 f"  - Required (Google): {_format_prop_list(guide.get('required', []))}"
+             )
+             guide_lines.append(
+                 f"  - Recommended (Google): {_format_prop_list(guide.get('recommended', []))}"
+             )
+             guide_lines.append(
+                 "  - Optional (Schema.org, excluding required/recommended): "
+                 f"{_format_prop_list(guide.get('optional', []))}"
+             )
+         guide_lines.append("")
+     guide_text = "\n".join(guide_lines) if guide_lines else ""
+     payload = json.dumps(jsonld, ensure_ascii=True)
+     return (
+         f"analyze the entities on this webpage: {url}\n"
+         "\n"
+         "You are evaluating structured data quality.\n"
+         "Compare the XHTML and JSON-LD. Only count properties that are present in XHTML.\n"
+         "Do NOT penalize missing properties if they do not appear in the XHTML.\n"
+         "Return a JSON object with keys: score (0-10 integer), missing_in_jsonld (list),\n"
+         "suggested_xpath (object mapping property -> XPath), notes (list).\n"
+         "Use XPath in suggested_xpath and keep it generic/reusable.\n"
+         "\n"
+         f"Target Schema.org type: {target_type or 'AUTO'}\n"
+         "\n"
+         f"{guide_text}"
+         "XHTML:\n"
+         f"{xhtml}\n"
+         "\n"
+         "JSON-LD:\n"
+         f"{payload}\n"
+     )
+
+
+ async def _ask_agent_async(
+     prompt: str, api_key: str, model: str | None = None
+ ) -> object:
+     async with _build_agent_client(api_key) as api_client:
+         api = AgentApi(api_client)
+         ask_request = AskRequest(message=prompt, model=model or _AGENT_MODEL)
+         return await api.ask_request_api_ask_post(ask_request)
+
+
+ def _collect_strings(payload: Any, results: list[str]) -> None:
+     if isinstance(payload, str):
+         if payload.strip():
+             results.append(payload)
+         return
+     if isinstance(payload, dict):
+         for value in payload.values():
+             _collect_strings(value, results)
+         return
+     if isinstance(payload, list):
+         for value in payload:
+             _collect_strings(value, results)
+
+
+ def _extract_agent_text(payload: Any) -> str | None:
+     if isinstance(payload, str) and payload.strip():
+         return payload.strip()
+     if isinstance(payload, dict):
+         for key in (
+             "response",
+             "answer",
+             "content",
+             "result",
+             "output",
+             "text",
+             "message",
+         ):
+             if key in payload:
+                 value = _extract_agent_text(payload.get(key))
+                 if value:
+                     return value
+     strings: list[str] = []
+     _collect_strings(payload, strings)
+     for value in strings:
+         if "mappings:" in value or "prefixes:" in value:
+             return value.strip()
+     for value in strings:
+         if value.strip():
+             return value.strip()
+     return None
+
+
+ def _extract_agent_json(payload: Any) -> dict[str, Any] | None:
+     if isinstance(payload, dict):
+         return payload
+     text = _extract_agent_text(payload)
+     if not text:
+         return None
+     start = text.find("{")
+     end = text.rfind("}")
+     if start == -1 or end == -1 or end <= start:
+         return None
+     snippet = text[start : end + 1]
+     try:
+         return json.loads(snippet)
+     except Exception:
+         return None
+
+
+ def ask_agent_for_yarrml(
+     api_key: str,
+     url: str,
+     html: str,
+     target_type: str | None,
+     debug: bool = False,
+     debug_path: Path | None = None,
+     property_guides: dict[str, dict[str, list[str]]] | None = None,
+     missing_required: list[str] | None = None,
+     missing_recommended: list[str] | None = None,
+     previous_yarrml: str | None = None,
+     validation_errors: list[str] | None = None,
+     validation_report: list[str] | None = None,
+     xpath_warnings: list[str] | None = None,
+     allow_properties: dict[str, list[str]] | None = None,
+     quality_feedback: list[str] | None = None,
+ ) -> str:
+     prompt = _agent_prompt(
+         url,
+         html,
+         target_type,
+         property_guides=property_guides,
+         missing_required=missing_required,
+         missing_recommended=missing_recommended,
+         previous_yarrml=previous_yarrml,
+         validation_errors=validation_errors,
+         validation_report=validation_report,
+         xpath_warnings=xpath_warnings,
+         allow_properties=allow_properties,
+         quality_feedback=quality_feedback,
+     )
+     try:
+         response = asyncio.run(_ask_agent_async(prompt, api_key))
+     except Exception as exc:
+         raise RuntimeError(f"Agent request failed: {exc}") from exc
+
+     if isinstance(response, dict):
+         data = response
+     else:
+         try:
+             data = response.model_dump()
+         except Exception:
+             data = {}
+
+     if debug and debug_path is not None:
+         debug_path.parent.mkdir(parents=True, exist_ok=True)
+         debug_payload = {
+             "prompt": prompt,
+             "response": data,
+         }
+         debug_path.write_text(json.dumps(debug_payload, indent=2))
+
+     extracted = _extract_agent_text(data)
+     if extracted:
+         return extracted
+
+     raise RuntimeError("Agent response did not include YARRRML content.")
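A sketch of a refinement pass, feeding validator and XPath feedback from a first attempt back into the agent; the key, URL, and feedback strings are placeholders:

    yarrrml = ask_agent_for_yarrml(
        api_key="wl_XXXX",                      # hypothetical key
        url="https://example.com/reviews/acme-widget",
        html=xhtml_source,                      # cleaned XHTML of the page
        target_type="Review",
        previous_yarrml=first_attempt,          # mapping from the first pass
        missing_required=["reviewRating"],
        xpath_warnings=["//div[@class='score'] returned no results"],
    )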
+ def ask_agent_for_quality(
+     api_key: str,
+     url: str,
+     xhtml: str,
+     jsonld: dict[str, Any] | list[Any],
+     property_guides: dict[str, dict[str, list[str]]] | None,
+     target_type: str | None,
+ ) -> dict[str, Any] | None:
+     prompt = _quality_prompt(url, xhtml, jsonld, property_guides, target_type)
+     try:
+         response = asyncio.run(_ask_agent_async(prompt, api_key))
+     except Exception as exc:
+         raise RuntimeError(f"Agent quality request failed: {exc}") from exc
+     return _extract_agent_json(response)
+
+
+ def _replace_sources_with_file(yarrml: str, file_uri: str) -> str:
+     pattern = re.compile(r"(\[\s*['\"])([^'\"]+)(['\"]\s*,\s*['\"]xpath['\"])")
+     inline_pattern = re.compile(r"(\[\s*)([^,\]]+?~xpath)(\s*,)")
+
+     def repl(match: re.Match[str]) -> str:
+         return f"{match.group(1)}{file_uri}{match.group(3)}"
+
+     def repl_inline(match: re.Match[str]) -> str:
+         return f"{match.group(1)}{file_uri}~xpath{match.group(3)}"
+
+     yarrml = pattern.sub(repl, yarrml)
+     return inline_pattern.sub(repl_inline, yarrml)
+
+
+ def _replace_sources_with_placeholder(yarrml: str, placeholder: str) -> str:
+     pattern = re.compile(r"(\[\s*['\"])([^'\"]+)(['\"]\s*,\s*['\"]xpath['\"])")
+     inline_pattern = re.compile(r"(\[\s*)([^,\]]+?~xpath)(\s*,)")
+
+     def repl(match: re.Match[str]) -> str:
+         return f"{match.group(1)}{placeholder}{match.group(3)}"
+
+     def repl_inline(match: re.Match[str]) -> str:
+         return f"{match.group(1)}{placeholder}~xpath{match.group(3)}"
+
+     yarrml = pattern.sub(repl, yarrml)
+     return inline_pattern.sub(repl_inline, yarrml)
+
+
+ def make_reusable_yarrrml(
+     yarrml: str, url: str, source_placeholder: str = "__XHTML__"
+ ) -> str:
+     normalized = _replace_sources_with_placeholder(yarrml, source_placeholder)
+     escaped_url = re.escape(url)
+     normalized = re.sub(
+         rf"(schema:url\s*,\s*['\"])({escaped_url})(['\"])",
+         r"\1__URL__\3",
+         normalized,
+     )
+     return normalized
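A sketch of the template round trip this enables: make_reusable_yarrrml swaps the concrete source and URL for __XHTML__ and __URL__ placeholders, which materialize_yarrrml (further below) re-binds against another page; the paths are hypothetical:

    from pathlib import Path

    template = make_reusable_yarrrml(mapping_text, url="https://example.com/reviews/acme-widget")
    graph = materialize_yarrrml(
        template, Path("other-page.xhtml"), Path("work"),
        url="https://example.com/reviews/other-widget",
    )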
+ def _strip_quotes(value: str | None) -> str:
+     if not isinstance(value, str):
+         return ""
+     value = value.strip()
+     if (value.startswith('"') and value.endswith('"')) or (
+         value.startswith("'") and value.endswith("'")
+     ):
+         return value[1:-1]
+     return value
+
+
+ def _strip_wrapped_list(value: str | None) -> str:
+     if not isinstance(value, str):
+         return ""
+     value = value.strip()
+     if value.startswith("[") and value.endswith("]"):
+         value = value[1:-1].strip()
+     return _strip_quotes(value)
+
+
+ def _strip_all_quotes(value: str | None) -> str:
+     if not isinstance(value, str):
+         return ""
+     value = value.strip()
+     while (value.startswith('"') and value.endswith('"')) or (
+         value.startswith("'") and value.endswith("'")
+     ):
+         value = value[1:-1].strip()
+     return value
+
+
+ def _normalize_xpath_literal(value: str) -> str:
+     value = value.strip()
+     if value.startswith("{") and value.endswith("}"):
+         inner = value[1:-1].strip()
+         return f"$({inner})"
+     if value.startswith("$(xpath)/"):
+         tail = value[len("$(xpath)") :]
+         return f"$({tail})"
+     if value.startswith("$(xpath://") or value.startswith("$(xpath:/"):
+         tail = value[len("$(xpath:") :]
+         return f"$({tail})"
+     if value.startswith("$(") and value.endswith(")"):
+         inner = value[2:-1].strip()
+         if (inner.startswith('"') and inner.endswith('"')) or (
+             inner.startswith("'") and inner.endswith("'")
+         ):
+             return f"$({inner[1:-1]})"
+     if value.startswith("$(xpath)',"):
+         _, _, tail = value.partition(",")
+         tail = _strip_all_quotes(tail.strip())
+         return _normalize_xpath_literal(f"$({tail})")
+     return value
+
+
+ def _looks_like_xpath(value: str) -> bool:
+     value = value.strip()
+     return (
+         (value.startswith("$(") and value.endswith(")"))
+         or value.startswith("/")
+         or value.startswith(".//")
+         or value.startswith("//")
+         or value.startswith("normalize-")
+         or value.startswith("normalize(")
+         or value.startswith("string(")
+         or value.startswith("concat(")
+     )
+
+
+ def _simplify_xpath(value: str) -> str:
+     value = value.strip()
+     if value.startswith("$(") and value.endswith(")"):
+         value = value[2:-1].strip()
+     if value.startswith("xpath="):
+         value = value[len("xpath=") :].strip()
+     if value.startswith("xpath://") or value.startswith("xpath:/"):
+         value = value[len("xpath:") :]
+     if value.startswith(('"', "'")) and value.endswith(('"', "'")) and len(value) >= 2:
+         value = value[1:-1]
+     value = value.replace('\\"', '"').replace("\\'", "'")
+     match = re.match(r"(?:normalize-space|string)\((.+)\)$", value)
+     if match:
+         return match.group(1).strip()
+     return value
+
+
+ def _normalize_xpath_reference(value: str) -> str:
+     value = value.strip()
+     value = value.replace("/text()", "")
+     value = value.replace("text()", "")
+     value = re.sub(r"contains\(@class,\s*\"([^\"]+)\"\)", r"@class=\"\1\"", value)
+     value = re.sub(r"contains\(@class,\s*'([^']+)'\)", r"@class=\"\1\"", value)
+     value = re.sub(r"contains\(@id,\s*\"([^\"]+)\"\)", r"@id=\"\1\"", value)
+     value = re.sub(r"contains\(@id,\s*'([^']+)'\)", r"@id=\"\1\"", value)
+     return value
+
+
+ def _first_list_value(value: str) -> str:
+     value = value.strip()
+     if value.startswith("[") and value.endswith("]"):
+         match = re.search(r"['\"]([^'\"]+)['\"]", value)
+         if match:
+             return match.group(1)
+     return _strip_all_quotes(_strip_wrapped_list(value))
+
+
+ def _normalize_agent_yarrml(
+     raw: str,
+     url: str,
+     file_uri: str,
+     target_type: str | None,
+ ) -> tuple[str, list[dict[str, Any]]]:
+     raw = _quote_unquoted_xpath_attributes(raw)
+     raw = re.sub(r"(['\"])\{([^{}]+)\}\1", r"\1$(\2)\1", raw)
+     raw = re.sub(
+         r"(['\"])\$\(xpath\)\1\s*,\s*(['\"])([^'\"]+)\2", r"\1$(\3)\1", raw
+     )
+     lines = raw.splitlines()
+     mappings: list[dict[str, Any]] = []
+     current: dict[str, Any] | None = None
+     in_mappings = False
+     mappings_indent: int | None = None
+     ignore_keys = {"mappings", "prefixes", "sources", "po"}
+     last_p: str | None = None
+     pending_o: str | None = None
+     in_props_block = False
+     props_indent = 0
+     in_sources = False
+     sources_indent: int | None = None
+
+     for line in lines:
+         stripped = line.strip()
+         indent = len(line) - len(line.lstrip(" "))
+         if not stripped:
+             continue
+         if stripped == "mappings:":
+             in_mappings = True
+             mappings_indent = indent
+             continue
+         if (
+             in_mappings
+             and stripped.endswith(":")
+             and not stripped.startswith("-")
+             and not stripped.startswith("s:")
+             and mappings_indent is not None
+             and indent == mappings_indent + 2
+         ):
+             name = stripped[:-1].strip()
+             if name in ignore_keys:
+                 continue
+             current = {"name": name, "type": None, "props": []}
+             mappings.append(current)
+             last_p = None
+             pending_o = None
+             in_props_block = False
+             continue
+         if current is None:
+             continue
+         if stripped == "sources:":
+             in_sources = True
+             sources_indent = indent
+             continue
+         if in_sources:
+             if sources_indent is not None and indent > sources_indent:
+                 continue
+             in_sources = False
+             sources_indent = None
+         if stripped == "po:":
+             continue
+         if stripped == "p:":
+             in_props_block = True
+             props_indent = indent
+             pending_o = None
+             last_p = None
+             continue
+         if in_props_block and indent <= props_indent and stripped != "p:":
+             in_props_block = False
+             pending_o = None
+         if stripped.startswith("- [") and "value:" in stripped:
+             if current is None:
+                 continue
+             match = re.search(r"value:\s*\"([^\"]+)\"", stripped) or re.search(
+                 r"value:\s*'([^']+)'", stripped
+             )
+             if match:
+                 current["source_xpath"] = match.group(1)
+             else:
+                 match = re.search(r"\['html'\]\s*,\s*\"([^\"]+)\"", stripped)
+                 if match:
+                     current["source_xpath"] = match.group(1)
+             continue
+         if stripped.startswith("- ["):
+             if current is None:
+                 continue
+             match = re.match(r"- \[\s*\"([^\"]+)\"\s*\]$", stripped) or re.match(
+                 r"- \[\s*'([^']+)'\s*\]$", stripped
+             )
+             if match:
+                 current["source_xpath"] = match.group(1)
+                 continue
+         if stripped.startswith("s:") and stripped.endswith(":") and stripped != "s:":
+             prop_name = stripped[2:-1].strip()
+             if prop_name:
+                 pending_o = prop_name
+             continue
+         if stripped.startswith("s:") and not stripped.startswith("s: "):
+             rest = stripped[2:]
+             prop, _, obj_part = rest.partition(":")
+             prop = prop.strip()
+             obj = _normalize_xpath_literal(
+                 _strip_all_quotes(_strip_wrapped_list(obj_part.strip()))
+             )
+             if prop:
+                 current["props"].append((prop, obj))
+             continue
+         if stripped.startswith("s:"):
+             value = _strip_all_quotes(
+                 _strip_wrapped_list(stripped.split(":", 1)[1].strip())
+             )
+             if value.startswith(("http://", "https://")) or _looks_like_xpath(value):
+                 continue
+             if value.startswith("schema:"):
+                 current["type"] = normalize_type(value)
+                 continue
+             if re.fullmatch(r"[A-Za-z][A-Za-z0-9]+", value):
+                 current["type"] = value
+             continue
+         if stripped.startswith("- p:"):
+             _, value = stripped.split("p:", 1)
+             last_p = _strip_quotes(value.strip())
+             pending_o = None
+             continue
+         if in_props_block and stripped.startswith("schema:") and stripped.endswith(":"):
+             pending_o = stripped[:-1].strip()
+             continue
+         if stripped.startswith("o:") and last_p:
+             _, value = stripped.split("o:", 1)
+             obj = _normalize_xpath_literal(
+                 _strip_all_quotes(_strip_wrapped_list(value.strip()))
+             )
+             if obj:
+                 current["props"].append((last_p, obj))
+                 last_p = None
+                 pending_o = None
+             else:
+                 pending_o = last_p
+             continue
+         if stripped.startswith("- [a,") or stripped.startswith("- [ a,"):
+             match = re.search(r"'schema:([^']+)'", stripped) or re.search(
+                 r"\"schema:([^\"]+)\"", stripped
+             )
+             if match:
+                 current["type"] = normalize_type(match.group(1))
+             continue
+         if stripped.startswith("value:") and pending_o:
+             _, value = stripped.split("value:", 1)
+             obj = _normalize_xpath_literal(_first_list_value(value.strip()))
+             current["props"].append((pending_o, obj))
+             pending_o = None
+             last_p = None
+             continue
+         if stripped.startswith("mapping:") and pending_o:
+             _, value = stripped.split("mapping:", 1)
+             obj = _normalize_xpath_literal(
+                 _strip_all_quotes(_strip_wrapped_list(value.strip()))
+             )
+             if obj:
+                 current["props"].append((pending_o, obj))
+             pending_o = None
+             last_p = None
+             continue
+         if stripped.startswith("- ["):
+             if "p:" in stripped and "o:" in stripped:
+                 match = re.search(r"\[p:\s*([^,]+),\s*o:\s*(.+)\]$", stripped)
+                 if match:
+                     prop = _strip_quotes(match.group(1).strip())
+                     obj = _normalize_xpath_literal(
+                         _strip_all_quotes(_strip_wrapped_list(match.group(2).strip()))
+                     )
+                     if prop == "a":
+                         continue
+                     current["props"].append((prop, obj))
+                     continue
+             match = re.search(r"\[\s*([^,]+)\s*,\s*(.+)\]$", stripped)
+             if match:
+                 prop = _strip_quotes(match.group(1).strip())
+                 obj = _normalize_xpath_literal(
+                     _strip_all_quotes(_strip_wrapped_list(match.group(2).strip()))
+                 )
+                 if prop == "a":
+                     continue
+                 current["props"].append((prop, obj))
+
+     if not mappings:
+         raise RuntimeError("Agent response did not include recognizable mappings.")
+
+     mapping_names = {m["name"] for m in mappings}
+     target = normalize_type(target_type) if target_type else None
+     denied_props = {"schema:html", "html"}
+     schema_props = _schema_property_set()
+
+     main_mapping = None
+     for mapping in mappings:
+         if target and normalize_type(mapping["type"] or "") == target:
+             main_mapping = mapping
+             break
+     if main_mapping is None:
+         main_mapping = mappings[0]
+     if main_mapping:
+         main_mapping["__main__"] = True
+
+     output_lines = [
+         "prefixes:",
+         f"  schema: '{_SCHEMA_BASE}/'",
+         "  ex: 'http://example.com/'",
+         "mappings:",
+     ]
+
+     for mapping in mappings:
+         map_name = mapping["name"]
+         map_type = mapping["type"] or ("Review" if mapping is main_mapping else "Thing")
+         map_type = normalize_type(map_type)
+         output_lines += [
+             f"  {map_name}:",
+             "    sources:",
+             f"      - [{file_uri}~xpath, '/']",
+             f"    s: ex:{map_name}~iri",
+             "    po:",
+             f"      - [a, 'schema:{map_type}']",
+         ]
+         props = list(mapping["props"])
+         source_xpath = mapping.get("source_xpath")
+
+         if mapping is main_mapping:
+             has_url = any(p == "schema:url" for p, _ in props)
+             if not has_url:
+                 props.insert(0, ("schema:url", url))
+
+         for prop, obj in props:
+             if not prop.startswith("schema:"):
+                 prop = f"schema:{prop}"
+             prop_name = prop[7:]
+             if prop_name == "a" or prop == "schema:a":
+                 continue
+             if "~" in prop_name or "http" in prop_name:
+                 continue
+             if prop in denied_props:
+                 continue
+             if prop_name not in schema_props:
+                 continue
+             if not obj:
+                 continue
+             if obj == "{value}" and source_xpath:
+                 obj = source_xpath
+             if (
+                 isinstance(obj, str)
+                 and obj.startswith("ex:")
+                 and obj[3:] in mapping_names
+             ):
+                 obj = obj[3:]
+             if obj in mapping_names:
+                 output_lines.append(f"      - [{prop}, ex:{obj}~iri]")
+                 continue
+             if _looks_like_xpath(obj):
+                 xpath = _normalize_xpath_reference(_simplify_xpath(obj)).replace(
+                     "'", '"'
+                 )
+                 output_lines.append(f"      - [{prop}, '$(%s)']" % xpath)
+                 continue
+             output_lines.append(f"      - [{prop}, '{obj}']")
+
+     return "\n".join(output_lines) + "\n", mappings
+
+
+ def _quote_unquoted_xpath_attributes(text: str) -> str:
+     pattern = re.compile(
+         r"@(id|class|property|rel|name|type|itemprop|content|href|src)\s*=\s*([A-Za-z0-9_-]+)"
+     )
+
+     def repl(match: re.Match[str]) -> str:
+         attr = match.group(1)
+         value = match.group(2)
+         return f'@{attr}="{value}"'
+
+     return pattern.sub(repl, text)
+
+
+ def _run_yarrrml_parser(input_path: Path, output_path: Path) -> None:
+     parser = shutil.which("yarrrml-parser")
+     if not parser:
+         raise RuntimeError(
+             "yarrrml-parser is required. Install with: npm install -g @rmlio/yarrrml-parser"
+         )
+     if output_path.exists():
+         output_path.unlink()
+     result = subprocess.run(
+         [parser, "-i", str(input_path), "-o", str(output_path)],
+         capture_output=True,
+         text=True,
+         check=False,
+     )
+     if not output_path.exists():
+         error = (result.stderr or result.stdout).strip()
+         raise RuntimeError(f"yarrrml-parser failed to produce output. {error}")
+     if result.returncode != 0:
+         raise RuntimeError(f"yarrrml-parser failed: {result.stderr.strip()}")
+
+
+ def _materialize_graph(mapping_path: Path) -> Graph:
+     try:
+         import morph_kgc
+     except ImportError as exc:
+         raise RuntimeError(
+             "morph-kgc is required. Install with: pip install morph-kgc"
+         ) from exc
+
+     config = (
+         "[CONFIGURATION]\n"
+         "output_format = N-TRIPLES\n"
+         "\n"
+         "[DataSource1]\n"
+         f"mappings = {mapping_path}\n"
+     )
+     return morph_kgc.materialize(config)
+
+
+ def materialize_yarrrml(
+     yarrrml: str,
+     xhtml_path: Path,
+     workdir: Path,
+     *,
+     url: str | None = None,
+ ) -> Graph:
+     file_uri = xhtml_path.as_posix()
+     normalized = _replace_sources_with_file(yarrrml, file_uri)
+     if url:
+         normalized = re.sub(
+             r"(schema:url\s*,\s*['\"])__URL__(['\"])",
+             rf"\1{url}\2",
+             normalized,
+         )
+     workdir.mkdir(parents=True, exist_ok=True)
+     yarrml_path = workdir / "mapping.yarrrml"
+     rml_path = workdir / "mapping.ttl"
+     yarrml_path.write_text(normalized)
+     _run_yarrrml_parser(yarrml_path, rml_path)
+     _ensure_subject_termtype_iri(rml_path)
+     _normalize_reference_formulation(rml_path)
+     return _materialize_graph(rml_path)
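A sketch of the materialization path end to end, using normalize_yarrrml_mappings (defined just below) on the agent output first; it assumes yarrrml-parser is on PATH (npm install -g @rmlio/yarrrml-parser) and morph-kgc is installed (pip install morph-kgc), and the file paths are placeholders:

    from pathlib import Path

    normalized, mappings = normalize_yarrrml_mappings(
        agent_yarrrml,                   # raw mapping text from the agent
        url="https://example.com/reviews/acme-widget",
        xhtml_path=Path("page.xhtml"),
        target_type="Review",
    )
    graph = materialize_yarrrml(normalized, Path("page.xhtml"), Path("work"))
    print(graph.serialize(format="turtle"))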
+ def normalize_yarrrml_mappings(
+     yarrrml: str,
+     url: str,
+     xhtml_path: Path,
+     target_type: str | None = None,
+ ) -> tuple[str, list[dict[str, Any]]]:
+     return _normalize_agent_yarrml(yarrrml, url, xhtml_path.as_posix(), target_type)
+
+
+ def materialize_yarrrml_jsonld(
+     yarrrml: str,
+     xhtml_path: Path,
+     workdir: Path,
+     *,
+     url: str | None = None,
+ ) -> dict[str, Any] | list[Any]:
+     file_uri = xhtml_path.as_posix()
+     normalized = _replace_sources_with_file(yarrrml, file_uri)
+     if url:
+         normalized = re.sub(
+             r"(schema:url\s*,\s*['\"])__URL__(['\"])",
+             rf"\1{url}\2",
+             normalized,
+         )
+     workdir.mkdir(parents=True, exist_ok=True)
+     yarrml_path = workdir / "mapping.yarrml"
+     rml_path = workdir / "mapping.ttl"
+     yarrml_path.write_text(normalized)
+     _run_yarrrml_parser(yarrml_path, rml_path)
+     _ensure_subject_termtype_iri(rml_path)
+     _normalize_reference_formulation(rml_path)
+     return _materialize_jsonld(rml_path)
+
+
+ def postprocess_jsonld(
+     jsonld_raw: dict[str, Any] | list[Any],
+     mappings: list[dict[str, Any]],
+     xhtml: str,
+     dataset_uri: str,
+     url: str,
+     target_type: str | None = None,
+ ) -> dict[str, Any]:
+     jsonld_raw = _fill_jsonld_from_mappings(jsonld_raw, mappings, xhtml)
+     _ensure_node_ids(jsonld_raw, dataset_uri, url)
+     _dedupe_review_notes(jsonld_raw)
+     normalized = normalize_jsonld(
+         jsonld_raw, dataset_uri, url, target_type, embed_nodes=False
+     )
+     _materialize_literal_nodes(normalized, dataset_uri, url)
+     _ensure_author_name(normalized, xhtml, dataset_uri, url)
+     _ensure_review_url(normalized, url)
+     _prune_empty_rating_nodes(normalized)
+     return normalized
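A sketch of the finishing pass over the materialized JSON-LD; jsonld_raw, mappings, and xhtml_source come from the earlier steps, and the dataset URI is hypothetical:

    jsonld_raw = materialize_yarrrml_jsonld(normalized, Path("page.xhtml"), Path("work"))
    final = postprocess_jsonld(
        jsonld_raw,
        mappings,
        xhtml_source,
        dataset_uri="https://data.example.org/",
        url="https://example.com/reviews/acme-widget",
        target_type="Review",
    )
    # Fills gaps from the mappings, assigns @id values, dedupes identical
    # positive/negative notes, turns literal author/publisher values into
    # nodes, guarantees schema:url on Review nodes, and prunes Rating nodes
    # that lack a ratingValue.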
+
1317
+ def _prune_empty_rating_nodes(data: dict[str, Any] | list[Any]) -> None:
1318
+ nodes = _flatten_jsonld(data)
1319
+ if not nodes:
1320
+ return
1321
+ rating_value_key = f"{_SCHEMA_BASE}/ratingValue"
1322
+ empty_rating_ids: set[str] = set()
1323
+
1324
+ def _has_rating_value(node: dict[str, Any]) -> bool:
1325
+ value = node.get(rating_value_key, node.get("ratingValue"))
1326
+ if value is None:
1327
+ return False
1328
+ values = value if isinstance(value, list) else [value]
1329
+ for item in values:
1330
+ if isinstance(item, dict):
1331
+ text = item.get("@value") or item.get("value")
1332
+ else:
1333
+ text = item
1334
+ if isinstance(text, str) and text.strip():
1335
+ return True
1336
+ if isinstance(text, (int, float)):
1337
+ return True
1338
+ return False
1339
+
1340
+ for node in nodes:
1341
+ if not isinstance(node, dict):
1342
+ continue
1343
+ node_types = {
1344
+ normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
1345
+ }
1346
+ if "Rating" not in node_types:
1347
+ continue
1348
+ if not _has_rating_value(node):
1349
+ node_id = node.get("@id")
1350
+ if isinstance(node_id, str):
1351
+ empty_rating_ids.add(node_id)
1352
+ if not empty_rating_ids:
1353
+ return
1354
+
1355
+ def _filter_refs(values: Any) -> Any:
1356
+ if isinstance(values, list):
1357
+ filtered = [
1358
+ value
1359
+ for value in values
1360
+ if not (
1361
+ isinstance(value, dict) and value.get("@id") in empty_rating_ids
1362
+ )
1363
+ ]
1364
+ return filtered
1365
+ return values
1366
+
1367
+ for node in nodes:
1368
+ if not isinstance(node, dict):
1369
+ continue
1370
+ for key in (f"{_SCHEMA_BASE}/reviewRating", "reviewRating"):
1371
+ if key in node:
1372
+ node[key] = _filter_refs(node[key])
1373
+ if not node[key]:
1374
+ node.pop(key, None)
1375
+
1376
+ if isinstance(data, dict) and isinstance(data.get("@graph"), list):
1377
+ data["@graph"] = [
1378
+ node
1379
+ for node in data["@graph"]
1380
+ if not (isinstance(node, dict) and node.get("@id") in empty_rating_ids)
1381
+ ]
1382
+ elif isinstance(data, list):
1383
+ data[:] = [
1384
+ node
1385
+ for node in data
1386
+ if not (isinstance(node, dict) and node.get("@id") in empty_rating_ids)
1387
+ ]
1388
+
1389
+
1390
+ def _dedupe_review_notes(data: dict[str, Any] | list[Any]) -> None:
1391
+ nodes = _flatten_jsonld(data)
1392
+ if not nodes:
1393
+ return
1394
+ pos_key = f"{_SCHEMA_BASE}/positiveNotes"
1395
+ neg_key = f"{_SCHEMA_BASE}/negativeNotes"
1396
+
1397
+ def _extract_values(values: Any) -> list[str]:
1398
+ if isinstance(values, list):
1399
+ items = values
1400
+ else:
1401
+ items = [values]
1402
+ normalized: list[str] = []
1403
+ for item in items:
1404
+ if isinstance(item, dict):
1405
+ value = item.get("@value") or item.get("value")
1406
+ else:
1407
+ value = item
1408
+ if isinstance(value, str):
1409
+ normalized.append(value.strip())
1410
+ return [value for value in normalized if value]
1411
+
1412
+ for node in nodes:
1413
+ if not isinstance(node, dict):
1414
+ continue
1415
+ node_types = {
1416
+ normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
1417
+ }
1418
+ if "Review" not in node_types and "Product" not in node_types:
1419
+ continue
1420
+ pos_values = _extract_values(node.get(pos_key) or node.get("positiveNotes"))
1421
+ neg_values = _extract_values(node.get(neg_key) or node.get("negativeNotes"))
1422
+ if pos_values and neg_values and pos_values == neg_values:
1423
+ node.pop(pos_key, None)
1424
+ node.pop("positiveNotes", None)
1425
+
1426
+
+ def _materialize_literal_nodes(
+     data: dict[str, Any] | list[Any],
+     dataset_uri: str,
+     url: str,
+ ) -> None:
+     nodes = _flatten_jsonld(data)
+     if not nodes:
+         return
+     graph = data.get("@graph") if isinstance(data, dict) else None
+     if not isinstance(graph, list):
+         return
+     schema_author = f"{_SCHEMA_BASE}/author"
+     schema_item = f"{_SCHEMA_BASE}/itemReviewed"
+     schema_publisher = f"{_SCHEMA_BASE}/publisher"
+     schema_name = f"{_SCHEMA_BASE}/name"
+
+     def _ensure_node(type_name: str, name: str, index: int) -> dict[str, Any]:
+         node_id = build_id_base(dataset_uri, type_name, name, url, index)
+         node = {
+             "@id": node_id,
+             "@type": [f"{_SCHEMA_BASE}/{type_name}"],
+             schema_name: [{"@value": name}],
+             "@context": _SCHEMA_BASE,
+         }
+         graph.append(node)
+         return node
+
+     def _replace_literal(
+         node: dict[str, Any], key: str, type_name: str, start_index: int
+     ) -> None:
+         values = node.get(key)
+         if not values:
+             return
+         items = values if isinstance(values, list) else [values]
+         new_refs: list[dict[str, Any]] = []
+         for idx, item in enumerate(items, start=start_index):
+             if isinstance(item, dict) and item.get("@id"):
+                 new_refs.append(item)
+                 continue
+             if isinstance(item, dict) and "@value" in item:
+                 name = str(item["@value"]).strip()
+             else:
+                 name = str(item).strip()
+             if not name:
+                 continue
+             new_node = _ensure_node(type_name, name, idx)
+             new_refs.append({"@id": new_node["@id"]})
+         if new_refs:
+             node[key] = new_refs
+         else:
+             node.pop(key, None)
+
+     review_nodes = [
+         node
+         for node in nodes
+         if isinstance(node, dict)
+         and "Review"
+         in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
+     ]
+     for index, review in enumerate(review_nodes, start=1):
+         _replace_literal(review, schema_author, "Person", index)
+         _replace_literal(review, schema_item, "Product", index + 100)
+         _replace_literal(review, schema_publisher, "Organization", index + 200)
+
+
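# Editor's sketch (not part of the diff): the effect of materializing a
# literal author into a referenced Person node. The id scheme below is
# hypothetical; in the package the ids come from build_id_base.
graph: list[dict] = []
review = {"https://schema.org/author": [{"@value": "Jane Doe"}]}

def materialize_author(node: dict, graph: list[dict]) -> None:
    refs = []
    for item in node.get("https://schema.org/author", []):
        name = str(item.get("@value", "")).strip()
        if not name:
            continue
        # Hypothetical id scheme, for illustration only.
        node_id = "https://data.example.org/person/" + name.lower().replace(" ", "-")
        graph.append(
            {
                "@id": node_id,
                "@type": ["https://schema.org/Person"],
                "https://schema.org/name": [{"@value": name}],
            }
        )
        refs.append({"@id": node_id})
    if refs:
        node["https://schema.org/author"] = refs

materialize_author(review, graph)
assert review["https://schema.org/author"] == [
    {"@id": "https://data.example.org/person/jane-doe"}
]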
+ def _ensure_review_url(data: dict[str, Any] | list[Any], url: str) -> None:
+     nodes = _flatten_jsonld(data)
+     if not nodes or not url:
+         return
+     url_key = f"{_SCHEMA_BASE}/url"
+     for node in nodes:
+         if not isinstance(node, dict):
+             continue
+         node_types = {
+             normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
+         }
+         if "Review" not in node_types:
+             continue
+         if url_key not in node:
+             node[url_key] = [{"@value": url}]
+
+
+ def _ensure_author_name(
+     data: dict[str, Any] | list[Any],
+     xhtml: str,
+     dataset_uri: str,
+     url: str,
+ ) -> None:
+     author_name = _extract_author_name(xhtml)
+     if not author_name:
+         return
+     nodes = _flatten_jsonld(data)
+     if not nodes:
+         return
+     schema_name = f"{_SCHEMA_BASE}/name"
+     schema_author = f"{_SCHEMA_BASE}/author"
+     graph = data.get("@graph") if isinstance(data, dict) else None
+     if not isinstance(graph, list):
+         return
+
+     author_nodes = [
+         node
+         for node in nodes
+         if isinstance(node, dict)
+         and "Person"
+         in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
+     ]
+     for node in author_nodes:
+         if schema_name not in node:
+             node[schema_name] = [{"@value": author_name}]
+
+     review_nodes = [
+         node
+         for node in nodes
+         if isinstance(node, dict)
+         and "Review"
+         in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
+     ]
+     for review in review_nodes:
+         if schema_author in review:
+             continue
+         author_node = build_id_base(dataset_uri, "Person", author_name, url, 0)
+         graph.append(
+             {
+                 "@id": author_node,
+                 "@type": [f"{_SCHEMA_BASE}/Person"],
+                 schema_name: [{"@value": author_name}],
+                 "@context": _SCHEMA_BASE,
+             }
+         )
+         review[schema_author] = [{"@id": author_node}]
+
+
+ def _extract_author_name(xhtml: str) -> str | None:
+     try:
+         from lxml import html as lxml_html
+     except Exception:
+         return None
+     parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
+     try:
+         doc = lxml_html.document_fromstring(xhtml, parser=parser)
+     except Exception:
+         return None
+
+     candidates = [
+         "//meta[@name='author']/@content",
+         "//meta[@property='article:author']/@content",
+         "//a[@rel='author']/text()",
+         "//*[contains(normalize-space(.), 'Written by')]/following::a[1]/text()",
+     ]
+     for path in candidates:
+         try:
+             results = doc.xpath(path)
+         except Exception:
+             continue
+         for value in results:
+             if isinstance(value, str) and value.strip():
+                 return value.strip()
+     return None
+
+
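# Editor's sketch (not part of the diff): the first two candidate XPaths
# above, run against a tiny document. Requires lxml.
from lxml import html as lxml_html

page = """<html><head><meta name="author" content="Jane Doe"/></head>
<body><a rel="author">J. Doe</a></body></html>"""
doc = lxml_html.document_fromstring(page)
print(doc.xpath("//meta[@name='author']/@content"))  # ['Jane Doe']
print(doc.xpath("//a[@rel='author']/text()"))        # ['J. Doe']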
+ def _materialize_jsonld(mapping_path: Path) -> dict[str, Any] | list[Any]:
+     graph = _materialize_graph(mapping_path)
+     jsonld_str = graph.serialize(format="json-ld")
+     return json.loads(jsonld_str)
+
+
+ def _ensure_subject_termtype_iri(mapping_path: Path) -> None:
+     graph = Graph()
+     graph.parse(mapping_path, format="turtle")
+     for subject_map in graph.subjects(RDF.type, _RR.SubjectMap):
+         graph.add((subject_map, _RR.termType, _RR.IRI))
+     graph.serialize(destination=str(mapping_path), format="turtle")
+
+
+ def _normalize_reference_formulation(mapping_path: Path) -> None:
+     graph = Graph()
+     graph.parse(mapping_path, format="turtle")
+     replaced = False
+     for predicate in (_RML.referenceFormulation, _RML_LEGACY.referenceFormulation):
+         for subject in list(graph.subjects(predicate, _QL.XPath)):
+             graph.remove((subject, predicate, _QL.XPath))
+             graph.add((subject, predicate, _RML.XPath))
+             replaced = True
+     if replaced:
+         graph.serialize(destination=str(mapping_path), format="turtle")
+
+
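# Editor's sketch (not part of the diff): forcing rr:termType rr:IRI on each
# subject map with rdflib, as _ensure_subject_termtype_iri does above. The
# mapping snippet is illustrative; R2RML terms are from the standard
# http://www.w3.org/ns/r2rml# vocabulary.
from rdflib import Graph, Namespace, RDF

RR = Namespace("http://www.w3.org/ns/r2rml#")
g = Graph()
g.parse(
    data="""
    @prefix rr: <http://www.w3.org/ns/r2rml#> .
    <http://example.com/map> rr:subjectMap
        [ a rr:SubjectMap ; rr:template "http://example.com/{id}" ] .
    """,
    format="turtle",
)
for subject_map in g.subjects(RDF.type, RR.SubjectMap):
    g.add((subject_map, RR.termType, RR.IRI))
assert (None, RR.termType, RR.IRI) in g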
+ def _flatten_jsonld(data: dict[str, Any] | list[Any]) -> list[dict[str, Any]]:
+     if isinstance(data, list):
+         return [node for node in data if isinstance(node, dict)]
+     if "@graph" in data and isinstance(data["@graph"], list):
+         return [node for node in data["@graph"] if isinstance(node, dict)]
+     return [data] if isinstance(data, dict) else []
+
+
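# Editor's sketch (not part of the diff): the three input shapes the
# flattener above accepts, mirrored standalone.
def flatten(data):
    if isinstance(data, list):
        return [n for n in data if isinstance(n, dict)]
    if "@graph" in data and isinstance(data["@graph"], list):
        return [n for n in data["@graph"] if isinstance(n, dict)]
    return [data] if isinstance(data, dict) else []

assert flatten([{"@id": "a"}, "noise"]) == [{"@id": "a"}]   # bare list
assert flatten({"@graph": [{"@id": "a"}]}) == [{"@id": "a"}]  # graph wrapper
assert flatten({"@id": "a"}) == [{"@id": "a"}]              # single node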
+ def ensure_no_blank_nodes(graph: Graph) -> None:
+     offenders: list[tuple[Identifier, Identifier, Identifier]] = []
+     for subj, pred, obj in graph:
+         if isinstance(subj, BNode) or isinstance(obj, BNode):
+             offenders.append((subj, pred, obj))
+             if len(offenders) >= 5:
+                 break
+     if offenders:
+         sample = "; ".join(f"{s} {p} {o}" for s, p, o in offenders)
+         raise RuntimeError(
+             "Blank nodes are not allowed in RDF output. "
+             f"Found {len(offenders)} sample triples with blank nodes: {sample}"
+         )
+
+
+ def _collect_jsonld_nodes(data: Any) -> list[dict[str, Any]]:
+     nodes: list[dict[str, Any]] = []
+
+     def _walk(value: Any) -> None:
+         if isinstance(value, dict):
+             if _is_jsonld_node(value):
+                 nodes.append(value)
+             for child in value.values():
+                 _walk(child)
+         elif isinstance(value, list):
+             for item in value:
+                 _walk(item)
+
+     _walk(data)
+     return nodes
+
+
+ def _strip_iri_suffix(value: str) -> str:
+     return value[:-4] if value.endswith("~iri") else value
+
+
+ def _normalize_iri_suffixes(data: Any) -> Any:
+     if isinstance(data, dict):
+         out: dict[str, Any] = {}
+         for key, value in data.items():
+             if key == "@id" and isinstance(value, str):
+                 out[key] = _strip_iri_suffix(value)
+             else:
+                 out[key] = _normalize_iri_suffixes(value)
+         return out
+     if isinstance(data, list):
+         return [_normalize_iri_suffixes(item) for item in data]
+     return data
+
+
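# Editor's sketch (not part of the diff): stripping the "~iri" marker that
# the mappings above attach to @id values.
def strip_iri(value: str) -> str:
    return value[:-4] if value.endswith("~iri") else value

assert strip_iri("http://example.com/author~iri") == "http://example.com/author"
assert strip_iri("http://example.com/author") == "http://example.com/author"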
+ def _xpath_first_text(doc: Any, xpath: str) -> str | None:
+     def _eval(path: str) -> list[Any]:
+         return doc.xpath(path)
+
+     try:
+         result = _eval(xpath)
+     except Exception:
+         return None
+     if not result:
+         relaxed = _relax_xpath(xpath)
+         if relaxed != xpath:
+             try:
+                 result = _eval(relaxed)
+             except Exception:
+                 return None
+         if not result:
+             return None
+     for item in result:
+         if isinstance(item, str):
+             text = item.strip()
+         elif hasattr(item, "text_content"):
+             text = item.text_content().strip()
+         else:
+             text = str(item).strip()
+         if text:
+             return text
+     return None
+
+
+ def _relax_xpath(value: str) -> str:
+     relaxed = value
+     relaxed = re.sub(r"@class=\"([^\"]+)\"", r'contains(@class, "\1")', relaxed)
+     relaxed = re.sub(r"@id=\"([^\"]+)\"", r'contains(@id, "\1")', relaxed)
+     relaxed = relaxed.replace("//div[", "//*[")
+     relaxed = relaxed.replace("/div[", "/*[")
+     relaxed = relaxed.replace("//p[", "//*[")
+     relaxed = relaxed.replace("/p[", "/*[")
+     return relaxed
+
+
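# Editor's sketch (not part of the diff): what the relaxation above produces
# for a typical strict selector, mirrored standalone.
import re

def relax(value: str) -> str:
    value = re.sub(r"@class=\"([^\"]+)\"", r'contains(@class, "\1")', value)
    value = re.sub(r"@id=\"([^\"]+)\"", r'contains(@id, "\1")', value)
    for strict, loose in (
        ("//div[", "//*["), ("/div[", "/*["), ("//p[", "//*["), ("/p[", "/*["),
    ):
        value = value.replace(strict, loose)
    return value

print(relax('//div[@class="rating-box"]/span'))
# //*[contains(@class, "rating-box")]/span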
+ def _extract_list_items(doc: Any, xpaths: list[str]) -> list[str]:
+     items: list[str] = []
+     seen: set[str] = set()
+     for path in xpaths:
+         try:
+             results = doc.xpath(path)
+         except Exception:
+             continue
+         for item in results:
+             if hasattr(item, "text_content"):
+                 text = item.text_content().strip()
+             else:
+                 text = str(item).strip()
+             if text:
+                 if text in seen:
+                     continue
+                 seen.add(text)
+                 items.append(text)
+     return items
+
+
+ def _build_item_list(items: list[str]) -> dict[str, Any]:
+     entries = []
+     for idx, name in enumerate(items, start=1):
+         entries.append(
+             {
+                 "@type": "ListItem",
+                 "position": idx,
+                 "name": name,
+             }
+         )
+     return {"@type": "ItemList", "itemListElement": entries}
+
+
+ def _is_item_list_value(value: Any) -> bool:
+     if not isinstance(value, list) or not value:
+         return False
+     item = value[0]
+     if not isinstance(item, dict):
+         return False
+     return normalize_type(item.get("@type")) == "ItemList"
+
+
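# Editor's sketch (not part of the diff): the ItemList shape built above,
# from a plain list of strings.
names = ["Preheat the oven", "Mix the batter", "Bake"]
item_list = {
    "@type": "ItemList",
    "itemListElement": [
        {"@type": "ListItem", "position": i, "name": n}
        for i, n in enumerate(names, start=1)
    ],
}
print(item_list["itemListElement"][0])
# {'@type': 'ListItem', 'position': 1, 'name': 'Preheat the oven'}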
+ def _extract_rating_number(text: str | None) -> str | None:
+     if not text:
+         return None
+     match = re.search(r"-?\d+(?:\.\d+)?", text)
+     if not match:
+         return None
+     try:
+         value = float(match.group(0))
+     except ValueError:
+         return None
+     if value < 0 or value > 5:
+         return None
+     return match.group(0)
+
+
+ def _extract_rating_value(doc: Any) -> str | None:
+     candidates = [
+         "//*[@itemprop='ratingValue']/text()",
+         "//*[@data-rating]/@data-rating",
+         "//*[contains(@class, 'rating')]/text()",
+         "//*[contains(@class, 'Rating')]/text()",
+         "//*[contains(@id, 'rating')]/text()",
+         "//*[contains(@id, 'Rating')]/text()",
+         "//*[contains(@aria-label, 'rating')]/@aria-label",
+         "//*[contains(@aria-label, 'star')]/@aria-label",
+     ]
+     for xpath in candidates:
+         text = _xpath_first_text(doc, xpath)
+         value = _extract_rating_number(text)
+         if value is not None:
+             return value
+     return None
+
+
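# Editor's sketch (not part of the diff): the number pattern above in action
# (note the single-backslash \d and \. in the raw string; the doubled form
# would match a literal backslash and never find a digit). Values outside
# the 0-5 star range are rejected.
import re

def rating_number(text: str) -> str | None:
    match = re.search(r"-?\d+(?:\.\d+)?", text)
    if not match:
        return None
    value = float(match.group(0))
    return match.group(0) if 0 <= value <= 5 else None

assert rating_number("Rated 4.5 out of 5 stars") == "4.5"
assert rating_number("Score: 87/100") is None  # 87 is outside the 0-5 range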
+ def enrich_graph_from_xhtml(graph: Graph, xhtml: str, url: str | None = None) -> None:
+     try:
+         from lxml import html as lxml_html
+     except Exception:
+         return
+     parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
+     try:
+         doc = lxml_html.document_fromstring(xhtml, parser=parser)
+     except Exception:
+         return
+
+     schema = Namespace(f"{_SCHEMA_BASE}/")
+     review_type = URIRef(f"{_SCHEMA_BASE}/Review")
+     review_nodes = list(graph.subjects(RDF.type, review_type))
+     if not review_nodes:
+         return
+
+     title = (
+         _xpath_first_text(doc, '/html/head/meta[@property="og:title"]/@content')
+         or _xpath_first_text(doc, "/html/head/title/text()")
+         or _xpath_first_text(doc, "//h1[1]")
+     )
+     description = _xpath_first_text(
+         doc, '/html/head/meta[@property="og:description"]/@content'
+     ) or _xpath_first_text(doc, '/html/head/meta[@name="description"]/@content')
+     author_name = (
+         _xpath_first_text(doc, '/html/head/meta[@name="author"]/@content')
+         or _xpath_first_text(doc, '/html/head/meta[@property="author"]/@content')
+         or _xpath_first_text(
+             doc, '/html/head/meta[@property="article:author"]/@content'
+         )
+     )
+     item_name = _xpath_first_text(doc, "//figure//img/@alt") or _xpath_first_text(
+         doc, "//h1[1]"
+     )
+
+     for review in review_nodes:
+         if url and graph.value(review, schema.url) is None:
+             graph.add((review, schema.url, Literal(url)))
+         if title and graph.value(review, schema.name) is None:
+             graph.add((review, schema.name, Literal(title)))
+         if description and graph.value(review, schema.description) is None:
+             graph.add((review, schema.description, Literal(description)))
+
+         author = graph.value(review, schema.author)
+         if (
+             author is not None
+             and author_name
+             and graph.value(author, schema.name) is None
+         ):
+             graph.add((author, schema.name, Literal(author_name)))
+
+         item = graph.value(review, schema.itemReviewed)
+         if item is not None and item_name and graph.value(item, schema.name) is None:
+             graph.add((item, schema.name, Literal(item_name)))
+
+         rating = graph.value(review, schema.reviewRating)
+         if rating is not None and graph.value(rating, schema.ratingValue) is None:
+             rating_value = _extract_rating_value(doc)
+             if rating_value:
+                 graph.add((rating, schema.ratingValue, Literal(rating_value)))
+
+
+ def _fill_jsonld_from_mappings(
+     data: dict[str, Any] | list[Any],
+     mappings: list[dict[str, Any]],
+     xhtml: str,
+ ) -> dict[str, Any] | list[Any]:
+     try:
+         from lxml import html as lxml_html
+     except Exception:
+         return data
+     parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
+     try:
+         doc = lxml_html.document_fromstring(xhtml, parser=parser)
+     except Exception:
+         return data
+
+     nodes = _flatten_jsonld(data)
+     node_by_id: dict[str, dict[str, Any]] = {
+         str(node.get("@id")): node
+         for node in nodes
+         if isinstance(node, dict) and node.get("@id")
+     }
+     for node_id, node in list(node_by_id.items()):
+         if node_id.endswith("~iri"):
+             node_by_id.setdefault(node_id[: -len("~iri")], node)
+
+     def _author_url_fallback() -> str | None:
+         return _xpath_first_text(doc, "/html/head/link[@rel='author']/@href")
+
+     author_url_fallback = _author_url_fallback()
+
+     for mapping in mappings:
+         name = mapping.get("name")
+         if not name:
+             continue
+         node_id = f"http://example.com/{name}~iri"
+         node = node_by_id.get(node_id) or node_by_id.get(f"http://example.com/{name}")
+         if node is None:
+             continue
+         for prop, obj in mapping.get("props", []):
+             prop_name = prop[7:] if prop.startswith("schema:") else prop
+             if prop_name in {"a", "url"}:
+                 continue
+             full_prop = f"{_SCHEMA_BASE}/{prop_name}"
+             if full_prop in node:
+                 continue
+             if not obj:
+                 continue
+             if obj.startswith("ex:") and obj.endswith("~iri"):
+                 target = obj.split("ex:", 1)[1].split("~", 1)[0]
+                 node[full_prop] = [{"@id": f"http://example.com/{target}"}]
+                 continue
+             if _looks_like_xpath(obj):
+                 xpath = _normalize_xpath_reference(_simplify_xpath(obj))
+                 text = _xpath_first_text(doc, xpath)
+                 if text:
+                     if prop_name in {"ratingValue", "bestRating", "worstRating"}:
+                         match = re.search(r"-?\d+(?:\.\d+)?", text)
+                         if not match:
+                             continue
+                         text = match.group(0)
+                     node[full_prop] = [{"@value": text}]
+                     if prop_name == "name":
+                         node_type = node.get("@type") or []
+                         node_types = {
+                             normalize_type(t) for t in node_type if isinstance(t, str)
+                         }
+                         if node_types & {"Person", "Organization"}:
+                             url_xpath = f"{xpath}/@href"
+                             url_value = _xpath_first_text(doc, url_xpath)
+                             if url_value:
+                                 node[f"{_SCHEMA_BASE}/url"] = [{"@value": url_value}]
+                             elif author_url_fallback:
+                                 node[f"{_SCHEMA_BASE}/url"] = [
+                                     {"@value": author_url_fallback}
+                                 ]
+                 continue
+             if obj:
+                 node[full_prop] = [{"@value": obj}]
+
+     for node in nodes:
+         if not isinstance(node, dict):
+             continue
+         node_types = {
+             normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
+         }
+         if "Review" not in node_types:
+             continue
+         review_rating_prop = f"{_SCHEMA_BASE}/reviewRating"
+         review_rating_pairs: list[tuple[dict[str, Any], dict[str, Any]]] = []
+         for rating_ref in node.get(review_rating_prop, []):
+             if isinstance(rating_ref, dict) and rating_ref.get("@id"):
+                 rating_node = node_by_id.get(str(rating_ref["@id"]))
+                 if isinstance(rating_node, dict):
+                     review_rating_pairs.append((rating_ref, rating_node))
+         valid_rating_refs: list[dict[str, Any]] = []
+         for rating_ref, rating_node in review_rating_pairs:
+             rating_value_key = f"{_SCHEMA_BASE}/ratingValue"
+             if rating_value_key not in rating_node:
+                 rating_value = _extract_rating_value(doc)
+                 if rating_value:
+                     rating_node[rating_value_key] = [{"@value": rating_value}]
+             if rating_value_key in rating_node:
+                 valid_rating_refs.append(rating_ref)
+         if review_rating_pairs:
+             if valid_rating_refs:
+                 node[review_rating_prop] = valid_rating_refs
+             else:
+                 node.pop(review_rating_prop, None)
+         if f"{_SCHEMA_BASE}/description" not in node:
+             description = _xpath_first_text(
+                 doc, '/html/head/meta[@property="og:description"]/@content'
+             ) or _xpath_first_text(doc, '/html/head/meta[@name="description"]/@content')
+             if description:
+                 node[f"{_SCHEMA_BASE}/description"] = [{"@value": description}]
+     return data
+
+
+ def _extract_type(node: dict[str, Any]) -> str | None:
+     raw = node.get("@type")
+     if isinstance(raw, list) and raw:
+         raw = raw[0]
+     if isinstance(raw, str):
+         return normalize_type(raw)
+     return None
+
+
+ def _extract_name(node: dict[str, Any]) -> str | None:
+     for key in ("name", "headline", "title"):
+         value = node.get(key)
+         if isinstance(value, str) and value.strip():
+             return value.strip()
+     return None
+
+
+ def _extract_text_value(value: Any) -> str | None:
+     if isinstance(value, str) and value.strip():
+         return value.strip()
+     if isinstance(value, dict):
+         raw = value.get("@value") or value.get("@id")
+         if isinstance(raw, str) and raw.strip():
+             return raw.strip()
+     if isinstance(value, list):
+         for item in value:
+             text = _extract_text_value(item)
+             if text:
+                 return text
+     return None
+
+
+ def _extract_name_any(node: dict[str, Any]) -> str | None:
+     name = _extract_name(node)
+     if name:
+         return name
+     for key in (
+         f"{_SCHEMA_BASE}/name",
+         f"{_SCHEMA_BASE}/headline",
+         f"{_SCHEMA_BASE}/title",
+     ):
+         value = _extract_text_value(node.get(key))
+         if value:
+             return value
+     return None
+
+
+ def _extract_url_any(node: dict[str, Any]) -> str | None:
+     value = node.get("url")
+     text = _extract_text_value(value)
+     if text:
+         return text
+     value = node.get(f"{_SCHEMA_BASE}/url")
+     text = _extract_text_value(value)
+     if text:
+         return text
+     return None
+
+
+ def _local_prop_name(name: str) -> str:
+     if name.startswith(_SCHEMA_BASE):
+         return name.rsplit("/", 1)[-1]
+     if name.startswith("schema:"):
+         return name.split(":", 1)[-1]
+     return name
+
+
+ _INDEPENDENT_PROPERTIES = {
+     "author",
+     "creator",
+     "publisher",
+     "editor",
+     "contributor",
+     "copyrightHolder",
+     "brand",
+     "manufacturer",
+     "provider",
+     "seller",
+     "organizer",
+     "performer",
+     "actor",
+     "director",
+     "producer",
+     "member",
+     "memberOf",
+     "affiliation",
+     "parentOrganization",
+     "subOrganization",
+     "alumniOf",
+     "sponsor",
+     "about",
+     "mentions",
+     "mainEntity",
+     "mainEntityOfPage",
+     "isPartOf",
+     "partOfSeries",
+     "location",
+     "areaServed",
+ }
+
+
+ def _is_jsonld_node(node: dict[str, Any]) -> bool:
+     if "@type" in node:
+         return True
+     return any(isinstance(key, str) and key.startswith(_SCHEMA_BASE) for key in node)
+
+
+ def _ensure_node_ids(
+     data: dict[str, Any] | list[Any],
+     dataset_uri: str,
+     url: str,
+ ) -> None:
+     seen: set[str] = set()
+     replacements: dict[str, str] = {}
+
+     def _collect(value: Any) -> None:
+         if isinstance(value, dict):
+             node_id = value.get("@id")
+             if isinstance(node_id, str) and node_id and not node_id.startswith("_:"):
+                 seen.add(node_id)
+             for child in value.values():
+                 _collect(child)
+         elif isinstance(value, list):
+             for item in value:
+                 _collect(item)
+
+     def _assign(
+         value: Any,
+         counter: list[int],
+         parent_id: str | None = None,
+         prop_name: str | None = None,
+     ) -> None:
+         if isinstance(value, dict):
+             if _is_jsonld_node(value):
+                 node_id = value.get("@id")
+                 local_prop = _local_prop_name(prop_name or "")
+                 use_parent = bool(
+                     parent_id
+                     and local_prop
+                     and local_prop not in _INDEPENDENT_PROPERTIES
+                 )
+                 base_uri = parent_id if use_parent else dataset_uri
+                 needs_id = (
+                     not isinstance(node_id, str)
+                     or not node_id
+                     or node_id.startswith("_:")
+                     or (use_parent and not node_id.startswith(parent_id or ""))
+                     or (not use_parent and not node_id.startswith(dataset_uri))
+                 )
+                 if needs_id:
+                     type_name = _extract_type(value) or "Thing"
+                     name = _extract_name_any(value)
+                     if type_name == "ListItem":
+                         position = _extract_text_value(
+                             value.get("position")
+                         ) or _extract_text_value(value.get(f"{_SCHEMA_BASE}/position"))
+                         if position:
+                             name = f"item-{position}"
+                     if not name:
+                         name = _dash_type(type_name)
+                     node_url = _extract_url_any(value)
+                     base_id = build_id_base(
+                         base_uri, type_name, name, node_url, counter[0]
+                     )
+                     candidate = base_id
+                     suffix = 1
+                     while candidate in seen:
+                         suffix += 1
+                         candidate = f"{base_id}-{suffix}"
+                     if isinstance(node_id, str) and node_id and node_id != candidate:
+                         replacements[node_id] = candidate
+                         if node_id.endswith("~iri"):
+                             replacements[node_id[: -len("~iri")]] = candidate
+                         else:
+                             replacements[f"{node_id}~iri"] = candidate
+                     value["@id"] = candidate
+                     seen.add(candidate)
+                     counter[0] += 1
+             current_id = (
+                 value.get("@id") if isinstance(value.get("@id"), str) else parent_id
+             )
+             for key, child in value.items():
+                 if key in ("@id", "@type"):
+                     continue
+                 _assign(
+                     child,
+                     counter,
+                     current_id if isinstance(current_id, str) else None,
+                     key,
+                 )
+         elif isinstance(value, list):
+             for item in value:
+                 _assign(item, counter, parent_id, prop_name)
+
+     def _replace(value: Any) -> None:
+         if isinstance(value, dict):
+             node_id = value.get("@id")
+             if isinstance(node_id, str) and node_id in replacements:
+                 value["@id"] = replacements[node_id]
+             for child in value.values():
+                 _replace(child)
+         elif isinstance(value, list):
+             for item in value:
+                 _replace(item)
+
+     _collect(data)
+     _assign(data, [1])
+     if replacements:
+         _replace(data)
+
+
+ def _blank_node_errors(data: dict[str, Any] | list[Any]) -> list[str]:
+     errors: list[str] = []
+
+     def _walk(value: Any) -> None:
+         if isinstance(value, dict):
+             if _is_jsonld_node(value):
+                 node_id = value.get("@id")
+                 if (
+                     not isinstance(node_id, str)
+                     or not node_id
+                     or node_id.startswith("_:")
+                 ):
+                     errors.append(
+                         "JSON-LD node missing @id or uses a blank node identifier"
+                     )
+             for child in value.values():
+                 _walk(child)
+         elif isinstance(value, list):
+             for item in value:
+                 _walk(item)
+
+     _walk(data)
+     return errors
+
+
+ def _review_rating_dropped(
+     data: dict[str, Any] | list[Any],
+     mappings: list[dict[str, Any]],
+     target_type: str | None,
+ ) -> bool:
+     target = normalize_type(target_type or "Thing")
+     if target != "Review":
+         return False
+     mapped_props = _main_mapping_props(mappings)
+     if "reviewRating" not in mapped_props:
+         return False
+     nodes = _flatten_jsonld(data)
+     for node in nodes:
+         if not isinstance(node, dict):
+             continue
+         node_types = {
+             normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
+         }
+         if "Review" in node_types:
+             return f"{_SCHEMA_BASE}/reviewRating" not in node
+     return False
+
+
+ def _build_id_map(
+     nodes: list[dict[str, Any]],
+     dataset_uri: str,
+     url: str,
+ ) -> dict[str, str]:
+     id_map: dict[str, str] = {}
+     for idx, node in enumerate(nodes):
+         old_id = node.get("@id") or f"_:b{idx}"
+         type_name = _extract_type(node) or "Thing"
+         name = _extract_name_any(node) or _dash_type(type_name)
+         node_url = _extract_url_any(node)
+         if isinstance(old_id, str) and old_id.startswith(dataset_uri):
+             new_id = old_id
+         else:
+             new_id = build_id(dataset_uri, type_name, name, node_url, idx + 1)
+         if old_id in id_map:
+             new_id = f"{new_id}-{idx}"
+         id_map[str(old_id)] = new_id
+     return id_map
+
+
+ def _rewrite_refs(
+     value: Any,
+     id_map: dict[str, str],
+     node_map: dict[str, dict[str, Any]],
+     *,
+     embed_nodes: bool,
+ ) -> Any:
+     if isinstance(value, dict):
+         if "@id" in value and isinstance(value["@id"], str):
+             ref_id = id_map.get(value["@id"], value["@id"])
+             if embed_nodes and ref_id in node_map:
+                 return node_map[ref_id]
+             return {"@id": ref_id}
+         return {
+             k: _rewrite_refs(v, id_map, node_map, embed_nodes=embed_nodes)
+             for k, v in value.items()
+         }
+     if isinstance(value, list):
+         return [
+             _rewrite_refs(item, id_map, node_map, embed_nodes=embed_nodes)
+             for item in value
+         ]
+     return value
+
+
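# Editor's sketch (not part of the diff): rewriting @id references with an
# id map, with and without embedding the target node, mirrored standalone.
id_map = {"_:b0": "https://ex.org/person/jane"}
node_map = {
    "https://ex.org/person/jane": {"@id": "https://ex.org/person/jane", "name": "Jane"}
}

def rewrite(value, *, embed):
    if isinstance(value, dict) and isinstance(value.get("@id"), str):
        ref = id_map.get(value["@id"], value["@id"])
        return node_map[ref] if embed and ref in node_map else {"@id": ref}
    return value

print(rewrite({"@id": "_:b0"}, embed=False))  # {'@id': 'https://ex.org/person/jane'}
print(rewrite({"@id": "_:b0"}, embed=True))   # the full embedded node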
+ def _main_mapping_props(mappings: list[dict[str, Any]]) -> set[str]:
+     schema_props = _schema_property_set()
+     for mapping in mappings:
+         if mapping.get("__main__"):
+             props = mapping.get("props") or []
+             clean: set[str] = set()
+             for prop, _ in props:
+                 if not isinstance(prop, str):
+                     continue
+                 name = prop[7:] if prop.startswith("schema:") else prop
+                 if "~" in name or "http" in name or name == "a":
+                     continue
+                 base = name.split(".", 1)[0]
+                 if base in schema_props or name in schema_props:
+                     clean.add(base)
+             return clean
+     return set()
+
+
+ def _missing_required_props(
+     required_props: list[str],
+     mapped_props: set[str],
+ ) -> list[str]:
+     missing: set[str] = set()
+     mapped_base = {prop.split(".", 1)[0] for prop in mapped_props}
+     for prop in required_props:
+         base = prop.split(".", 1)[0]
+         if base not in mapped_base:
+             missing.add(prop)
+     return sorted(missing)
+
+
+ def _missing_recommended_props(
+     recommended_props: list[str],
+     mapped_props: set[str],
+ ) -> list[str]:
+     missing: set[str] = set()
+     mapped_base = {prop.split(".", 1)[0] for prop in mapped_props}
+     for prop in recommended_props:
+         base = prop.split(".", 1)[0]
+         if base not in mapped_base:
+             missing.add(prop)
+     return sorted(missing)
+
+
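# Editor's sketch (not part of the diff): the dotted-path comparison used by
# the two checks above — "author.name" counts as mapped once "author" is.
required = ["author.name", "reviewRating.ratingValue", "itemReviewed"]
mapped = {"author", "reviewRating"}

missing = sorted(
    prop
    for prop in required
    if prop.split(".", 1)[0] not in {m.split(".", 1)[0] for m in mapped}
)
print(missing)  # ['itemReviewed']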
+ def _google_allowed_properties(
+     property_guides: dict[str, dict[str, list[str]]],
+ ) -> dict[str, list[str]]:
+     allowed: dict[str, list[str]] = {}
+     for type_name, guide in property_guides.items():
+         props = set(guide.get("required", [])) | set(guide.get("recommended", []))
+         if type_name == "Review":
+             props |= _REVIEW_OPTIONAL_EXTRAS
+         allowed[type_name] = sorted(props)
+     return allowed
+
+
+ def _mapping_allowed_property_set(
+     property_guides: dict[str, dict[str, list[str]]],
+ ) -> set[str]:
+     props: set[str] = set()
+     for guide in property_guides.values():
+         for name in guide.get("required", []) + guide.get("recommended", []):
+             props.add(name.split(".", 1)[0])
+     props |= _REVIEW_OPTIONAL_EXTRAS
+     return props
+
+
+ def _mapping_violations(
+     mappings: list[dict[str, Any]],
+     allowed_props: set[str],
+     target_type: str,
+ ) -> list[str]:
+     errors: list[str] = []
+     for mapping in mappings:
+         map_name = mapping.get("name", "mapping")
+         map_type = normalize_type(mapping.get("type") or "")
+         for prop, obj in mapping.get("props", []):
+             prop_name = prop[7:] if prop.startswith("schema:") else prop
+             base = prop_name.split(".", 1)[0]
+             if base in {"a", "url"}:
+                 continue
+             if "/" in base or base.startswith("http"):
+                 continue
+             if base not in allowed_props:
+                 errors.append(f"{map_name}: property not allowed by Google: {base}")
+             if _looks_like_xpath(obj):
+                 continue
+             if (
+                 base in {"author", "reviewRating", "itemReviewed"}
+                 and obj.startswith("ex:")
+                 and obj.endswith("~iri")
+             ):
+                 continue
+             if obj.startswith("ex:") and obj.endswith("~iri"):
+                 continue
+             errors.append(f"{map_name}: hard-coded literal for {base} is not allowed")
+         if map_type == "Review":
+             review_rating = [
+                 obj
+                 for prop, obj in mapping.get("props", [])
+                 if prop.endswith("reviewRating")
+             ]
+             for obj in review_rating:
+                 if not (obj.startswith("ex:") and obj.endswith("~iri")):
+                     errors.append(f"{map_name}: reviewRating must map to a Rating node")
+     return errors
+
+
+ def _xpath_evidence_errors(
+     mappings: list[dict[str, Any]],
+     xhtml: str,
+ ) -> list[str]:
+     try:
+         from lxml import html as lxml_html
+     except Exception:
+         return []
+     parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
+     try:
+         doc = lxml_html.document_fromstring(xhtml, parser=parser)
+     except Exception:
+         return []
+     errors: list[str] = []
+     for mapping in mappings:
+         map_name = mapping.get("name", "mapping")
+         for prop, obj in mapping.get("props", []):
+             prop_name = prop[7:] if prop.startswith("schema:") else prop
+             if prop_name == "url":
+                 continue
+             if not _looks_like_xpath(obj):
+                 continue
+             try:
+                 result = doc.xpath(_simplify_xpath(obj))
+             except Exception:
+                 errors.append(f"{map_name}: invalid XPath for {prop_name}")
+                 continue
+             if not result:
+                 errors.append(f"{map_name}: XPath returned no results for {prop_name}")
+                 continue
+             if isinstance(result, list) and all(
+                 (isinstance(item, str) and not item.strip()) for item in result
+             ):
+                 errors.append(f"{map_name}: XPath returned empty text for {prop_name}")
+     return errors
+
+
+ def _xpath_reusability_warnings(mappings: list[dict[str, Any]]) -> list[str]:
+     warnings: list[str] = []
+     id_with_digits = re.compile(r"@id\s*=\s*['\"][^'\"]*\d[^'\"]*['\"]")
+     for mapping in mappings:
+         map_name = mapping.get("name", "mapping")
+         for prop, obj in mapping.get("props", []):
+             prop_name = prop[7:] if prop.startswith("schema:") else prop
+             if not _looks_like_xpath(obj):
+                 continue
+             candidate = obj.replace('\\"', '"').replace("\\'", "'")
+             if id_with_digits.search(candidate):
+                 warnings.append(
+                     f"{map_name}: XPath for {prop_name} uses a numeric @id; prefer a reusable selector."
+                 )
+     return warnings
+
+
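# Editor's sketch (not part of the diff): the pattern above (with single
# backslashes, so \s and \d actually match whitespace and digits) flagging
# selectors that pin a numeric, likely page-specific @id.
import re

id_with_digits = re.compile(r"@id\s*=\s*['\"][^'\"]*\d[^'\"]*['\"]")
assert id_with_digits.search("//div[@id='post-91832']//h1")
assert not id_with_digits.search("//div[@id='main-title']//h1")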
+ def _mapping_type_sanity(
+     mappings: list[dict[str, Any]],
+     expected_types: dict[str, tuple[str, ...]],
+ ) -> list[str]:
+     errors: list[str] = []
+     mapping_types = {
+         m.get("name"): normalize_type(m.get("type") or "") for m in mappings
+     }
+     for mapping in mappings:
+         map_name = mapping.get("name")
+         for prop, obj in mapping.get("props", []):
+             prop_name = prop[7:] if prop.startswith("schema:") else prop
+             if obj.startswith("ex:") and obj.endswith("~iri"):
+                 target = obj.split("ex:", 1)[1].split("~", 1)[0]
+                 expected = expected_types.get(prop_name)
+                 if expected:
+                     actual = mapping_types.get(target, "")
+                     if actual and actual not in expected:
+                         errors.append(
+                             f"{map_name}: {prop_name} must map to {expected}, got {actual}"
+                         )
+     return errors
+
+
+ def _format_result_path(value: Identifier | None) -> str:
+     if value is None:
+         return "unknown"
+     if isinstance(value, URIRef):
+         return _short_schema_name(value) or str(value)
+     return str(value)
+
+
+ def _validation_messages(
+     result: ValidationResult, max_items: int = 20
+ ) -> tuple[list[str], list[str]]:
+     errors: list[str] = []
+     warnings: list[str] = []
+     count = 0
+     for res in result.report_graph.subjects(RDF.type, _SH.ValidationResult):
+         severity = result.report_graph.value(res, _SH.resultSeverity)
+         message = result.report_graph.value(res, _SH.resultMessage)
+         path = result.report_graph.value(res, _SH.resultPath)
+         source_shape = result.report_graph.value(res, _SH.sourceShape)
+         source_label = result.shape_source_map.get(source_shape, "unknown")
+         line = f"{_format_result_path(path)}: {message or 'validation error'} (shape: {source_label})"
+         if severity == _SH.Warning:
+             warnings.append(line)
+         else:
+             errors.append(line)
+         count += 1
+         if count >= max_items:
+             break
+     return errors, warnings
+
+
+ def _validation_messages_for_types(
+     result: ValidationResult,
+     allowed_types: set[str],
+     max_items: int = 20,
+ ) -> tuple[list[str], list[str]]:
+     errors: list[str] = []
+     warnings: list[str] = []
+     count = 0
+     for res in result.report_graph.subjects(RDF.type, _SH.ValidationResult):
+         severity = result.report_graph.value(res, _SH.resultSeverity)
+         message = result.report_graph.value(res, _SH.resultMessage)
+         path = result.report_graph.value(res, _SH.resultPath)
+         source_shape = result.report_graph.value(res, _SH.sourceShape)
+         source_label = result.shape_source_map.get(source_shape, "unknown")
+         focus = result.report_graph.value(res, _SH.focusNode)
+         focus_types: set[str] = set()
+         if focus is not None:
+             for t in result.data_graph.objects(focus, RDF.type):
+                 if isinstance(t, URIRef):
+                     focus_types.add(normalize_type(str(t)))
+         relevant = not focus_types or bool(focus_types & allowed_types)
+         line = f"{_format_result_path(path)}: {message or 'validation error'} (shape: {source_label})"
+         if severity == _SH.Warning or not relevant:
+             warnings.append(line)
+         else:
+             errors.append(line)
+         count += 1
+         if count >= max_items:
+             break
+     return errors, warnings
+
+
+ def normalize_jsonld(
+     data: dict[str, Any] | list[Any],
+     dataset_uri: str,
+     url: str,
+     target_type: str | None,
+     *,
+     embed_nodes: bool = True,
+ ) -> dict[str, Any]:
+     data = _normalize_iri_suffixes(data)
+     nodes = _collect_jsonld_nodes(data)
+     if not nodes:
+         raise RuntimeError("No JSON-LD nodes produced by morph-kgc.")
+
+     target = normalize_type(target_type) if target_type else None
+     main_node: dict[str, Any] | None = None
+     main_old_id = "_:b0"
+     for idx, node in enumerate(nodes):
+         node_type = _extract_type(node)
+         if target and node_type == target:
+             main_node = node
+             main_old_id = str(node.get("@id") or f"_:b{idx}")
+             break
+     if main_node is None:
+         main_node = nodes[0]
+         main_old_id = str(main_node.get("@id") or "_:b0")
+
+     id_map = _build_id_map(nodes, dataset_uri, url)
+     node_map: dict[str, dict[str, Any]] = {}
+     for idx, node in enumerate(nodes):
+         old_id = str(node.get("@id") or f"_:b{idx}")
+         new_id = id_map[old_id]
+         node["@id"] = new_id
+         node_map[new_id] = node
+
+     for node in nodes:
+         for key, value in list(node.items()):
+             if key in ("@id", "@type"):
+                 continue
+             node[key] = _rewrite_refs(value, id_map, node_map, embed_nodes=embed_nodes)
+
+     main_id = id_map.get(main_old_id)
+     if not main_id:
+         raise RuntimeError("Failed to resolve main node @id.")
+     if embed_nodes:
+         main = node_map[main_id]
+         main["@context"] = _SCHEMA_BASE
+         blank_nodes = _blank_node_errors(main)
+         if blank_nodes:
+             raise RuntimeError("Blank nodes are not allowed in JSON-LD output.")
+         return main
+     for node in node_map.values():
+         node.setdefault("@context", _SCHEMA_BASE)
+     graph = {"@context": _SCHEMA_BASE, "@graph": list(node_map.values())}
+     blank_nodes = _blank_node_errors(graph)
+     if blank_nodes:
+         raise RuntimeError("Blank nodes are not allowed in JSON-LD output.")
+     return graph
+
+
2585
+ def generate_from_agent(
2586
+ url: str,
2587
+ html: str,
2588
+ xhtml: str,
2589
+ cleaned_xhtml: str,
2590
+ api_key: str,
2591
+ dataset_uri: str,
2592
+ target_type: str | None,
2593
+ workdir: Path,
2594
+ debug: bool = False,
2595
+ max_retries: int = 2,
2596
+ max_nesting_depth: int = 2,
2597
+ quality_check: bool = True,
2598
+ log: Callable[[str], None] | None = None,
2599
+ ) -> tuple[str, dict[str, Any]]:
2600
+ debug_path = workdir / "agent_debug.json" if debug else None
2601
+ target_name = normalize_type(target_type or "Thing")
2602
+ property_guides = property_guides_with_related(target_name, max_nesting_depth)
2603
+ allowed_properties = _google_allowed_properties(property_guides)
2604
+ allowed_property_set = _mapping_allowed_property_set(property_guides)
2605
+ workdir.mkdir(parents=True, exist_ok=True)
2606
+ requirements_path = workdir / "requirements.json"
2607
+ requirements_path.write_text(
2608
+ json.dumps(
2609
+ {
2610
+ "target_type": target_name,
2611
+ "max_depth": max_nesting_depth,
2612
+ "types": property_guides,
2613
+ "allowed_properties": allowed_properties,
2614
+ },
2615
+ indent=2,
2616
+ )
2617
+ )
2618
+ html_path = (workdir / "rendered.html").resolve()
2619
+ html_path.write_text(html)
2620
+ xhtml_path = (workdir / "page.xhtml").resolve()
2621
+ xhtml_path.write_text(xhtml)
2622
+ cleaned_path = (workdir / "page.cleaned.xhtml").resolve()
2623
+ cleaned_path.write_text(cleaned_xhtml)
2624
+
2625
+ shape_specs = shape_specs_for_types(list(property_guides.keys()))
2626
+ mapping_validation_path = workdir / "mapping.validation.json"
2627
+ mapping_jsonld_path = workdir / "mapping.jsonld"
2628
+
2629
+ yarrml = ""
2630
+ mappings: list[dict[str, Any]] = []
2631
+ missing_required: list[str] = []
2632
+ previous_yarrml: str | None = None
2633
+ validation_errors: list[str] | None = None
2634
+ validation_report: list[str] | None = None
2635
+ missing_recommended: list[str] = []
2636
+ xpath_warnings: list[str] = []
2637
+ quality_feedback: list[str] | None = None
2638
+ quality_score: int | None = None
2639
+ jsonld_raw: dict[str, Any] | list[Any] | None = None
2640
+ normalized_jsonld: dict[str, Any] | None = None
2641
+
2642
+ for attempt in range(max_retries + 1):
2643
+ yarrml = ask_agent_for_yarrml(
2644
+ api_key,
2645
+ url,
2646
+ cleaned_xhtml,
2647
+ target_type,
2648
+ debug=debug,
2649
+ debug_path=debug_path,
2650
+ property_guides=property_guides,
2651
+ missing_required=missing_required if attempt > 0 else None,
2652
+ missing_recommended=missing_recommended if attempt > 0 else None,
2653
+ previous_yarrml=previous_yarrml if attempt > 0 else None,
2654
+ validation_errors=validation_errors if attempt > 0 else None,
2655
+ validation_report=validation_report if attempt > 0 else None,
2656
+ xpath_warnings=xpath_warnings if attempt > 0 else None,
2657
+ allow_properties=allowed_properties,
2658
+ quality_feedback=quality_feedback if attempt > 0 else None,
2659
+ )
2660
+
2661
+ yarrml, mappings = _normalize_agent_yarrml(
2662
+ yarrml,
2663
+ url,
2664
+ cleaned_path.as_posix(),
2665
+ target_type,
2666
+ )
2667
+ yarrml_path = workdir / "mapping.yarrml"
2668
+ rml_path = workdir / "mapping.ttl"
2669
+ yarrml_path.write_text(yarrml)
2670
+
2671
+ try:
2672
+ _run_yarrrml_parser(yarrml_path, rml_path)
2673
+ _ensure_subject_termtype_iri(rml_path)
2674
+ _normalize_reference_formulation(rml_path)
2675
+ jsonld_raw = _materialize_jsonld(rml_path)
2676
+ jsonld_raw = _fill_jsonld_from_mappings(jsonld_raw, mappings, cleaned_xhtml)
2677
+ _ensure_node_ids(jsonld_raw, dataset_uri, url)
2678
+ mapping_jsonld_path.write_text(json.dumps(jsonld_raw, indent=2))
2679
+ normalized_jsonld = postprocess_jsonld(
2680
+ jsonld_raw,
2681
+ mappings,
2682
+ cleaned_xhtml,
2683
+ dataset_uri,
2684
+ url,
2685
+ target_type=target_type,
2686
+ )
2687
+ final_jsonld_path = workdir / "structured-data.jsonld"
2688
+ final_jsonld_path.write_text(json.dumps(normalized_jsonld, indent=2))
2689
+ validation_result = validate_file(
2690
+ str(final_jsonld_path), shape_specs=shape_specs
2691
+ )
2692
+ errors, warnings = _validation_messages_for_types(
2693
+ validation_result,
2694
+ set(property_guides.keys()),
2695
+ )
2696
+ validation_errors = errors or None
2697
+ validation_report = (
2698
+ validation_result.report_text.splitlines()
2699
+ if validation_result
2700
+ else None
2701
+ )
2702
+ except Exception as exc:
2703
+ mapping_validation_path.write_text(
2704
+ json.dumps(
2705
+ {
2706
+ "conforms": False,
2707
+ "warning_count": 0,
2708
+ "errors": [str(exc)],
2709
+ "warnings": [],
2710
+ },
2711
+ indent=2,
2712
+ )
2713
+ )
2714
+ validation_errors = [str(exc)]
2715
+ validation_report = None
2716
+ previous_yarrml = yarrml
2717
+ continue
2718
+
2719
+ mapped_props = _main_mapping_props(mappings)
2720
+ required_props = property_guides.get(target_name, {}).get("required", [])
2721
+ recommended_props = property_guides.get(target_name, {}).get("recommended", [])
2722
+ missing_required = _missing_required_props(required_props, mapped_props)
2723
+ missing_recommended = _missing_recommended_props(
2724
+ recommended_props, mapped_props
2725
+ )
2726
+ mapping_errors: list[str] = []
2727
+ mapping_errors.extend(
2728
+ _mapping_violations(mappings, allowed_property_set, target_name)
2729
+ )
2730
+ evidence_warnings = _xpath_evidence_errors(mappings, cleaned_xhtml)
2731
+ reusability_warnings = _xpath_reusability_warnings(mappings)
2732
+ expected_types = {
2733
+ "reviewRating": ("Rating",),
2734
+ "author": ("Person", "Organization"),
2735
+ }
2736
+ mapping_errors.extend(_mapping_type_sanity(mappings, expected_types))
2737
+ if reusability_warnings:
2738
+ mapping_errors.extend(reusability_warnings)
2739
+ warnings_out: list[str] = list(warnings) if "warnings" in locals() else []
2740
+ if evidence_warnings:
2741
+ warnings_out.extend(evidence_warnings)
2742
+ if reusability_warnings:
2743
+ warnings_out.extend(reusability_warnings)
2744
+ if missing_required:
2745
+ warnings_out.append(
2746
+ f"Missing required properties: {', '.join(missing_required)}"
2747
+ )
2748
+ if mapping_errors:
2749
+ mapping_validation_path.write_text(
2750
+ json.dumps(
2751
+ {
2752
+ "conforms": False,
2753
+ "warning_count": validation_result.warning_count
2754
+ if "validation_result" in locals()
2755
+ else 0,
2756
+ "errors": mapping_errors,
2757
+ "warnings": warnings_out,
2758
+ "shacl_errors": errors if "errors" in locals() else [],
2759
+ },
2760
+ indent=2,
2761
+ )
2762
+ )
2763
+ validation_errors = mapping_errors
2764
+ validation_report = None
2765
+ xpath_warnings = warnings_out
2766
+ else:
2767
+ if _review_rating_dropped(jsonld_raw, mappings, target_type):
2768
+ warnings_out.append("Review ratingValue missing; reviewRating dropped.")
2769
+ mapping_validation_path.write_text(
2770
+ json.dumps(
2771
+ {
2772
+ "conforms": validation_result.conforms
2773
+ if "validation_result" in locals()
2774
+ else True,
2775
+ "warning_count": validation_result.warning_count
2776
+ if "validation_result" in locals()
2777
+ else 0,
2778
+ "errors": errors if "errors" in locals() else [],
2779
+ "warnings": warnings_out,
2780
+ },
2781
+ indent=2,
2782
+ )
2783
+ )
2784
+ validation_errors = errors or None
2785
+ validation_report = (
2786
+ validation_result.report_text.splitlines()
2787
+ if validation_result
2788
+ else None
2789
+ )
2790
+ xpath_warnings = warnings_out
2791
+ if quality_check:
2792
+ quality_score = None
2793
+ quality_feedback = None
2794
+ try:
2795
+ quality_payload = ask_agent_for_quality(
2796
+ api_key,
2797
+ url,
2798
+ cleaned_xhtml,
2799
+ normalized_jsonld,
2800
+ property_guides,
2801
+ target_type,
2802
+ )
2803
+ except RuntimeError:
2804
+ quality_payload = None
2805
+ if isinstance(quality_payload, dict):
2806
+ score = quality_payload.get("score")
2807
+ if isinstance(score, (int, float)):
2808
+ quality_score = int(score)
2809
+ missing = quality_payload.get("missing_in_jsonld")
2810
+ notes = quality_payload.get("notes")
2811
+ suggested = quality_payload.get("suggested_xpath")
2812
+ feedback: list[str] = []
2813
+ if isinstance(missing, list) and missing:
2814
+ feedback.append("Missing in JSON-LD (present in XHTML):")
2815
+ feedback.extend([str(item) for item in missing])
2816
+ if isinstance(suggested, dict) and suggested:
2817
+ feedback.append("Suggested XPath for missing properties:")
2818
+ for key, value in suggested.items():
2819
+ feedback.append(f"- {key}: {value}")
2820
+ if isinstance(notes, list) and notes:
2821
+ feedback.append("Notes:")
2822
+ feedback.extend([str(item) for item in notes])
2823
+ if feedback:
2824
+ feedback.append(
2825
+ f"Quality score: {quality_score}"
2826
+ if quality_score is not None
2827
+ else "Quality score unavailable"
2828
+ )
2829
+ quality_feedback = feedback
2830
+ if validation_errors is None and (
2831
+ not missing_required or attempt >= max_retries
2832
+ ):
2833
+ if (
2834
+ not quality_check
2835
+ or quality_score is None
2836
+ or quality_score >= 7
2837
+ or attempt >= max_retries
2838
+ ):
2839
+ break
2840
+ previous_yarrml = yarrml
2841
+
2842
+ if jsonld_raw is None:
2843
+ raise RuntimeError(
2844
+ "Failed to produce JSON-LD from the generated YARRRML mapping."
2845
+ )
2846
+ if validation_errors:
2847
+ logger = logging.getLogger("worai")
2848
+ logger.warning(
2849
+ "YARRRML mapping failed validation after retries; proceeding anyway. "
2850
+ f"See {mapping_validation_path} for details."
2851
+ )
2852
+
2853
+ if normalized_jsonld is None:
2854
+ normalized_jsonld = normalize_jsonld(
2855
+ jsonld_raw, dataset_uri, url, target_type, embed_nodes=False
2856
+ )
2857
+ jsonld = normalized_jsonld
2858
+ return yarrml, jsonld
2859
+
2860
+
+ __all__ = [
+     "StructuredDataOptions",
+     "StructuredDataResult",
+     "build_output_basename",
+     "ensure_no_blank_nodes",
+     "generate_from_agent",
+     "get_dataset_uri",
+     "get_dataset_uri_async",
+     "make_reusable_yarrrml",
+     "materialize_yarrrml_jsonld",
+     "normalize_type",
+     "normalize_yarrrml_mappings",
+     "postprocess_jsonld",
+     "shape_specs_for_type",
+ ]