wordlift-sdk 2.9.1__py3-none-any.whl → 2.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +1 -1
- wordlift_sdk/render/__init__.py +30 -0
- wordlift_sdk/render/browser.py +132 -0
- wordlift_sdk/render/cleanup_options.py +24 -0
- wordlift_sdk/render/html_renderer.py +86 -0
- wordlift_sdk/render/render_options.py +21 -0
- wordlift_sdk/render/rendered_page.py +13 -0
- wordlift_sdk/render/xhtml_cleaner.py +126 -0
- wordlift_sdk/structured_data/__init__.py +27 -0
- wordlift_sdk/structured_data/agent.py +49 -0
- wordlift_sdk/structured_data/agent_generator.py +12 -0
- wordlift_sdk/structured_data/batch.py +220 -0
- wordlift_sdk/structured_data/constants.py +1 -0
- wordlift_sdk/structured_data/dataset_resolver.py +32 -0
- wordlift_sdk/structured_data/debug.py +23 -0
- wordlift_sdk/structured_data/engine.py +2875 -0
- wordlift_sdk/structured_data/inputs.py +58 -0
- wordlift_sdk/structured_data/io.py +44 -0
- wordlift_sdk/structured_data/materialization.py +70 -0
- wordlift_sdk/structured_data/models.py +48 -0
- wordlift_sdk/structured_data/orchestrator.py +194 -0
- wordlift_sdk/structured_data/rendering.py +43 -0
- wordlift_sdk/structured_data/schema_guide.py +17 -0
- wordlift_sdk/structured_data/structured_data_engine.py +58 -0
- wordlift_sdk/structured_data/validation.py +31 -0
- wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
- wordlift_sdk/url_source/__init__.py +7 -2
- wordlift_sdk/validation/__init__.py +7 -0
- wordlift_sdk/validation/generator.py +446 -0
- wordlift_sdk/validation/shacl.py +205 -0
- wordlift_sdk/validation/shacls/__init__.py +1 -0
- wordlift_sdk/validation/shacls/google-article.ttl +148 -0
- wordlift_sdk/validation/shacls/google-book.ttl +660 -0
- wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
- wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
- wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
- wordlift_sdk/validation/shacls/google-course.ttl +43 -0
- wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
- wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
- wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
- wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
- wordlift_sdk/validation/shacls/google-event.ttl +46 -0
- wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
- wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
- wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
- wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
- wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
- wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
- wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
- wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
- wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
- wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
- wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
- wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
- wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
- wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
- wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
- wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
- wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
- wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
- wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
- wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
- wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
- wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
- wordlift_sdk/validation/shacls/google-video.ttl +149 -0
- wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/METADATA +3 -1
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/RECORD +69 -5
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/WHEEL +0 -0
wordlift_sdk/structured_data/engine.py
@@ -0,0 +1,2875 @@

"""Generate structured data from a rendered web page."""

from __future__ import annotations

import asyncio
import hashlib
import json
import logging
import re
import shutil
import subprocess
from dataclasses import dataclass
from importlib import resources
from pathlib import Path
from typing import Any, Callable
from urllib.parse import urlparse

import wordlift_client
from wordlift_client import ApiClient, Configuration
from wordlift_client import AgentApi
from wordlift_client.models.ask_request import AskRequest
from rdflib import Graph, Namespace, RDF
from rdflib.term import BNode, Identifier, Literal, URIRef

from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
from wordlift_sdk.validation.shacl import ValidationResult, validate_file


_SCHEMA_BASE = "https://schema.org"
_SCHEMA_HTTP = "http://schema.org/"
_AGENT_BASE_URL = "https://api.wordlift.io/agent"
_AGENT_MODEL = "gpt-5.1"
_RR = Namespace("http://www.w3.org/ns/r2rml#")
_RML = Namespace("http://w3id.org/rml/")
_RML_LEGACY = Namespace("http://semweb.mmlab.be/ns/rml#")
_QL = Namespace("http://semweb.mmlab.be/ns/ql#")
_SH = Namespace("http://www.w3.org/ns/shacl#")
_REVIEW_OPTIONAL_EXTRAS = {
    "description",
    "positiveNotes",
    "negativeNotes",
    "reviewBody",
    "image",
    "inLanguage",
    "publisher",
    "datePublished",
}


@dataclass
class StructuredDataOptions:
    url: str
    target_type: str | None
    dataset_uri: str
    headless: bool = True
    timeout_ms: int = 30000
    wait_until: str = "networkidle"
    max_retries: int = 2
    max_xhtml_chars: int = 40000
    max_text_node_chars: int = 400
    max_nesting_depth: int = 2
    verbose: bool = True


@dataclass
class StructuredDataResult:
    jsonld: dict[str, Any]
    yarrml: str
    jsonld_filename: str
    yarrml_filename: str


def _build_client(api_key: str, base_url: str) -> ApiClient:
    config = Configuration(host=base_url)
    config.api_key["ApiKey"] = api_key
    config.api_key_prefix["ApiKey"] = "Key"
    return ApiClient(config)


def _build_agent_client(api_key: str) -> ApiClient:
    config = Configuration(host=_AGENT_BASE_URL)
    config.api_key["ApiKey"] = api_key
    config.api_key_prefix["ApiKey"] = "Key"
    return ApiClient(config)


async def get_dataset_uri_async(api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
    async with _build_client(api_key, base_url) as api_client:
        api = wordlift_client.AccountApi(api_client)
        account = await api.get_me()
        dataset_uri = getattr(account, "dataset_uri", None)
        if not dataset_uri:
            raise RuntimeError("Failed to resolve dataset_uri from account get_me.")
        return dataset_uri


def get_dataset_uri(api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
    return asyncio.run(get_dataset_uri_async(api_key, base_url))
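
# Usage sketch (the key value is a placeholder): the synchronous wrapper above
# simply drives the async variant with asyncio.run, so callers without an
# event loop can resolve the account's dataset URI in one call.
#
#     dataset_uri = get_dataset_uri(api_key="YOUR_WORDLIFT_KEY")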


def normalize_type(value: str) -> str:
    value = value.strip()
    if value.startswith("schema:"):
        return value.split(":", 1)[1]
    if value.startswith("http://schema.org/"):
        return value.split("/", 3)[-1]
    if value.startswith("https://schema.org/"):
        return value.split("/", 3)[-1]
    return value
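
# Worked examples: every accepted spelling collapses to the bare type name.
#
#     normalize_type("schema:Recipe")              -> "Recipe"
#     normalize_type("https://schema.org/Recipe")  -> "Recipe"
#     normalize_type("  Product  ")                -> "Product"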


_GOOGLE_SHAPES_CACHE: Graph | None = None
_SCHEMA_SHAPES_CACHE: Graph | None = None
_SCHEMA_PROP_CACHE: set[str] | None = None
_SCHEMA_RANGE_CACHE: dict[str, dict[str, set[str]]] | None = None


def _load_google_shapes() -> Graph:
    global _GOOGLE_SHAPES_CACHE
    if _GOOGLE_SHAPES_CACHE is not None:
        return _GOOGLE_SHAPES_CACHE
    graph = Graph()
    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
    for entry in shapes_dir.iterdir():
        if not entry.is_file() or not entry.name.endswith(".ttl"):
            continue
        if entry.name.startswith("google-") or entry.name == "review-snippet.ttl":
            graph.parse(data=entry.read_text(encoding="utf-8"), format="turtle")
    _GOOGLE_SHAPES_CACHE = graph
    return graph


def _load_schema_shapes() -> Graph:
    global _SCHEMA_SHAPES_CACHE
    if _SCHEMA_SHAPES_CACHE is not None:
        return _SCHEMA_SHAPES_CACHE
    graph = Graph()
    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
    schema_path = shapes_dir.joinpath("schemaorg-grammar.ttl")
    if not schema_path.is_file():
        raise RuntimeError(
            "schemaorg-grammar.ttl not found. Regenerate with scripts/generate_schema_shacls.py."
        )
    graph.parse(data=schema_path.read_text(encoding="utf-8"), format="turtle")
    _SCHEMA_SHAPES_CACHE = graph
    return graph


def _schema_property_set() -> set[str]:
    global _SCHEMA_PROP_CACHE
    if _SCHEMA_PROP_CACHE is not None:
        return _SCHEMA_PROP_CACHE
    graph = _load_schema_shapes()
    props: set[str] = set()
    for prop in graph.objects(None, _SH.path):
        name = _path_to_string(graph, prop)
        if not name:
            continue
        props.add(name)
        props.add(name.split(".", 1)[0])
    _SCHEMA_PROP_CACHE = props
    return props


def _rdf_list_items(graph: Graph, head: Identifier) -> list[Identifier]:
    items: list[Identifier] = []
    current: Identifier | None = head
    while current and current != RDF.nil:
        first = graph.value(current, RDF.first)
        if first is None:
            break
        items.append(first)
        current = graph.value(current, RDF.rest)
    return items


def _schema_property_ranges() -> dict[str, dict[str, set[str]]]:
    global _SCHEMA_RANGE_CACHE
    if _SCHEMA_RANGE_CACHE is not None:
        return _SCHEMA_RANGE_CACHE
    graph = _load_schema_shapes()
    ranges: dict[str, dict[str, set[str]]] = {}
    for shape in graph.subjects(_SH.targetClass, None):
        target_class = graph.value(shape, _SH.targetClass)
        type_name = _short_schema_name(target_class)
        if not type_name:
            continue
        for prop in graph.objects(shape, _SH.property):
            path = graph.value(prop, _SH.path)
            if path is None:
                continue
            prop_name = _path_to_string(graph, path)
            if not prop_name:
                continue
            or_list = graph.value(prop, _SH["or"])
            if or_list is None:
                continue
            for item in _rdf_list_items(graph, or_list):
                class_node = graph.value(item, _SH["class"])
                class_name = _short_schema_name(class_node)
                if not class_name:
                    continue
                ranges.setdefault(type_name, {}).setdefault(prop_name, set()).add(
                    class_name
                )
    _SCHEMA_RANGE_CACHE = ranges
    return ranges


def _short_schema_name(value: Identifier) -> str | None:
    if not isinstance(value, URIRef):
        return None
    text = str(value)
    if text.startswith(_SCHEMA_BASE):
        return text[len(_SCHEMA_BASE) + 1 :]
    if text.startswith(_SCHEMA_HTTP):
        return text[len(_SCHEMA_HTTP) :]
    return None


def _path_to_string(graph: Graph, path: Identifier) -> str | None:
    if isinstance(path, URIRef):
        return _short_schema_name(path)
    if isinstance(path, BNode):
        parts: list[str] = []
        current: Identifier | None = path
        while current and current != RDF.nil:
            first = graph.value(current, RDF.first)
            if first is None:
                break
            name = _short_schema_name(first)
            if not name:
                break
            parts.append(name)
            current = graph.value(current, RDF.rest)
        if parts:
            return ".".join(parts)
    return None


def _property_guide_for_type(type_name: str) -> dict[str, list[str]]:
    type_name = normalize_type(type_name)
    targets = {
        URIRef(f"{_SCHEMA_BASE}/{type_name}"),
        URIRef(f"{_SCHEMA_HTTP}{type_name}"),
    }

    required: set[str] = set()
    recommended: set[str] = set()
    google_graph = _load_google_shapes()
    for target in targets:
        for shape in google_graph.subjects(_SH.targetClass, target):
            for prop in google_graph.objects(shape, _SH.property):
                path = google_graph.value(prop, _SH.path)
                if path is None:
                    continue
                min_count = google_graph.value(prop, _SH.minCount)
                if isinstance(min_count, Literal):
                    try:
                        if int(min_count) < 1:
                            continue
                    except Exception:
                        continue
                else:
                    continue
                prop_name = _path_to_string(google_graph, path)
                if not prop_name:
                    continue
                severity = google_graph.value(prop, _SH.severity)
                if severity == _SH.Warning:
                    recommended.add(prop_name)
                else:
                    required.add(prop_name)

    schema_props: set[str] = set()
    schema_graph = _load_schema_shapes()
    for target in targets:
        for shape in schema_graph.subjects(_SH.targetClass, target):
            for prop in schema_graph.objects(shape, _SH.property):
                path = schema_graph.value(prop, _SH.path)
                if path is None:
                    continue
                prop_name = _path_to_string(schema_graph, path)
                if not prop_name:
                    continue
                schema_props.add(prop_name)

    optional = sorted(schema_props.difference(required).difference(recommended))

    return {
        "required": sorted(required),
        "recommended": sorted(recommended),
        "optional": optional,
        "schema": sorted(schema_props),
    }


def _related_types_for_type(
    type_name: str,
    property_guide: dict[str, list[str]],
    ranges: dict[str, dict[str, set[str]]],
) -> list[str]:
    related: set[str] = set()
    prop_ranges = ranges.get(type_name, {})
    prop_candidates = property_guide.get("required", []) + property_guide.get(
        "recommended", []
    )
    if not prop_candidates:
        prop_candidates = property_guide.get("schema", [])
    for prop in prop_candidates:
        base = prop.split(".", 1)[0]
        for range_type in prop_ranges.get(base, set()):
            if range_type == "Thing":
                continue
            related.add(range_type)
    return sorted(related)


def property_guides_with_related(
    type_name: str,
    max_depth: int = 2,
) -> dict[str, dict[str, list[str]]]:
    type_name = normalize_type(type_name)
    ranges = _schema_property_ranges()
    guides: dict[str, dict[str, list[str]]] = {}
    queue: list[tuple[str, int]] = [(type_name, 0)]
    seen: set[str] = set()

    while queue:
        current, depth = queue.pop(0)
        if current in seen:
            continue
        seen.add(current)
        guide = _property_guide_for_type(current)
        guides[current] = guide
        if depth >= max_depth:
            continue
        for related in _related_types_for_type(current, guide, ranges):
            if related not in seen:
                queue.append((related, depth + 1))

    return guides
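
# Sketch of the traversal result (the concrete property lists depend on the
# bundled SHACL shapes): starting from one type, the breadth-first walk above
# also collects guides for the types reachable through property ranges, up to
# max_depth hops; the related type names below are illustrative.
#
#     guides = property_guides_with_related("Recipe", max_depth=1)
#     # {"Recipe": {"required": [...], "recommended": [...],
#     #             "optional": [...], "schema": [...]},
#     #  "Person": {...}, "NutritionInformation": {...}, ...}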


def shape_specs_for_type(type_name: str) -> list[str]:
    return all_shape_specs()


def shape_specs_for_types(type_names: list[str]) -> list[str]:
    return all_shape_specs()


def all_shape_specs() -> list[str]:
    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
    shape_specs: list[str] = []
    for entry in shapes_dir.iterdir():
        if not entry.is_file() or not entry.name.endswith(".ttl"):
            continue
        if entry.name not in shape_specs:
            shape_specs.append(entry.name)
    if "schemaorg-grammar.ttl" not in shape_specs:
        shape_specs.append("schemaorg-grammar.ttl")
    return shape_specs


def _slugify(value: str, default: str) -> str:
    cleaned = re.sub(r"[^A-Za-z0-9]+", "-", value.strip().lower()).strip("-")
    return cleaned or default


def _dash_type(value: str) -> str:
    value = re.sub(r"[^A-Za-z0-9]+", "-", value.strip())
    value = re.sub(r"(?<!^)(?=[A-Z])", "-", value)
    return re.sub(r"-+", "-", value).strip("-").lower()


def _pluralize(value: str) -> str:
    if value.endswith("y") and len(value) > 1 and value[-2] not in "aeiou":
        return value[:-1] + "ies"
    if value.endswith(("s", "x", "z", "ch", "sh")):
        return value + "es"
    return value + "s"


def _hash_url(url: str, length: int = 12) -> str:
    return hashlib.sha256(url.encode("utf-8")).hexdigest()[:length]


def build_output_basename(url: str, default: str = "page") -> str:
    parsed = urlparse(url)
    base = f"{parsed.netloc}{parsed.path}".strip("/")
    slug = _slugify(base or url, default=default)
    return f"{slug}--{_hash_url(url)}"


def build_id(
    dataset_uri: str,
    type_name: str,
    name: str,
    url: str | None,
    index: int,
) -> str:
    return build_id_base(dataset_uri, type_name, name, url, index)


def build_id_base(
    base_uri: str,
    type_name: str,
    name: str,
    url: str | None,
    index: int,
) -> str:
    base = base_uri.rstrip("/")
    dashed_type = _dash_type(type_name)
    plural_type = _pluralize(dashed_type)
    name_slug = _slugify(name, default=_dash_type(type_name))
    if url:
        suffix = _hash_url(url)
    else:
        suffix = str(index)
    return f"{base}/{plural_type}/{name_slug}-{suffix}"


def _format_prop_list(items: list[str]) -> str:
    if not items:
        return "none"
    return ", ".join(items)


def _agent_prompt(
    url: str,
    html: str,
    target_type: str | None,
    property_guides: dict[str, dict[str, list[str]]] | None = None,
    missing_required: list[str] | None = None,
    missing_recommended: list[str] | None = None,
    previous_yarrml: str | None = None,
    validation_errors: list[str] | None = None,
    validation_report: list[str] | None = None,
    xpath_warnings: list[str] | None = None,
    allow_properties: dict[str, list[str]] | None = None,
    quality_feedback: list[str] | None = None,
) -> str:
    target = target_type or "AUTO"
    guide_lines: list[str] = []
    if property_guides:
        guide_lines.append(
            "Property guide by type (Google required/recommended + Schema.org grammar):"
        )
        for type_name, guide in property_guides.items():
            guide_lines.append(f"- {type_name}:")
            guide_lines.append(
                f"  - Required (Google): {_format_prop_list(guide.get('required', []))}"
            )
            guide_lines.append(
                f"  - Recommended (Google): {_format_prop_list(guide.get('recommended', []))}"
            )
            guide_lines.append(
                "  - Optional (Schema.org, excluding required/recommended): "
                f"{_format_prop_list(guide.get('optional', []))}"
            )
            guide_lines.append(
                f"  - All Schema.org properties for {type_name}: {_format_prop_list(guide.get('schema', []))}"
            )
        guide_lines.append("")
        guide_lines.append(
            "If a required property is not present on the page, omit it (do not fabricate)."
        )
        guide_lines.append("Only use properties listed in the guide for each type.")
        guide_lines.append("")
    if allow_properties:
        guide_lines.append("Allowed properties (Google only):")
        for type_name, props in allow_properties.items():
            guide_lines.append(f"- {type_name}: {_format_prop_list(props)}")
        guide_lines.append("")

    refine_lines: list[str] = []
    if missing_required:
        refine_lines.append("Missing required properties in the previous mapping:")
        refine_lines.append(", ".join(missing_required))
        refine_lines.append(
            "Update the mapping to add these properties if the data exists on the page."
        )
        refine_lines.append("Keep existing correct mappings and selectors.")
        refine_lines.append("")
    if missing_recommended:
        refine_lines.append("Missing recommended properties in the previous mapping:")
        refine_lines.append(", ".join(missing_recommended))
        refine_lines.append("Add these properties if the data exists on the page.")
        refine_lines.append("")
    if validation_errors:
        refine_lines.append("Validation errors from the previous mapping:")
        refine_lines.extend(validation_errors)
        refine_lines.append("Fix these issues without fabricating data.")
        refine_lines.append("")
    if validation_report:
        refine_lines.append("Validation report from the previous mapping:")
        refine_lines.extend(validation_report)
        refine_lines.append(
            "Use the report to fix the mapping without fabricating data."
        )
        refine_lines.append("")
    if xpath_warnings:
        refine_lines.append("XPath evaluation warnings from the previous mapping:")
        refine_lines.extend(xpath_warnings)
        refine_lines.append("Fix the XPath selectors that returned no results.")
        refine_lines.append("")
    if quality_feedback:
        refine_lines.append("Quality feedback from the previous mapping:")
        refine_lines.extend(quality_feedback)
        refine_lines.append(
            "Improve the mapping to raise the quality score while only using data present in XHTML."
        )
        refine_lines.append("")
    if previous_yarrml:
        refine_lines.append("Previous mapping:")
        refine_lines.append(previous_yarrml.strip())
        refine_lines.append("")

    guide_text = "\n".join(guide_lines) if guide_lines else ""
    refine_text = "\n".join(refine_lines) if refine_lines else ""
    return (
        f"analyze the entities on this webpage: {url}\n"
        "\n"
        "You are a structured data extraction agent.\n"
        "Goal: produce a YARRRML mapping using XPath only.\n"
        "Use the provided XHTML source instead of fetching the URL.\n"
        "Do NOT parse any existing structured data (JSON-LD, RDFa, Microdata).\n"
        "Do NOT emit @id values. IDs will be assigned locally.\n"
        "Output ONLY the YARRRML mapping (no prose, no code fences).\n"
        "\n"
        f"Target Schema.org type: {target}\n"
        "\n"
        "Requirements:\n"
        "- Use XPath in all selectors.\n"
        "- Use $(xpath) for XPath references (not {xpath}).\n"
        '- Do NOT wrap XPath expressions in quotes inside $(...). Use $(/path), not $("/path").\n'
        '- Always quote attribute values in XPath (e.g., @id="..."). Do NOT use @id=foo.\n'
        "- The main mapping must include schema:url with the exact URL.\n"
        "- Always include schema:name for every mapped node.\n"
        "- Include schema:description for Review if available.\n"
        "- Include schema:image if available (prefer og:image). \n"
        "- Include schema:inLanguage if available (html/@lang). \n"
        "- Include schema:publisher if available (prefer og:site_name as Organization). \n"
        "- Include schema:reviewBody for Review if available (main article text). Prefer the paragraph immediately following the H1\n"
        "  (e.g., following-sibling::p[1]) and only use class-based selectors if necessary.\n"
        '- Include schema:datePublished for Review if available (time/@datetime or meta[property="article:published_time"],\n'
        "  otherwise use the first byline date).\n"
        "- Include positiveNotes/negativeNotes for Review if available.\n"
        "- Include relevant properties for the main type.\n"
        "- If Target Schema.org type is AUTO, infer the best type and use it.\n"
        "- Define dependent nodes as separate mappings and link them from the main mapping.\n"
        "- Prefer reusable XPath selectors that generalize across pages using the same template.\n"
        "- Avoid brittle selectors that depend on full class names, IDs, or numeric suffixes unless there is no alternative.\n"
        "- Prefer structural paths (head/meta, main/h1, time[@datetime], link[@rel], figure/img) and stable attributes.\n"
        "- If you must use classes or IDs, prefer contains(@class, 'stable-token') over exact matches and avoid numeric IDs.\n"
        "- NEVER use table IDs with numeric suffixes (e.g., tablepress-12345). Instead, locate tables by header text\n"
        "  (e.g., th contains 'APR'/'rating') and then select the adjacent cell by position or data-th.\n"
        '- Do NOT key selectors off a specific person name or URL slug; use byline labels like "Written by" or metadata instead.\n'
        "- For author, prefer metadata or rel links first (meta[name=author], meta[property=article:author], link[rel=author]) before class-based selectors.\n"
        '- If the page shows a byline label (e.g., "Written by"), select the author link or text immediately following that label.\n'
        "- For positiveNotes/negativeNotes (Review/Product and subclasses only), anchor on semantic headings (Pros/Cons, Advantages/Disadvantages,\n"
        "  What we like/What we don't like). Prefer heading text matches (contains(., 'Pros')) over IDs/classes,\n"
        "  and select the closest following list items (li) from ul/ol, rows from tables, or terms/defs from dl.\n"
        "  Detect the page language and include localized heading variants with English as a fallback. Avoid site-specific classes/IDs unless there is no alternative.\n"
        "- Only include reviewRating if the page explicitly provides a rating score (stars or numeric rating). Do NOT infer ratings from APR/fee tables or unrelated metrics.\n"
        "- Do NOT use hard-coded literal values. All values must come from XPath except schema:url.\n"
        "- ratingValue must be a literal extracted from XPath (not an IRI).\n"
        "- reviewRating must point to a Rating node.\n"
        "- author must be a Person or Organization node.\n"
        "\n"
        f"{guide_text}"
        f"{refine_text}"
        "XHTML:\n"
        f"{html}\n"
    )


def _quality_prompt(
    url: str,
    xhtml: str,
    jsonld: dict[str, Any] | list[Any],
    property_guides: dict[str, dict[str, list[str]]] | None,
    target_type: str | None,
) -> str:
    guide_lines: list[str] = []
    if property_guides:
        guide_lines.append(
            "Property guide by type (Google required/recommended + Schema.org grammar):"
        )
        for type_name, guide in property_guides.items():
            guide_lines.append(f"- {type_name}:")
            guide_lines.append(
                f"  - Required (Google): {_format_prop_list(guide.get('required', []))}"
            )
            guide_lines.append(
                f"  - Recommended (Google): {_format_prop_list(guide.get('recommended', []))}"
            )
            guide_lines.append(
                "  - Optional (Schema.org, excluding required/recommended): "
                f"{_format_prop_list(guide.get('optional', []))}"
            )
        guide_lines.append("")
    guide_text = "\n".join(guide_lines) if guide_lines else ""
    payload = json.dumps(jsonld, ensure_ascii=True)
    return (
        f"analyze the entities on this webpage: {url}\n"
        "\n"
        "You are evaluating structured data quality.\n"
        "Compare the XHTML and JSON-LD. Only count properties that are present in XHTML.\n"
        "Do NOT penalize missing properties if they do not appear in the XHTML.\n"
        "Return a JSON object with keys: score (0-10 integer), missing_in_jsonld (list),\n"
        "suggested_xpath (object mapping property -> XPath), notes (list).\n"
        "Use XPath in suggested_xpath and keep it generic/reusable.\n"
        "\n"
        f"Target Schema.org type: {target_type or 'AUTO'}\n"
        "\n"
        f"{guide_text}"
        "XHTML:\n"
        f"{xhtml}\n"
        "\n"
        "JSON-LD:\n"
        f"{payload}\n"
    )


async def _ask_agent_async(
    prompt: str, api_key: str, model: str | None = None
) -> object:
    async with _build_agent_client(api_key) as api_client:
        api = AgentApi(api_client)
        ask_request = AskRequest(message=prompt, model=model or _AGENT_MODEL)
        return await api.ask_request_api_ask_post(ask_request)


def _collect_strings(payload: Any, results: list[str]) -> None:
    if isinstance(payload, str):
        if payload.strip():
            results.append(payload)
        return
    if isinstance(payload, dict):
        for value in payload.values():
            _collect_strings(value, results)
        return
    if isinstance(payload, list):
        for value in payload:
            _collect_strings(value, results)


def _extract_agent_text(payload: Any) -> str | None:
    if isinstance(payload, str) and payload.strip():
        return payload.strip()
    if isinstance(payload, dict):
        for key in (
            "response",
            "answer",
            "content",
            "result",
            "output",
            "text",
            "message",
        ):
            if key in payload:
                value = _extract_agent_text(payload.get(key))
                if value:
                    return value
    strings: list[str] = []
    _collect_strings(payload, strings)
    for value in strings:
        if "mappings:" in value or "prefixes:" in value:
            return value.strip()
    for value in strings:
        if value.strip():
            return value.strip()
    return None
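
# Sketch: agent payloads vary in shape, so extraction first probes the common
# wrapper keys in order, then falls back to any nested string that looks like
# YARRRML ("mappings:"/"prefixes:"). The payload below is illustrative.
#
#     _extract_agent_text({"response": {"content": "prefixes:\n  schema: ..."}})
#     # -> "prefixes:\n  schema: ..."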


def _extract_agent_json(payload: Any) -> dict[str, Any] | None:
    if isinstance(payload, dict):
        return payload
    text = _extract_agent_text(payload)
    if not text:
        return None
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return None
    snippet = text[start : end + 1]
    try:
        return json.loads(snippet)
    except Exception:
        return None


def ask_agent_for_yarrml(
    api_key: str,
    url: str,
    html: str,
    target_type: str | None,
    debug: bool = False,
    debug_path: Path | None = None,
    property_guides: dict[str, dict[str, list[str]]] | None = None,
    missing_required: list[str] | None = None,
    missing_recommended: list[str] | None = None,
    previous_yarrml: str | None = None,
    validation_errors: list[str] | None = None,
    validation_report: list[str] | None = None,
    xpath_warnings: list[str] | None = None,
    allow_properties: dict[str, list[str]] | None = None,
    quality_feedback: list[str] | None = None,
) -> str:
    prompt = _agent_prompt(
        url,
        html,
        target_type,
        property_guides=property_guides,
        missing_required=missing_required,
        missing_recommended=missing_recommended,
        previous_yarrml=previous_yarrml,
        validation_errors=validation_errors,
        validation_report=validation_report,
        xpath_warnings=xpath_warnings,
        allow_properties=allow_properties,
        quality_feedback=quality_feedback,
    )
    try:
        response = asyncio.run(_ask_agent_async(prompt, api_key))
    except Exception as exc:
        raise RuntimeError(f"Agent request failed: {exc}") from exc

    if isinstance(response, dict):
        data = response
    else:
        try:
            data = response.model_dump()
        except Exception:
            data = {}

    if debug and debug_path is not None:
        debug_path.parent.mkdir(parents=True, exist_ok=True)
        debug_payload = {
            "prompt": prompt,
            "response": data,
        }
        debug_path.write_text(json.dumps(debug_payload, indent=2))

    extracted = _extract_agent_text(data)
    if extracted:
        return extracted

    raise RuntimeError("Agent response did not include YARRRML content.")


def ask_agent_for_quality(
    api_key: str,
    url: str,
    xhtml: str,
    jsonld: dict[str, Any] | list[Any],
    property_guides: dict[str, dict[str, list[str]]] | None,
    target_type: str | None,
) -> dict[str, Any] | None:
    prompt = _quality_prompt(url, xhtml, jsonld, property_guides, target_type)
    try:
        response = asyncio.run(_ask_agent_async(prompt, api_key))
    except Exception as exc:
        raise RuntimeError(f"Agent quality request failed: {exc}") from exc
    return _extract_agent_json(response)


def _replace_sources_with_file(yarrml: str, file_uri: str) -> str:
    pattern = re.compile(r"(\[\s*['\"])([^'\"]+)(['\"]\s*,\s*['\"]xpath['\"])")
    inline_pattern = re.compile(r"(\[\s*)([^,\]]+?~xpath)(\s*,)")

    def repl(match: re.Match[str]) -> str:
        return f"{match.group(1)}{file_uri}{match.group(3)}"

    def repl_inline(match: re.Match[str]) -> str:
        return f"{match.group(1)}{file_uri}~xpath{match.group(3)}"

    yarrml = pattern.sub(repl, yarrml)
    return inline_pattern.sub(repl_inline, yarrml)


def _replace_sources_with_placeholder(yarrml: str, placeholder: str) -> str:
    pattern = re.compile(r"(\[\s*['\"])([^'\"]+)(['\"]\s*,\s*['\"]xpath['\"])")
    inline_pattern = re.compile(r"(\[\s*)([^,\]]+?~xpath)(\s*,)")

    def repl(match: re.Match[str]) -> str:
        return f"{match.group(1)}{placeholder}{match.group(3)}"

    def repl_inline(match: re.Match[str]) -> str:
        return f"{match.group(1)}{placeholder}~xpath{match.group(3)}"

    yarrml = pattern.sub(repl, yarrml)
    return inline_pattern.sub(repl_inline, yarrml)


def make_reusable_yarrrml(
    yarrml: str, url: str, source_placeholder: str = "__XHTML__"
) -> str:
    normalized = _replace_sources_with_placeholder(yarrml, source_placeholder)
    escaped_url = re.escape(url)
    normalized = re.sub(
        rf"(schema:url\s*,\s*['\"])({escaped_url})(['\"])",
        r"\1__URL__\3",
        normalized,
    )
    return normalized
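
# Worked example: source entries and the exact page URL are swapped for
# placeholders so a mapping can be replayed against other pages built from the
# same template (the input snippet is illustrative).
#
#     yarrml = (
#         "mappings:\n  review:\n    sources:\n"
#         "      - [build/page.xhtml~xpath, '/']\n"
#         "    po:\n      - [schema:url, 'https://example.com/post']\n"
#     )
#     make_reusable_yarrrml(yarrml, "https://example.com/post")
#     # source line becomes:     - [__XHTML__~xpath, '/']
#     # schema:url line becomes: - [schema:url, '__URL__']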


def _strip_quotes(value: str | None) -> str:
    if not isinstance(value, str):
        return ""
    value = value.strip()
    if (value.startswith('"') and value.endswith('"')) or (
        value.startswith("'") and value.endswith("'")
    ):
        return value[1:-1]
    return value


def _strip_wrapped_list(value: str | None) -> str:
    if not isinstance(value, str):
        return ""
    value = value.strip()
    if value.startswith("[") and value.endswith("]"):
        value = value[1:-1].strip()
    return _strip_quotes(value)


def _strip_all_quotes(value: str | None) -> str:
    if not isinstance(value, str):
        return ""
    value = value.strip()
    while (value.startswith('"') and value.endswith('"')) or (
        value.startswith("'") and value.endswith("'")
    ):
        value = value[1:-1].strip()
    return value


def _normalize_xpath_literal(value: str) -> str:
    value = value.strip()
    if value.startswith("{") and value.endswith("}"):
        inner = value[1:-1].strip()
        return f"$({inner})"
    if value.startswith("$(xpath)/"):
        tail = value[len("$(xpath)") :]
        return f"$({tail})"
    if value.startswith("$(xpath://") or value.startswith("$(xpath:/"):
        tail = value[len("$(xpath:") :]
        return f"$({tail})"
    if value.startswith("$(") and value.endswith(")"):
        inner = value[2:-1].strip()
        if (inner.startswith('"') and inner.endswith('"')) or (
            inner.startswith("'") and inner.endswith("'")
        ):
            return f"$({inner[1:-1]})"
    if value.startswith("$(xpath)',"):
        _, _, tail = value.partition(",")
        tail = _strip_all_quotes(tail.strip())
        return _normalize_xpath_literal(f"$({tail})")
    return value
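
# Worked examples of the literal normalization above:
#
#     _normalize_xpath_literal("{//main/h1}")          -> "$(//main/h1)"
#     _normalize_xpath_literal('$("//main/h1")')       -> "$(//main/h1)"
#     _normalize_xpath_literal("$(xpath)/head/title")  -> "$(/head/title)"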


def _looks_like_xpath(value: str) -> bool:
    value = value.strip()
    return (
        (value.startswith("$(") and value.endswith(")"))
        or value.startswith("/")
        or value.startswith(".//")
        or value.startswith("//")
        or value.startswith("normalize-")
        or value.startswith("normalize(")
        or value.startswith("string(")
        or value.startswith("concat(")
    )


def _simplify_xpath(value: str) -> str:
    value = value.strip()
    if value.startswith("$(") and value.endswith(")"):
        value = value[2:-1].strip()
    if value.startswith("xpath="):
        value = value[len("xpath=") :].strip()
    if value.startswith("xpath://") or value.startswith("xpath:/"):
        value = value[len("xpath:") :]
    if value.startswith(('"', "'")) and value.endswith(('"', "'")) and len(value) >= 2:
        value = value[1:-1]
    value = value.replace('\\"', '"').replace("\\'", "'")
    match = re.match(r"(?:normalize-space|string)\((.+)\)$", value)
    if match:
        return match.group(1).strip()
    return value


def _normalize_xpath_reference(value: str) -> str:
    value = value.strip()
    value = value.replace("/text()", "")
    value = value.replace("text()", "")
    value = re.sub(r"contains\(@class,\s*\"([^\"]+)\"\)", r"@class=\"\1\"", value)
    value = re.sub(r"contains\(@class,\s*'([^']+)'\)", r"@class=\"\1\"", value)
    value = re.sub(r"contains\(@id,\s*\"([^\"]+)\"\)", r"@id=\"\1\"", value)
    value = re.sub(r"contains\(@id,\s*'([^']+)'\)", r"@id=\"\1\"", value)
    return value


def _first_list_value(value: str) -> str:
    value = value.strip()
    if value.startswith("[") and value.endswith("]"):
        match = re.search(r"['\"]([^'\"]+)['\"]", value)
        if match:
            return match.group(1)
    return _strip_all_quotes(_strip_wrapped_list(value))


def _normalize_agent_yarrml(
    raw: str,
    url: str,
    file_uri: str,
    target_type: str | None,
) -> tuple[str, list[dict[str, Any]]]:
    raw = _quote_unquoted_xpath_attributes(raw)
    raw = re.sub(r"(['\"])\{([^{}]+)\}\1", r"\1$(\2)\1", raw)
    raw = re.sub(
        r"(['\"])\$\(xpath\)\1\s*,\s*(['\"])([^'\"]+)\2", r"\1$(\3)\1", raw
    )
    lines = raw.splitlines()
    mappings: list[dict[str, Any]] = []
    current: dict[str, Any] | None = None
    in_mappings = False
    mappings_indent: int | None = None
    ignore_keys = {"mappings", "prefixes", "sources", "po"}
    last_p: str | None = None
    pending_o: str | None = None
    in_props_block = False
    props_indent = 0
    in_sources = False
    sources_indent: int | None = None

    for line in lines:
        stripped = line.strip()
        indent = len(line) - len(line.lstrip(" "))
        if not stripped:
            continue
        if stripped == "mappings:":
            in_mappings = True
            mappings_indent = indent
            continue
        if (
            in_mappings
            and stripped.endswith(":")
            and not stripped.startswith("-")
            and not stripped.startswith("s:")
            and mappings_indent is not None
            and indent == mappings_indent + 2
        ):
            name = stripped[:-1].strip()
            if name in ignore_keys:
                continue
            current = {"name": name, "type": None, "props": []}
            mappings.append(current)
            last_p = None
            pending_o = None
            in_props_block = False
            continue
        if current is None:
            continue
        if stripped == "sources:":
            in_sources = True
            sources_indent = indent
            continue
        if in_sources:
            if sources_indent is not None and indent > sources_indent:
                continue
            in_sources = False
            sources_indent = None
        if stripped == "po:":
            continue
        if stripped == "p:":
            in_props_block = True
            props_indent = indent
            pending_o = None
            last_p = None
            continue
        if in_props_block and indent <= props_indent and stripped != "p:":
            in_props_block = False
            pending_o = None
        if stripped.startswith("- [") and "value:" in stripped:
            if current is None:
                continue
            match = re.search(r"value:\s*\"([^\"]+)\"", stripped) or re.search(
                r"value:\s*'([^']+)'", stripped
            )
            if match:
                current["source_xpath"] = match.group(1)
            else:
                match = re.search(r"\['html'\]\s*,\s*\"([^\"]+)\"", stripped)
                if match:
                    current["source_xpath"] = match.group(1)
            continue
        if stripped.startswith("- ["):
            if current is None:
                continue
            match = re.match(r"- \[\s*\"([^\"]+)\"\s*\]$", stripped) or re.match(
                r"- \[\s*'([^']+)'\s*\]$", stripped
            )
            if match:
                current["source_xpath"] = match.group(1)
                continue
        if stripped.startswith("s:") and stripped.endswith(":") and stripped != "s:":
            prop_name = stripped[2:-1].strip()
            if prop_name:
                pending_o = prop_name
            continue
        if stripped.startswith("s:") and not stripped.startswith("s: "):
            rest = stripped[2:]
            prop, _, obj_part = rest.partition(":")
            prop = prop.strip()
            obj = _normalize_xpath_literal(
                _strip_all_quotes(_strip_wrapped_list(obj_part.strip()))
            )
            if prop:
                current["props"].append((prop, obj))
            continue
        if stripped.startswith("s:"):
            value = _strip_all_quotes(
                _strip_wrapped_list(stripped.split(":", 1)[1].strip())
            )
            if value.startswith(("http://", "https://")) or _looks_like_xpath(value):
                continue
            if value.startswith("schema:"):
                current["type"] = normalize_type(value)
                continue
            if re.fullmatch(r"[A-Za-z][A-Za-z0-9]+", value):
                current["type"] = value
            continue
        if stripped.startswith("- p:"):
            _, value = stripped.split("p:", 1)
            last_p = _strip_quotes(value.strip())
            pending_o = None
            continue
        if in_props_block and stripped.startswith("schema:") and stripped.endswith(":"):
            pending_o = stripped[:-1].strip()
            continue
        if stripped.startswith("o:") and last_p:
            _, value = stripped.split("o:", 1)
            obj = _normalize_xpath_literal(
                _strip_all_quotes(_strip_wrapped_list(value.strip()))
            )
            if obj:
                current["props"].append((last_p, obj))
                last_p = None
                pending_o = None
            else:
                pending_o = last_p
            continue
        if stripped.startswith("- [a,") or stripped.startswith("- [ a,"):
            match = re.search(r"'schema:([^']+)'", stripped) or re.search(
                r"\"schema:([^\"]+)\"", stripped
            )
            if match:
                current["type"] = normalize_type(match.group(1))
            continue
        if stripped.startswith("value:") and pending_o:
            _, value = stripped.split("value:", 1)
            obj = _normalize_xpath_literal(_first_list_value(value.strip()))
            current["props"].append((pending_o, obj))
            pending_o = None
            last_p = None
            continue
        if stripped.startswith("mapping:") and pending_o:
            _, value = stripped.split("mapping:", 1)
            obj = _normalize_xpath_literal(
                _strip_all_quotes(_strip_wrapped_list(value.strip()))
            )
            if obj:
                current["props"].append((pending_o, obj))
            pending_o = None
            last_p = None
            continue
        if stripped.startswith("- ["):
            if "p:" in stripped and "o:" in stripped:
                match = re.search(r"\[p:\s*([^,]+),\s*o:\s*(.+)\]$", stripped)
                if match:
                    prop = _strip_quotes(match.group(1).strip())
                    obj = _normalize_xpath_literal(
                        _strip_all_quotes(_strip_wrapped_list(match.group(2).strip()))
                    )
                    if prop == "a":
                        continue
                    current["props"].append((prop, obj))
                continue
            match = re.search(r"\[\s*([^,]+)\s*,\s*(.+)\]$", stripped)
            if match:
                prop = _strip_quotes(match.group(1).strip())
                obj = _normalize_xpath_literal(
                    _strip_all_quotes(_strip_wrapped_list(match.group(2).strip()))
                )
                if prop == "a":
                    continue
                current["props"].append((prop, obj))

    if not mappings:
        raise RuntimeError("Agent response did not include recognizable mappings.")

    mapping_names = {m["name"] for m in mappings}
    target = normalize_type(target_type) if target_type else None
    denied_props = {"schema:html", "html"}
    schema_props = _schema_property_set()

    main_mapping = None
    for mapping in mappings:
        if target and normalize_type(mapping["type"] or "") == target:
            main_mapping = mapping
            break
    if main_mapping is None:
        main_mapping = mappings[0]
    if main_mapping:
        main_mapping["__main__"] = True

    output_lines = [
        "prefixes:",
        f"  schema: '{_SCHEMA_BASE}/'",
        "  ex: 'http://example.com/'",
        "mappings:",
    ]

    for mapping in mappings:
        map_name = mapping["name"]
        map_type = mapping["type"] or ("Review" if mapping is main_mapping else "Thing")
        map_type = normalize_type(map_type)
        output_lines += [
            f"  {map_name}:",
            "    sources:",
            f"      - [{file_uri}~xpath, '/']",
            f"    s: ex:{map_name}~iri",
            "    po:",
            f"      - [a, 'schema:{map_type}']",
        ]
        props = list(mapping["props"])
        source_xpath = mapping.get("source_xpath")

        if mapping is main_mapping:
            has_url = any(p == "schema:url" for p, _ in props)
            if not has_url:
                props.insert(0, ("schema:url", url))

        for prop, obj in props:
            if not prop.startswith("schema:"):
                prop = f"schema:{prop}"
            prop_name = prop[7:]
            if prop_name == "a" or prop == "schema:a":
                continue
            if "~" in prop_name or "http" in prop_name:
                continue
            if prop in denied_props:
                continue
            if prop_name not in schema_props:
                continue
            if not obj:
                continue
            if obj == "{value}" and source_xpath:
                obj = source_xpath
            if (
                isinstance(obj, str)
                and obj.startswith("ex:")
                and obj[3:] in mapping_names
            ):
                obj = obj[3:]
            if obj in mapping_names:
                output_lines.append(f"      - [{prop}, ex:{obj}~iri]")
                continue
            if _looks_like_xpath(obj):
                xpath = _normalize_xpath_reference(_simplify_xpath(obj)).replace(
                    "'", '"'
                )
                output_lines.append(f"      - [{prop}, '$(%s)']" % xpath)
                continue
            output_lines.append(f"      - [{prop}, '{obj}']")

    return "\n".join(output_lines) + "\n", mappings


def _quote_unquoted_xpath_attributes(text: str) -> str:
    pattern = re.compile(
        r"@(id|class|property|rel|name|type|itemprop|content|href|src)\s*=\s*([A-Za-z0-9_-]+)"
    )

    def repl(match: re.Match[str]) -> str:
        attr = match.group(1)
        value = match.group(2)
        return f'@{attr}="{value}"'

    return pattern.sub(repl, text)


def _run_yarrrml_parser(input_path: Path, output_path: Path) -> None:
    parser = shutil.which("yarrrml-parser")
    if not parser:
        raise RuntimeError(
            "yarrrml-parser is required. Install with: npm install -g @rmlio/yarrrml-parser"
        )
    if output_path.exists():
        output_path.unlink()
    result = subprocess.run(
        [parser, "-i", str(input_path), "-o", str(output_path)],
        capture_output=True,
        text=True,
        check=False,
    )
    if not output_path.exists():
        error = (result.stderr or result.stdout).strip()
        raise RuntimeError(f"yarrrml-parser failed to produce output. {error}")
    if result.returncode != 0:
        raise RuntimeError(f"yarrrml-parser failed: {result.stderr.strip()}")


def _materialize_graph(mapping_path: Path) -> Graph:
    try:
        import morph_kgc
    except ImportError as exc:
        raise RuntimeError(
            "morph-kgc is required. Install with: pip install morph-kgc"
        ) from exc

    config = (
        "[CONFIGURATION]\n"
        "output_format = N-TRIPLES\n"
        "\n"
        "[DataSource1]\n"
        f"mappings = {mapping_path}\n"
    )
    return morph_kgc.materialize(config)
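
# Sketch (path illustrative): morph_kgc.materialize accepts the INI-style
# config as a string, as above, and returns an rdflib Graph, so the result can
# be serialized or validated directly.
#
#     graph = _materialize_graph(Path("build/mapping.ttl"))
#     print(graph.serialize(format="nt"))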


def materialize_yarrrml(
    yarrrml: str,
    xhtml_path: Path,
    workdir: Path,
    *,
    url: str | None = None,
) -> Graph:
    file_uri = xhtml_path.as_posix()
    normalized = _replace_sources_with_file(yarrrml, file_uri)
    if url:
        normalized = re.sub(
            r"(schema:url\s*,\s*['\"])__URL__(['\"])",
            rf"\1{url}\2",
            normalized,
        )
    workdir.mkdir(parents=True, exist_ok=True)
    yarrml_path = workdir / "mapping.yarrrml"
    rml_path = workdir / "mapping.ttl"
    yarrml_path.write_text(normalized)
    _run_yarrrml_parser(yarrml_path, rml_path)
    _ensure_subject_termtype_iri(rml_path)
    _normalize_reference_formulation(rml_path)
    return _materialize_graph(rml_path)
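
# End-to-end sketch tying the pieces together (key, URL, and paths are
# illustrative; the XHTML is assumed to come from the SDK's renderer):
#
#     yarrml = ask_agent_for_yarrml(api_key, url, xhtml, "Review")
#     normalized, mappings = normalize_yarrrml_mappings(yarrml, url, xhtml_path)
#     graph = materialize_yarrrml(normalized, xhtml_path, Path("build"), url=url)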
|
|
1260
|
+
|
|
1261
|
+
|
|
1262
|
+
def normalize_yarrrml_mappings(
    yarrrml: str,
    url: str,
    xhtml_path: Path,
    target_type: str | None = None,
) -> tuple[str, list[dict[str, Any]]]:
    return _normalize_agent_yarrml(yarrrml, url, xhtml_path.as_posix(), target_type)


def materialize_yarrrml_jsonld(
    yarrrml: str,
    xhtml_path: Path,
    workdir: Path,
    *,
    url: str | None = None,
) -> dict[str, Any] | list[Any]:
    file_uri = xhtml_path.as_posix()
    normalized = _replace_sources_with_file(yarrrml, file_uri)
    if url:
        normalized = re.sub(
            r"(schema:url\s*,\s*['\"])__URL__(['\"])",
            rf"\1{url}\2",
            normalized,
        )
    workdir.mkdir(parents=True, exist_ok=True)
    yarrml_path = workdir / "mapping.yarrml"
    rml_path = workdir / "mapping.ttl"
    yarrml_path.write_text(normalized)
    _run_yarrrml_parser(yarrml_path, rml_path)
    _ensure_subject_termtype_iri(rml_path)
    _normalize_reference_formulation(rml_path)
    return _materialize_jsonld(rml_path)


def postprocess_jsonld(
    jsonld_raw: dict[str, Any] | list[Any],
    mappings: list[dict[str, Any]],
    xhtml: str,
    dataset_uri: str,
    url: str,
    target_type: str | None = None,
) -> dict[str, Any]:
    jsonld_raw = _fill_jsonld_from_mappings(jsonld_raw, mappings, xhtml)
    _ensure_node_ids(jsonld_raw, dataset_uri, url)
    _dedupe_review_notes(jsonld_raw)
    normalized = normalize_jsonld(
        jsonld_raw, dataset_uri, url, target_type, embed_nodes=False
    )
    _materialize_literal_nodes(normalized, dataset_uri, url)
    _ensure_author_name(normalized, xhtml, dataset_uri, url)
    _ensure_review_url(normalized, url)
    _prune_empty_rating_nodes(normalized)
    return normalized


def _prune_empty_rating_nodes(data: dict[str, Any] | list[Any]) -> None:
    nodes = _flatten_jsonld(data)
    if not nodes:
        return
    rating_value_key = f"{_SCHEMA_BASE}/ratingValue"
    empty_rating_ids: set[str] = set()

    def _has_rating_value(node: dict[str, Any]) -> bool:
        value = node.get(rating_value_key, node.get("ratingValue"))
        if value is None:
            return False
        values = value if isinstance(value, list) else [value]
        for item in values:
            if isinstance(item, dict):
                text = item.get("@value") or item.get("value")
            else:
                text = item
            if isinstance(text, str) and text.strip():
                return True
            if isinstance(text, (int, float)):
                return True
        return False

    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_types = {
            normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
        }
        if "Rating" not in node_types:
            continue
        if not _has_rating_value(node):
            node_id = node.get("@id")
            if isinstance(node_id, str):
                empty_rating_ids.add(node_id)
    if not empty_rating_ids:
        return

    def _filter_refs(values: Any) -> Any:
        if isinstance(values, list):
            filtered = [
                value
                for value in values
                if not (
                    isinstance(value, dict) and value.get("@id") in empty_rating_ids
                )
            ]
            return filtered
        return values

    for node in nodes:
        if not isinstance(node, dict):
            continue
        for key in (f"{_SCHEMA_BASE}/reviewRating", "reviewRating"):
            if key in node:
                node[key] = _filter_refs(node[key])
                if not node[key]:
                    node.pop(key, None)

    if isinstance(data, dict) and isinstance(data.get("@graph"), list):
        data["@graph"] = [
            node
            for node in data["@graph"]
            if not (isinstance(node, dict) and node.get("@id") in empty_rating_ids)
        ]
    elif isinstance(data, list):
        data[:] = [
            node
            for node in data
            if not (isinstance(node, dict) and node.get("@id") in empty_rating_ids)
        ]


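# How the pruning behaves on a Rating node without a ratingValue (a sketch with
# hypothetical ids; assumes normalize_type reduces full schema.org IRIs to bare
# type names, as it is used throughout this module):
def _example_prune_empty_rating_nodes() -> None:
    doc = {
        "@graph": [
            {
                "@id": "https://data.example.org/review-1",
                "@type": [f"{_SCHEMA_BASE}/Review"],
                f"{_SCHEMA_BASE}/reviewRating": [
                    {"@id": "https://data.example.org/rating-1"}
                ],
            },
            # No ratingValue: both the node and the reference to it go away.
            {
                "@id": "https://data.example.org/rating-1",
                "@type": [f"{_SCHEMA_BASE}/Rating"],
            },
        ]
    }
    _prune_empty_rating_nodes(doc)
    assert len(doc["@graph"]) == 1

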
def _dedupe_review_notes(data: dict[str, Any] | list[Any]) -> None:
    nodes = _flatten_jsonld(data)
    if not nodes:
        return
    pos_key = f"{_SCHEMA_BASE}/positiveNotes"
    neg_key = f"{_SCHEMA_BASE}/negativeNotes"

    def _extract_values(values: Any) -> list[str]:
        if isinstance(values, list):
            items = values
        else:
            items = [values]
        normalized: list[str] = []
        for item in items:
            if isinstance(item, dict):
                value = item.get("@value") or item.get("value")
            else:
                value = item
            if isinstance(value, str):
                normalized.append(value.strip())
        return [value for value in normalized if value]

    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_types = {
            normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
        }
        if "Review" not in node_types and "Product" not in node_types:
            continue
        pos_values = _extract_values(node.get(pos_key) or node.get("positiveNotes"))
        neg_values = _extract_values(node.get(neg_key) or node.get("negativeNotes"))
        if pos_values and neg_values and pos_values == neg_values:
            node.pop(pos_key, None)
            node.pop("positiveNotes", None)


def _materialize_literal_nodes(
    data: dict[str, Any] | list[Any],
    dataset_uri: str,
    url: str,
) -> None:
    nodes = _flatten_jsonld(data)
    if not nodes:
        return
    graph = data.get("@graph") if isinstance(data, dict) else None
    if not isinstance(graph, list):
        return
    schema_author = f"{_SCHEMA_BASE}/author"
    schema_item = f"{_SCHEMA_BASE}/itemReviewed"
    schema_publisher = f"{_SCHEMA_BASE}/publisher"
    schema_name = f"{_SCHEMA_BASE}/name"

    def _ensure_node(type_name: str, name: str, index: int) -> dict[str, Any]:
        node_id = build_id_base(dataset_uri, type_name, name, url, index)
        node = {
            "@id": node_id,
            "@type": [f"{_SCHEMA_BASE}/{type_name}"],
            schema_name: [{"@value": name}],
            "@context": _SCHEMA_BASE,
        }
        graph.append(node)
        return node

    def _replace_literal(
        node: dict[str, Any], key: str, type_name: str, start_index: int
    ) -> None:
        values = node.get(key)
        if not values:
            return
        items = values if isinstance(values, list) else [values]
        new_refs: list[dict[str, Any]] = []
        for idx, item in enumerate(items, start=start_index):
            if isinstance(item, dict) and item.get("@id"):
                new_refs.append(item)
                continue
            if isinstance(item, dict) and "@value" in item:
                name = str(item["@value"]).strip()
            else:
                name = str(item).strip()
            if not name:
                continue
            new_node = _ensure_node(type_name, name, idx)
            new_refs.append({"@id": new_node["@id"]})
        if new_refs:
            node[key] = new_refs
        else:
            node.pop(key, None)

    review_nodes = [
        node
        for node in nodes
        if isinstance(node, dict)
        and "Review"
        in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
    ]
    for index, review in enumerate(review_nodes, start=1):
        _replace_literal(review, schema_author, "Person", index)
        _replace_literal(review, schema_item, "Product", index + 100)
        _replace_literal(review, schema_publisher, "Organization", index + 200)


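# A sketch of the literal-to-node promotion (hypothetical values; assumes
# build_id_base and normalize_type behave as they are used above):
def _example_materialize_literal_nodes() -> None:
    doc = {
        "@graph": [
            {
                "@id": "https://data.example.org/review-1",
                "@type": [f"{_SCHEMA_BASE}/Review"],
                f"{_SCHEMA_BASE}/author": [{"@value": "Jane Doe"}],
            }
        ]
    }
    _materialize_literal_nodes(doc, "https://data.example.org", "https://example.com/r")
    # The literal author becomes an {"@id": ...} reference and a Person node
    # with schema:name "Jane Doe" is appended to the @graph.
    assert len(doc["@graph"]) == 2

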
def _ensure_review_url(data: dict[str, Any] | list[Any], url: str) -> None:
    nodes = _flatten_jsonld(data)
    if not nodes or not url:
        return
    url_key = f"{_SCHEMA_BASE}/url"
    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_types = {
            normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
        }
        if "Review" not in node_types:
            continue
        if url_key not in node:
            node[url_key] = [{"@value": url}]


def _ensure_author_name(
    data: dict[str, Any] | list[Any],
    xhtml: str,
    dataset_uri: str,
    url: str,
) -> None:
    author_name = _extract_author_name(xhtml)
    if not author_name:
        return
    nodes = _flatten_jsonld(data)
    if not nodes:
        return
    schema_name = f"{_SCHEMA_BASE}/name"
    schema_author = f"{_SCHEMA_BASE}/author"
    graph = data.get("@graph") if isinstance(data, dict) else None
    if not isinstance(graph, list):
        return

    author_nodes = [
        node
        for node in nodes
        if isinstance(node, dict)
        and "Person"
        in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
    ]
    for node in author_nodes:
        if schema_name not in node:
            node[schema_name] = [{"@value": author_name}]

    review_nodes = [
        node
        for node in nodes
        if isinstance(node, dict)
        and "Review"
        in {normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)}
    ]
    for review in review_nodes:
        if schema_author in review:
            continue
        author_node = build_id_base(dataset_uri, "Person", author_name, url, 0)
        graph.append(
            {
                "@id": author_node,
                "@type": [f"{_SCHEMA_BASE}/Person"],
                schema_name: [{"@value": author_name}],
                "@context": _SCHEMA_BASE,
            }
        )
        review[schema_author] = [{"@id": author_node}]


def _extract_author_name(xhtml: str) -> str | None:
    try:
        from lxml import html as lxml_html
    except Exception:
        return None
    parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
    try:
        doc = lxml_html.document_fromstring(xhtml, parser=parser)
    except Exception:
        return None

    candidates = [
        "//meta[@name='author']/@content",
        "//meta[@property='article:author']/@content",
        "//a[@rel='author']/text()",
        "//*[contains(normalize-space(.), 'Written by')]/following::a[1]/text()",
    ]
    for path in candidates:
        try:
            results = doc.xpath(path)
        except Exception:
            continue
        for value in results:
            if isinstance(value, str) and value.strip():
                return value.strip()
    return None


def _materialize_jsonld(mapping_path: Path) -> dict[str, Any] | list[Any]:
    graph = _materialize_graph(mapping_path)
    jsonld_str = graph.serialize(format="json-ld")
    return json.loads(jsonld_str)


def _ensure_subject_termtype_iri(mapping_path: Path) -> None:
    graph = Graph()
    graph.parse(mapping_path, format="turtle")
    for subject_map in graph.subjects(RDF.type, _RR.SubjectMap):
        graph.add((subject_map, _RR.termType, _RR.IRI))
    graph.serialize(destination=str(mapping_path), format="turtle")


def _normalize_reference_formulation(mapping_path: Path) -> None:
    graph = Graph()
    graph.parse(mapping_path, format="turtle")
    replaced = False
    for predicate in (_RML.referenceFormulation, _RML_LEGACY.referenceFormulation):
        for subject in list(graph.subjects(predicate, _QL.XPath)):
            graph.remove((subject, predicate, _QL.XPath))
            graph.add((subject, predicate, _RML.XPath))
            replaced = True
    if replaced:
        graph.serialize(destination=str(mapping_path), format="turtle")


def _flatten_jsonld(data: dict[str, Any] | list[Any]) -> list[dict[str, Any]]:
    if isinstance(data, list):
        return [node for node in data if isinstance(node, dict)]
    if "@graph" in data and isinstance(data["@graph"], list):
        return [node for node in data["@graph"] if isinstance(node, dict)]
    return [data] if isinstance(data, dict) else []


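# The three shapes _flatten_jsonld accepts (a quick sketch):
def _example_flatten_jsonld() -> None:
    assert _flatten_jsonld([{"@id": "a"}, "noise"]) == [{"@id": "a"}]
    assert _flatten_jsonld({"@graph": [{"@id": "a"}]}) == [{"@id": "a"}]
    assert _flatten_jsonld({"@id": "a"}) == [{"@id": "a"}]

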
def ensure_no_blank_nodes(graph: Graph) -> None:
    offenders: list[tuple[Identifier, Identifier, Identifier]] = []
    for subj, pred, obj in graph:
        if isinstance(subj, BNode) or isinstance(obj, BNode):
            offenders.append((subj, pred, obj))
            if len(offenders) >= 5:
                break
    if offenders:
        sample = "; ".join(f"{s} {p} {o}" for s, p, o in offenders)
        raise RuntimeError(
            "Blank nodes are not allowed in RDF output. "
            f"Found {len(offenders)} sample triples with blank nodes: {sample}"
        )


def _collect_jsonld_nodes(data: Any) -> list[dict[str, Any]]:
    nodes: list[dict[str, Any]] = []

    def _walk(value: Any) -> None:
        if isinstance(value, dict):
            if _is_jsonld_node(value):
                nodes.append(value)
            for child in value.values():
                _walk(child)
        elif isinstance(value, list):
            for item in value:
                _walk(item)

    _walk(data)
    return nodes


def _strip_iri_suffix(value: str) -> str:
    return value[:-4] if value.endswith("~iri") else value


def _normalize_iri_suffixes(data: Any) -> Any:
    if isinstance(data, dict):
        out: dict[str, Any] = {}
        for key, value in data.items():
            if key == "@id" and isinstance(value, str):
                out[key] = _strip_iri_suffix(value)
            else:
                out[key] = _normalize_iri_suffixes(value)
        return out
    if isinstance(data, list):
        return [_normalize_iri_suffixes(item) for item in data]
    return data


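# The "~iri" marker originates in the YARRRML mappings; only "@id" values are
# cleaned, other strings are left alone (a sketch):
def _example_normalize_iri_suffixes() -> None:
    data = {"@id": "http://example.com/review~iri", "note": "a~iri in a value stays"}
    out = _normalize_iri_suffixes(data)
    assert out["@id"] == "http://example.com/review"
    assert out["note"] == "a~iri in a value stays"

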
def _xpath_first_text(doc: Any, xpath: str) -> str | None:
    def _eval(path: str) -> list[Any]:
        return doc.xpath(path)

    try:
        result = _eval(xpath)
    except Exception:
        return None
    if not result:
        relaxed = _relax_xpath(xpath)
        if relaxed != xpath:
            try:
                result = _eval(relaxed)
            except Exception:
                return None
        if not result:
            return None
    for item in result:
        if isinstance(item, str):
            text = item.strip()
        elif hasattr(item, "text_content"):
            text = item.text_content().strip()
        else:
            text = str(item).strip()
        if text:
            return text
    return None


def _relax_xpath(value: str) -> str:
    relaxed = value
    relaxed = re.sub(r"@class=\"([^\"]+)\"", r'contains(@class, "\1")', relaxed)
    relaxed = re.sub(r"@id=\"([^\"]+)\"", r'contains(@id, "\1")', relaxed)
    relaxed = relaxed.replace("//div[", "//*[")
    relaxed = relaxed.replace("/div[", "/*[")
    relaxed = relaxed.replace("//p[", "//*[")
    relaxed = relaxed.replace("/p[", "/*[")
    return relaxed


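# What the relaxation does to a strict selector (a sketch with a hypothetical
# class name): exact attribute matches become contains() tests, and div/p tags
# become wildcards.
def _example_relax_xpath() -> None:
    strict = '//div[@class="post-title"]/h1/text()'
    relaxed = _relax_xpath(strict)
    assert relaxed == '//*[contains(@class, "post-title")]/h1/text()'

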
def _extract_list_items(doc: Any, xpaths: list[str]) -> list[str]:
    items: list[str] = []
    seen: set[str] = set()
    for path in xpaths:
        try:
            results = doc.xpath(path)
        except Exception:
            continue
        for item in results:
            if hasattr(item, "text_content"):
                text = item.text_content().strip()
            else:
                text = str(item).strip()
            if text:
                if text in seen:
                    continue
                seen.add(text)
                items.append(text)
    return items


def _build_item_list(items: list[str]) -> dict[str, Any]:
    entries = []
    for idx, name in enumerate(items, start=1):
        entries.append(
            {
                "@type": "ListItem",
                "position": idx,
                "name": name,
            }
        )
    return {"@type": "ItemList", "itemListElement": entries}


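# The ItemList shape produced above (a sketch with hypothetical item names):
def _example_build_item_list() -> None:
    assert _build_item_list(["Battery", "Display"]) == {
        "@type": "ItemList",
        "itemListElement": [
            {"@type": "ListItem", "position": 1, "name": "Battery"},
            {"@type": "ListItem", "position": 2, "name": "Display"},
        ],
    }

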
def _is_item_list_value(value: Any) -> bool:
    if not isinstance(value, list) or not value:
        return False
    item = value[0]
    if not isinstance(item, dict):
        return False
    return normalize_type(item.get("@type")) == "ItemList"


def _extract_rating_number(text: str | None) -> str | None:
    if not text:
        return None
    match = re.search(r"-?\d+(?:\.\d+)?", text)
    if not match:
        return None
    try:
        value = float(match.group(0))
    except ValueError:
        return None
    if value < 0 or value > 5:
        return None
    return match.group(0)


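# The helper keeps the first number only when it falls in the 0-5 star range
# (a sketch):
def _example_extract_rating_number() -> None:
    assert _extract_rating_number("Rated 4.5 out of 5") == "4.5"
    assert _extract_rating_number("9/10") is None  # outside the 0-5 range
    assert _extract_rating_number(None) is None

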
def _extract_rating_value(doc: Any) -> str | None:
    candidates = [
        "//*[@itemprop='ratingValue']/text()",
        "//*[@data-rating]/@data-rating",
        "//*[contains(@class, 'rating')]/text()",
        "//*[contains(@class, 'Rating')]/text()",
        "//*[contains(@id, 'rating')]/text()",
        "//*[contains(@id, 'Rating')]/text()",
        "//*[contains(@aria-label, 'rating')]/@aria-label",
        "//*[contains(@aria-label, 'star')]/@aria-label",
    ]
    for xpath in candidates:
        text = _xpath_first_text(doc, xpath)
        value = _extract_rating_number(text)
        if value is not None:
            return value
    return None


def enrich_graph_from_xhtml(graph: Graph, xhtml: str, url: str | None = None) -> None:
    try:
        from lxml import html as lxml_html
    except Exception:
        return
    parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
    try:
        doc = lxml_html.document_fromstring(xhtml, parser=parser)
    except Exception:
        return

    schema = Namespace(f"{_SCHEMA_BASE}/")
    review_type = URIRef(f"{_SCHEMA_BASE}/Review")
    review_nodes = list(graph.subjects(RDF.type, review_type))
    if not review_nodes:
        return

    title = (
        _xpath_first_text(doc, '/html/head/meta[@property="og:title"]/@content')
        or _xpath_first_text(doc, "/html/head/title/text()")
        or _xpath_first_text(doc, "//h1[1]")
    )
    description = _xpath_first_text(
        doc, '/html/head/meta[@property="og:description"]/@content'
    ) or _xpath_first_text(doc, '/html/head/meta[@name="description"]/@content')
    author_name = (
        _xpath_first_text(doc, '/html/head/meta[@name="author"]/@content')
        or _xpath_first_text(doc, '/html/head/meta[@property="author"]/@content')
        or _xpath_first_text(
            doc, '/html/head/meta[@property="article:author"]/@content'
        )
    )
    item_name = _xpath_first_text(doc, "//figure//img/@alt") or _xpath_first_text(
        doc, "//h1[1]"
    )

    for review in review_nodes:
        if url and graph.value(review, schema.url) is None:
            graph.add((review, schema.url, Literal(url)))
        if title and graph.value(review, schema.name) is None:
            graph.add((review, schema.name, Literal(title)))
        if description and graph.value(review, schema.description) is None:
            graph.add((review, schema.description, Literal(description)))

        author = graph.value(review, schema.author)
        if (
            author is not None
            and author_name
            and graph.value(author, schema.name) is None
        ):
            graph.add((author, schema.name, Literal(author_name)))

        item = graph.value(review, schema.itemReviewed)
        if item is not None and item_name and graph.value(item, schema.name) is None:
            graph.add((item, schema.name, Literal(item_name)))

        rating = graph.value(review, schema.reviewRating)
        if rating is not None and graph.value(rating, schema.ratingValue) is None:
            rating_value = _extract_rating_value(doc)
            if rating_value:
                graph.add((rating, schema.ratingValue, Literal(rating_value)))


def _fill_jsonld_from_mappings(
    data: dict[str, Any] | list[Any],
    mappings: list[dict[str, Any]],
    xhtml: str,
) -> dict[str, Any] | list[Any]:
    try:
        from lxml import html as lxml_html
    except Exception:
        return data
    parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
    try:
        doc = lxml_html.document_fromstring(xhtml, parser=parser)
    except Exception:
        return data

    nodes = _flatten_jsonld(data)
    node_by_id: dict[str, dict[str, Any]] = {
        str(node.get("@id")): node
        for node in nodes
        if isinstance(node, dict) and node.get("@id")
    }
    for node_id, node in list(node_by_id.items()):
        if node_id.endswith("~iri"):
            node_by_id.setdefault(node_id[: -len("~iri")], node)

    def _author_url_fallback() -> str | None:
        return _xpath_first_text(doc, "/html/head/link[@rel='author']/@href")

    author_url_fallback = _author_url_fallback()

    for mapping in mappings:
        name = mapping.get("name")
        if not name:
            continue
        node_id = f"http://example.com/{name}~iri"
        node = node_by_id.get(node_id) or node_by_id.get(f"http://example.com/{name}")
        if node is None:
            continue
        for prop, obj in mapping.get("props", []):
            prop_name = prop[7:] if prop.startswith("schema:") else prop
            if prop_name in {"a", "url"}:
                continue
            full_prop = f"{_SCHEMA_BASE}/{prop_name}"
            if full_prop in node:
                continue
            if not obj:
                continue
            if obj.startswith("ex:") and obj.endswith("~iri"):
                target = obj.split("ex:", 1)[1].split("~", 1)[0]
                node[full_prop] = [{"@id": f"http://example.com/{target}"}]
                continue
            if _looks_like_xpath(obj):
                xpath = _normalize_xpath_reference(_simplify_xpath(obj))
                text = _xpath_first_text(doc, xpath)
                if text:
                    if prop_name in {"ratingValue", "bestRating", "worstRating"}:
                        match = re.search(r"-?\d+(?:\.\d+)?", text)
                        if not match:
                            continue
                        text = match.group(0)
                    node[full_prop] = [{"@value": text}]
                    if prop_name == "name":
                        node_type = node.get("@type") or []
                        node_types = {
                            normalize_type(t) for t in node_type if isinstance(t, str)
                        }
                        if node_types & {"Person", "Organization"}:
                            url_xpath = f"{xpath}/@href"
                            url_value = _xpath_first_text(doc, url_xpath)
                            if url_value:
                                node[f"{_SCHEMA_BASE}/url"] = [{"@value": url_value}]
                            elif author_url_fallback:
                                node[f"{_SCHEMA_BASE}/url"] = [
                                    {"@value": author_url_fallback}
                                ]
                continue
            if obj:
                node[full_prop] = [{"@value": obj}]

    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_types = {
            normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
        }
        if "Review" not in node_types:
            continue
        review_rating_prop = f"{_SCHEMA_BASE}/reviewRating"
        review_rating_pairs: list[tuple[dict[str, Any], dict[str, Any]]] = []
        for rating_ref in node.get(review_rating_prop, []):
            if isinstance(rating_ref, dict) and rating_ref.get("@id"):
                rating_node = node_by_id.get(str(rating_ref["@id"]))
                if isinstance(rating_node, dict):
                    review_rating_pairs.append((rating_ref, rating_node))
        valid_rating_refs: list[dict[str, Any]] = []
        for rating_ref, rating_node in review_rating_pairs:
            rating_value_key = f"{_SCHEMA_BASE}/ratingValue"
            if rating_value_key not in rating_node:
                rating_value = _extract_rating_value(doc)
                if rating_value:
                    rating_node[rating_value_key] = [{"@value": rating_value}]
            if rating_value_key in rating_node:
                valid_rating_refs.append(rating_ref)
        if review_rating_pairs:
            if valid_rating_refs:
                node[review_rating_prop] = valid_rating_refs
            else:
                node.pop(review_rating_prop, None)
        if f"{_SCHEMA_BASE}/description" not in node:
            description = _xpath_first_text(
                doc, '/html/head/meta[@property="og:description"]/@content'
            ) or _xpath_first_text(doc, '/html/head/meta[@name="description"]/@content')
            if description:
                node[f"{_SCHEMA_BASE}/description"] = [{"@value": description}]
    return data


def _extract_type(node: dict[str, Any]) -> str | None:
    raw = node.get("@type")
    if isinstance(raw, list) and raw:
        raw = raw[0]
    if isinstance(raw, str):
        return normalize_type(raw)
    return None


def _extract_name(node: dict[str, Any]) -> str | None:
    for key in ("name", "headline", "title"):
        value = node.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return None


def _extract_text_value(value: Any) -> str | None:
    if isinstance(value, str) and value.strip():
        return value.strip()
    if isinstance(value, dict):
        raw = value.get("@value") or value.get("@id")
        if isinstance(raw, str) and raw.strip():
            return raw.strip()
    if isinstance(value, list):
        for item in value:
            text = _extract_text_value(item)
            if text:
                return text
    return None


def _extract_name_any(node: dict[str, Any]) -> str | None:
    name = _extract_name(node)
    if name:
        return name
    for key in (
        f"{_SCHEMA_BASE}/name",
        f"{_SCHEMA_BASE}/headline",
        f"{_SCHEMA_BASE}/title",
    ):
        value = _extract_text_value(node.get(key))
        if value:
            return value
    return None


def _extract_url_any(node: dict[str, Any]) -> str | None:
    value = node.get("url")
    text = _extract_text_value(value)
    if text:
        return text
    value = node.get(f"{_SCHEMA_BASE}/url")
    text = _extract_text_value(value)
    if text:
        return text
    return None


def _local_prop_name(name: str) -> str:
    if name.startswith(_SCHEMA_BASE):
        return name.rsplit("/", 1)[-1]
    if name.startswith("schema:"):
        return name.split(":", 1)[-1]
    return name


_INDEPENDENT_PROPERTIES = {
    "author",
    "creator",
    "publisher",
    "editor",
    "contributor",
    "copyrightHolder",
    "brand",
    "manufacturer",
    "provider",
    "seller",
    "organizer",
    "performer",
    "actor",
    "director",
    "producer",
    "member",
    "memberOf",
    "affiliation",
    "parentOrganization",
    "subOrganization",
    "alumniOf",
    "sponsor",
    "about",
    "mentions",
    "mainEntity",
    "mainEntityOfPage",
    "isPartOf",
    "partOfSeries",
    "location",
    "areaServed",
}


def _is_jsonld_node(node: dict[str, Any]) -> bool:
    if "@type" in node:
        return True
    return any(isinstance(key, str) and key.startswith(_SCHEMA_BASE) for key in node)


def _ensure_node_ids(
    data: dict[str, Any] | list[Any],
    dataset_uri: str,
    url: str,
) -> None:
    seen: set[str] = set()
    replacements: dict[str, str] = {}

    def _collect(value: Any) -> None:
        if isinstance(value, dict):
            node_id = value.get("@id")
            if isinstance(node_id, str) and node_id and not node_id.startswith("_:"):
                seen.add(node_id)
            for child in value.values():
                _collect(child)
        elif isinstance(value, list):
            for item in value:
                _collect(item)

    def _assign(
        value: Any,
        counter: list[int],
        parent_id: str | None = None,
        prop_name: str | None = None,
    ) -> None:
        if isinstance(value, dict):
            if _is_jsonld_node(value):
                node_id = value.get("@id")
                local_prop = _local_prop_name(prop_name or "")
                use_parent = bool(
                    parent_id
                    and local_prop
                    and local_prop not in _INDEPENDENT_PROPERTIES
                )
                base_uri = parent_id if use_parent else dataset_uri
                needs_id = (
                    not isinstance(node_id, str)
                    or not node_id
                    or node_id.startswith("_:")
                    or (use_parent and not node_id.startswith(parent_id or ""))
                    or (not use_parent and not node_id.startswith(dataset_uri))
                )
                if needs_id:
                    type_name = _extract_type(value) or "Thing"
                    name = _extract_name_any(value)
                    if type_name == "ListItem":
                        position = _extract_text_value(
                            value.get("position")
                        ) or _extract_text_value(value.get(f"{_SCHEMA_BASE}/position"))
                        if position:
                            name = f"item-{position}"
                    if not name:
                        name = _dash_type(type_name)
                    node_url = _extract_url_any(value)
                    base_id = build_id_base(
                        base_uri, type_name, name, node_url, counter[0]
                    )
                    candidate = base_id
                    suffix = 1
                    while candidate in seen:
                        suffix += 1
                        candidate = f"{base_id}-{suffix}"
                    if isinstance(node_id, str) and node_id and node_id != candidate:
                        replacements[node_id] = candidate
                        if node_id.endswith("~iri"):
                            replacements[node_id[: -len("~iri")]] = candidate
                        else:
                            replacements[f"{node_id}~iri"] = candidate
                    value["@id"] = candidate
                    seen.add(candidate)
                    counter[0] += 1
            current_id = (
                value.get("@id") if isinstance(value.get("@id"), str) else parent_id
            )
            for key, child in value.items():
                if key in ("@id", "@type"):
                    continue
                _assign(
                    child,
                    counter,
                    current_id if isinstance(current_id, str) else None,
                    key,
                )
        elif isinstance(value, list):
            for item in value:
                _assign(item, counter, parent_id, prop_name)

    def _replace(value: Any) -> None:
        if isinstance(value, dict):
            node_id = value.get("@id")
            if isinstance(node_id, str) and node_id in replacements:
                value["@id"] = replacements[node_id]
            for child in value.values():
                _replace(child)
        elif isinstance(value, list):
            for item in value:
                _replace(item)

    _collect(data)
    _assign(data, [1])
    if replacements:
        _replace(data)


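# A sketch of the id assignment (hypothetical values; the exact @id string
# depends on build_id_base, defined elsewhere in this module):
def _example_ensure_node_ids() -> None:
    doc = {"@graph": [{"@id": "_:b0", "@type": ["Review"], "name": "Great phone"}]}
    _ensure_node_ids(doc, "https://data.example.org", "https://example.com/review")
    new_id = doc["@graph"][0]["@id"]
    # The blank-node id is replaced by a deterministic IRI under the dataset URI.
    assert not new_id.startswith("_:")

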
def _blank_node_errors(data: dict[str, Any] | list[Any]) -> list[str]:
    errors: list[str] = []

    def _walk(value: Any) -> None:
        if isinstance(value, dict):
            if _is_jsonld_node(value):
                node_id = value.get("@id")
                if (
                    not isinstance(node_id, str)
                    or not node_id
                    or node_id.startswith("_:")
                ):
                    errors.append(
                        "JSON-LD node missing @id or uses a blank node identifier"
                    )
            for child in value.values():
                _walk(child)
        elif isinstance(value, list):
            for item in value:
                _walk(item)

    _walk(data)
    return errors


def _review_rating_dropped(
    data: dict[str, Any] | list[Any],
    mappings: list[dict[str, Any]],
    target_type: str | None,
) -> bool:
    target = normalize_type(target_type or "Thing")
    if target != "Review":
        return False
    mapped_props = _main_mapping_props(mappings)
    if "reviewRating" not in mapped_props:
        return False
    nodes = _flatten_jsonld(data)
    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_types = {
            normalize_type(t) for t in node.get("@type", []) if isinstance(t, str)
        }
        if "Review" in node_types:
            return f"{_SCHEMA_BASE}/reviewRating" not in node
    return False


def _build_id_map(
    nodes: list[dict[str, Any]],
    dataset_uri: str,
    url: str,
) -> dict[str, str]:
    id_map: dict[str, str] = {}
    for idx, node in enumerate(nodes):
        old_id = node.get("@id") or f"_:b{idx}"
        type_name = _extract_type(node) or "Thing"
        name = _extract_name_any(node) or _dash_type(type_name)
        node_url = _extract_url_any(node)
        if isinstance(old_id, str) and old_id.startswith(dataset_uri):
            new_id = old_id
        else:
            new_id = build_id(dataset_uri, type_name, name, node_url, idx + 1)
        if old_id in id_map:
            new_id = f"{new_id}-{idx}"
        id_map[str(old_id)] = new_id
    return id_map


def _rewrite_refs(
    value: Any,
    id_map: dict[str, str],
    node_map: dict[str, dict[str, Any]],
    *,
    embed_nodes: bool,
) -> Any:
    if isinstance(value, dict):
        if "@id" in value and isinstance(value["@id"], str):
            ref_id = id_map.get(value["@id"], value["@id"])
            if embed_nodes and ref_id in node_map:
                return node_map[ref_id]
            return {"@id": ref_id}
        return {
            k: _rewrite_refs(v, id_map, node_map, embed_nodes=embed_nodes)
            for k, v in value.items()
        }
    if isinstance(value, list):
        return [
            _rewrite_refs(item, id_map, node_map, embed_nodes=embed_nodes)
            for item in value
        ]
    return value


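# Embed vs. reference mode in one picture (a sketch with hypothetical ids):
def _example_rewrite_refs() -> None:
    person_id = "https://data.example.org/person-1"
    node_map = {person_id: {"@id": person_id, "name": "Ada"}}
    id_map = {"_:b1": person_id}
    # embed_nodes=True inlines the full node; False keeps a bare reference.
    assert _rewrite_refs({"@id": "_:b1"}, id_map, node_map, embed_nodes=True) == node_map[person_id]
    assert _rewrite_refs({"@id": "_:b1"}, id_map, node_map, embed_nodes=False) == {"@id": person_id}

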
def _main_mapping_props(mappings: list[dict[str, Any]]) -> set[str]:
    schema_props = _schema_property_set()
    for mapping in mappings:
        if mapping.get("__main__"):
            props = mapping.get("props") or []
            clean: set[str] = set()
            for prop, _ in props:
                if not isinstance(prop, str):
                    continue
                name = prop[7:] if prop.startswith("schema:") else prop
                if "~" in name or "http" in name or name == "a":
                    continue
                base = name.split(".", 1)[0]
                if base in schema_props or name in schema_props:
                    clean.add(base)
            return clean
    return set()


def _missing_required_props(
    required_props: list[str],
    mapped_props: set[str],
) -> list[str]:
    missing: set[str] = set()
    mapped_base = {prop.split(".", 1)[0] for prop in mapped_props}
    for prop in required_props:
        base = prop.split(".", 1)[0]
        if base not in mapped_base:
            missing.add(prop)
    return sorted(missing)


def _missing_recommended_props(
    recommended_props: list[str],
    mapped_props: set[str],
) -> list[str]:
    missing: set[str] = set()
    mapped_base = {prop.split(".", 1)[0] for prop in mapped_props}
    for prop in recommended_props:
        base = prop.split(".", 1)[0]
        if base not in mapped_base:
            missing.add(prop)
    return sorted(missing)


def _google_allowed_properties(
    property_guides: dict[str, dict[str, list[str]]],
) -> dict[str, list[str]]:
    allowed: dict[str, list[str]] = {}
    for type_name, guide in property_guides.items():
        props = set(guide.get("required", [])) | set(guide.get("recommended", []))
        if type_name == "Review":
            props |= _REVIEW_OPTIONAL_EXTRAS
        props = sorted(props)
        allowed[type_name] = props
    return allowed


def _mapping_allowed_property_set(
    property_guides: dict[str, dict[str, list[str]]],
) -> set[str]:
    props: set[str] = set()
    for guide in property_guides.values():
        for name in guide.get("required", []) + guide.get("recommended", []):
            props.add(name.split(".", 1)[0])
    props |= _REVIEW_OPTIONAL_EXTRAS
    return props


def _mapping_violations(
    mappings: list[dict[str, Any]],
    allowed_props: set[str],
    target_type: str,
) -> list[str]:
    errors: list[str] = []
    for mapping in mappings:
        map_name = mapping.get("name", "mapping")
        map_type = normalize_type(mapping.get("type") or "")
        for prop, obj in mapping.get("props", []):
            prop_name = prop[7:] if prop.startswith("schema:") else prop
            base = prop_name.split(".", 1)[0]
            if base in {"a", "url"}:
                continue
            if "/" in base or base.startswith("http"):
                continue
            if base not in allowed_props:
                errors.append(f"{map_name}: property not allowed by Google: {base}")
            if _looks_like_xpath(obj):
                continue
            if (
                base in {"author", "reviewRating", "itemReviewed"}
                and obj.startswith("ex:")
                and obj.endswith("~iri")
            ):
                continue
            if obj.startswith("ex:") and obj.endswith("~iri"):
                continue
            errors.append(f"{map_name}: hard-coded literal for {base} is not allowed")
        if map_type == "Review":
            review_rating = [
                obj
                for prop, obj in mapping.get("props", [])
                if prop.endswith("reviewRating")
            ]
            for obj in review_rating:
                if not (obj.startswith("ex:") and obj.endswith("~iri")):
                    errors.append(f"{map_name}: reviewRating must map to a Rating node")
    return errors


def _xpath_evidence_errors(
    mappings: list[dict[str, Any]],
    xhtml: str,
) -> list[str]:
    try:
        from lxml import html as lxml_html
    except Exception:
        return []
    parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
    try:
        doc = lxml_html.document_fromstring(xhtml, parser=parser)
    except Exception:
        return []
    errors: list[str] = []
    for mapping in mappings:
        map_name = mapping.get("name", "mapping")
        for prop, obj in mapping.get("props", []):
            prop_name = prop[7:] if prop.startswith("schema:") else prop
            if prop_name == "url":
                continue
            if not _looks_like_xpath(obj):
                continue
            try:
                result = doc.xpath(_simplify_xpath(obj))
            except Exception:
                errors.append(f"{map_name}: invalid XPath for {prop_name}")
                continue
            if not result:
                errors.append(f"{map_name}: XPath returned no results for {prop_name}")
                continue
            if isinstance(result, list) and all(
                (isinstance(item, str) and not item.strip()) for item in result
            ):
                errors.append(f"{map_name}: XPath returned empty text for {prop_name}")
    return errors


def _xpath_reusability_warnings(mappings: list[dict[str, Any]]) -> list[str]:
    warnings: list[str] = []
    id_with_digits = re.compile(r"@id\s*=\s*['\"][^'\"]*\d[^'\"]*['\"]")
    for mapping in mappings:
        map_name = mapping.get("name", "mapping")
        for prop, obj in mapping.get("props", []):
            prop_name = prop[7:] if prop.startswith("schema:") else prop
            if not _looks_like_xpath(obj):
                continue
            candidate = obj.replace('\\"', '"').replace("\\'", "'")
            if id_with_digits.search(candidate):
                warnings.append(
                    f"{map_name}: XPath for {prop_name} uses a numeric @id; prefer a reusable selector."
                )
    return warnings


def _mapping_type_sanity(
    mappings: list[dict[str, Any]],
    expected_types: dict[str, tuple[str, ...]],
) -> list[str]:
    errors: list[str] = []
    mapping_types = {
        m.get("name"): normalize_type(m.get("type") or "") for m in mappings
    }
    for mapping in mappings:
        map_name = mapping.get("name")
        for prop, obj in mapping.get("props", []):
            prop_name = prop[7:] if prop.startswith("schema:") else prop
            if obj.startswith("ex:") and obj.endswith("~iri"):
                target = obj.split("ex:", 1)[1].split("~", 1)[0]
                expected = expected_types.get(prop_name)
                if expected:
                    actual = mapping_types.get(target, "")
                    if actual and actual not in expected:
                        errors.append(
                            f"{map_name}: {prop_name} must map to {expected}, got {actual}"
                        )
    return errors


def _format_result_path(value: Identifier | None) -> str:
    if value is None:
        return "unknown"
    if isinstance(value, URIRef):
        return _short_schema_name(value) or str(value)
    return str(value)


def _validation_messages(
    result: ValidationResult, max_items: int = 20
) -> tuple[list[str], list[str]]:
    errors: list[str] = []
    warnings: list[str] = []
    count = 0
    for res in result.report_graph.subjects(RDF.type, _SH.ValidationResult):
        severity = result.report_graph.value(res, _SH.resultSeverity)
        message = result.report_graph.value(res, _SH.resultMessage)
        path = result.report_graph.value(res, _SH.resultPath)
        source_shape = result.report_graph.value(res, _SH.sourceShape)
        source_label = result.shape_source_map.get(source_shape, "unknown")
        line = f"{_format_result_path(path)}: {message or 'validation error'} (shape: {source_label})"
        if severity == _SH.Warning:
            warnings.append(line)
        else:
            errors.append(line)
        count += 1
        if count >= max_items:
            break
    return errors, warnings


def _validation_messages_for_types(
    result: ValidationResult,
    allowed_types: set[str],
    max_items: int = 20,
) -> tuple[list[str], list[str]]:
    errors: list[str] = []
    warnings: list[str] = []
    count = 0
    for res in result.report_graph.subjects(RDF.type, _SH.ValidationResult):
        severity = result.report_graph.value(res, _SH.resultSeverity)
        message = result.report_graph.value(res, _SH.resultMessage)
        path = result.report_graph.value(res, _SH.resultPath)
        source_shape = result.report_graph.value(res, _SH.sourceShape)
        source_label = result.shape_source_map.get(source_shape, "unknown")
        focus = result.report_graph.value(res, _SH.focusNode)
        focus_types: set[str] = set()
        if focus is not None:
            for t in result.data_graph.objects(focus, RDF.type):
                if isinstance(t, URIRef):
                    focus_types.add(normalize_type(str(t)))
        relevant = not focus_types or bool(focus_types & allowed_types)
        line = f"{_format_result_path(path)}: {message or 'validation error'} (shape: {source_label})"
        if severity == _SH.Warning or not relevant:
            warnings.append(line)
        else:
            errors.append(line)
        count += 1
        if count >= max_items:
            break
    return errors, warnings


def normalize_jsonld(
    data: dict[str, Any] | list[Any],
    dataset_uri: str,
    url: str,
    target_type: str | None,
    *,
    embed_nodes: bool = True,
) -> dict[str, Any]:
    data = _normalize_iri_suffixes(data)
    nodes = _collect_jsonld_nodes(data)
    if not nodes:
        raise RuntimeError("No JSON-LD nodes produced by morph-kgc.")

    target = normalize_type(target_type) if target_type else None
    main_node: dict[str, Any] | None = None
    main_old_id = "_:b0"
    for idx, node in enumerate(nodes):
        node_type = _extract_type(node)
        if target and node_type == target:
            main_node = node
            main_old_id = str(node.get("@id") or f"_:b{idx}")
            break
    if main_node is None:
        main_node = nodes[0]
        main_old_id = str(main_node.get("@id") or "_:b0")

    id_map = _build_id_map(nodes, dataset_uri, url)
    node_map: dict[str, dict[str, Any]] = {}
    for idx, node in enumerate(nodes):
        old_id = str(node.get("@id") or f"_:b{idx}")
        new_id = id_map[old_id]
        node["@id"] = new_id
        node_map[new_id] = node

    for node in nodes:
        for key, value in list(node.items()):
            if key in ("@id", "@type"):
                continue
            node[key] = _rewrite_refs(value, id_map, node_map, embed_nodes=embed_nodes)

    main_id = id_map.get(main_old_id)
    if not main_id:
        raise RuntimeError("Failed to resolve main node @id.")
    if embed_nodes:
        main = node_map[main_id]
        main["@context"] = _SCHEMA_BASE
        blank_nodes = _blank_node_errors(main)
        if blank_nodes:
            raise RuntimeError("Blank nodes are not allowed in JSON-LD output.")
        return main
    for node in node_map.values():
        node.setdefault("@context", _SCHEMA_BASE)
    graph = {"@context": _SCHEMA_BASE, "@graph": list(node_map.values())}
    blank_nodes = _blank_node_errors(graph)
    if blank_nodes:
        raise RuntimeError("Blank nodes are not allowed in JSON-LD output.")
    return graph


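# The two output modes in brief (a sketch; assumes build_id, _dash_type, and
# normalize_type behave as they are used above): embed_nodes=True returns the
# main node with children embedded and a single "@context"; embed_nodes=False
# returns a flat @graph document where every node keeps its own @id.
def _example_normalize_jsonld_shapes() -> None:
    raw = {"@id": "http://example.com/review~iri", "@type": ["https://schema.org/Review"]}
    flat = normalize_jsonld(
        raw, "https://data.example.org", "https://example.com/r", "Review", embed_nodes=False
    )
    assert flat["@context"] == _SCHEMA_BASE
    assert isinstance(flat["@graph"], list)

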
def generate_from_agent(
|
|
2586
|
+
url: str,
|
|
2587
|
+
html: str,
|
|
2588
|
+
xhtml: str,
|
|
2589
|
+
cleaned_xhtml: str,
|
|
2590
|
+
api_key: str,
|
|
2591
|
+
dataset_uri: str,
|
|
2592
|
+
target_type: str | None,
|
|
2593
|
+
workdir: Path,
|
|
2594
|
+
debug: bool = False,
|
|
2595
|
+
max_retries: int = 2,
|
|
2596
|
+
max_nesting_depth: int = 2,
|
|
2597
|
+
quality_check: bool = True,
|
|
2598
|
+
log: Callable[[str], None] | None = None,
|
|
2599
|
+
) -> tuple[str, dict[str, Any]]:
|
|
2600
|
+
debug_path = workdir / "agent_debug.json" if debug else None
|
|
2601
|
+
target_name = normalize_type(target_type or "Thing")
|
|
2602
|
+
property_guides = property_guides_with_related(target_name, max_nesting_depth)
|
|
2603
|
+
allowed_properties = _google_allowed_properties(property_guides)
|
|
2604
|
+
allowed_property_set = _mapping_allowed_property_set(property_guides)
|
|
2605
|
+
workdir.mkdir(parents=True, exist_ok=True)
|
|
2606
|
+
requirements_path = workdir / "requirements.json"
|
|
2607
|
+
requirements_path.write_text(
|
|
2608
|
+
json.dumps(
|
|
2609
|
+
{
|
|
2610
|
+
"target_type": target_name,
|
|
2611
|
+
"max_depth": max_nesting_depth,
|
|
2612
|
+
"types": property_guides,
|
|
2613
|
+
"allowed_properties": allowed_properties,
|
|
2614
|
+
},
|
|
2615
|
+
indent=2,
|
|
2616
|
+
)
|
|
2617
|
+
)
|
|
2618
|
+
html_path = (workdir / "rendered.html").resolve()
|
|
2619
|
+
html_path.write_text(html)
|
|
2620
|
+
xhtml_path = (workdir / "page.xhtml").resolve()
|
|
2621
|
+
xhtml_path.write_text(xhtml)
|
|
2622
|
+
cleaned_path = (workdir / "page.cleaned.xhtml").resolve()
|
|
2623
|
+
cleaned_path.write_text(cleaned_xhtml)
|
|
2624
|
+
|
|
2625
|
+
shape_specs = shape_specs_for_types(list(property_guides.keys()))
|
|
2626
|
+
mapping_validation_path = workdir / "mapping.validation.json"
|
|
2627
|
+
mapping_jsonld_path = workdir / "mapping.jsonld"
|
|
2628
|
+
|
|
2629
|
+
yarrml = ""
|
|
2630
|
+
mappings: list[dict[str, Any]] = []
|
|
2631
|
+
missing_required: list[str] = []
|
|
2632
|
+
previous_yarrml: str | None = None
|
|
2633
|
+
validation_errors: list[str] | None = None
|
|
2634
|
+
validation_report: list[str] | None = None
|
|
2635
|
+
missing_recommended: list[str] = []
|
|
2636
|
+
xpath_warnings: list[str] = []
|
|
2637
|
+
quality_feedback: list[str] | None = None
|
|
2638
|
+
quality_score: int | None = None
|
|
2639
|
+
jsonld_raw: dict[str, Any] | list[Any] | None = None
|
|
2640
|
+
normalized_jsonld: dict[str, Any] | None = None
|
|
2641
|
+
|
|
2642
|
+
for attempt in range(max_retries + 1):
|
|
2643
|
+
yarrml = ask_agent_for_yarrml(
|
|
2644
|
+
api_key,
|
|
2645
|
+
url,
|
|
2646
|
+
cleaned_xhtml,
|
|
2647
|
+
target_type,
|
|
2648
|
+
debug=debug,
|
|
2649
|
+
debug_path=debug_path,
|
|
2650
|
+
property_guides=property_guides,
|
|
2651
|
+
missing_required=missing_required if attempt > 0 else None,
|
|
2652
|
+
missing_recommended=missing_recommended if attempt > 0 else None,
|
|
2653
|
+
previous_yarrml=previous_yarrml if attempt > 0 else None,
|
|
2654
|
+
validation_errors=validation_errors if attempt > 0 else None,
|
|
2655
|
+
validation_report=validation_report if attempt > 0 else None,
|
|
2656
|
+
xpath_warnings=xpath_warnings if attempt > 0 else None,
|
|
2657
|
+
allow_properties=allowed_properties,
|
|
2658
|
+
quality_feedback=quality_feedback if attempt > 0 else None,
|
|
2659
|
+
)
|
|
2660
|
+
|
|
2661
|
+
yarrml, mappings = _normalize_agent_yarrml(
|
|
2662
|
+
yarrml,
|
|
2663
|
+
url,
|
|
2664
|
+
cleaned_path.as_posix(),
|
|
2665
|
+
target_type,
|
|
2666
|
+
)
|
|
2667
|
+
yarrml_path = workdir / "mapping.yarrml"
|
|
2668
|
+
rml_path = workdir / "mapping.ttl"
|
|
2669
|
+
yarrml_path.write_text(yarrml)
|
|
2670
|
+
|
|
+        try:
+            _run_yarrrml_parser(yarrml_path, rml_path)
+            _ensure_subject_termtype_iri(rml_path)
+            _normalize_reference_formulation(rml_path)
+            jsonld_raw = _materialize_jsonld(rml_path)
+            jsonld_raw = _fill_jsonld_from_mappings(jsonld_raw, mappings, cleaned_xhtml)
+            _ensure_node_ids(jsonld_raw, dataset_uri, url)
+            mapping_jsonld_path.write_text(json.dumps(jsonld_raw, indent=2))
+            normalized_jsonld = postprocess_jsonld(
+                jsonld_raw,
+                mappings,
+                cleaned_xhtml,
+                dataset_uri,
+                url,
+                target_type=target_type,
+            )
+            final_jsonld_path = workdir / "structured-data.jsonld"
+            final_jsonld_path.write_text(json.dumps(normalized_jsonld, indent=2))
+            validation_result = validate_file(
+                str(final_jsonld_path), shape_specs=shape_specs
+            )
+            errors, warnings = _validation_messages_for_types(
+                validation_result,
+                set(property_guides.keys()),
+            )
+            validation_errors = errors or None
+            validation_report = (
+                validation_result.report_text.splitlines()
+                if validation_result
+                else None
+            )
+        except Exception as exc:
+            mapping_validation_path.write_text(
+                json.dumps(
+                    {
+                        "conforms": False,
+                        "warning_count": 0,
+                        "errors": [str(exc)],
+                        "warnings": [],
+                    },
+                    indent=2,
+                )
+            )
+            validation_errors = [str(exc)]
+            validation_report = None
+            previous_yarrml = yarrml
+            continue
+
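
Taken together, the try/except above is one full materialization attempt; a failure at any stage is recorded in mapping.validation.json and the loop moves on to the next retry:

    # One attempt, stage by stage (as in the try-block above):
    #   mapping.yarrml  --_run_yarrrml_parser-->    mapping.ttl (RML)
    #   mapping.ttl     --_materialize_jsonld-->    raw JSON-LD (mapping.jsonld)
    #   raw JSON-LD     --postprocess_jsonld-->     structured-data.jsonld
    #   final JSON-LD   --validate_file (SHACL)-->  errors and warnings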
+        mapped_props = _main_mapping_props(mappings)
+        required_props = property_guides.get(target_name, {}).get("required", [])
+        recommended_props = property_guides.get(target_name, {}).get("recommended", [])
+        missing_required = _missing_required_props(required_props, mapped_props)
+        missing_recommended = _missing_recommended_props(
+            recommended_props, mapped_props
+        )
+        mapping_errors: list[str] = []
+        mapping_errors.extend(
+            _mapping_violations(mappings, allowed_property_set, target_name)
+        )
+        evidence_warnings = _xpath_evidence_errors(mappings, cleaned_xhtml)
+        reusability_warnings = _xpath_reusability_warnings(mappings)
+        expected_types = {
+            "reviewRating": ("Rating",),
+            "author": ("Person", "Organization"),
+        }
+        mapping_errors.extend(_mapping_type_sanity(mappings, expected_types))
+        if reusability_warnings:
+            mapping_errors.extend(reusability_warnings)
+        warnings_out: list[str] = list(warnings) if "warnings" in locals() else []
+        if evidence_warnings:
+            warnings_out.extend(evidence_warnings)
+        if reusability_warnings:
+            warnings_out.extend(reusability_warnings)
+        if missing_required:
+            warnings_out.append(
+                f"Missing required properties: {', '.join(missing_required)}"
+            )
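
The coverage checks above compare the properties the mapping actually emits against the required and recommended lists in the property guide. Assuming the helpers are plain set differences (their bodies are not part of this hunk), they behave like:

    # Hypothetical reading of the coverage helpers; the real bodies live
    # elsewhere in engine.py and may differ.
    def missing_props(guide_props: list[str], mapped_props: set[str]) -> list[str]:
        return [prop for prop in guide_props if prop not in mapped_props]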
+        if mapping_errors:
+            mapping_validation_path.write_text(
+                json.dumps(
+                    {
+                        "conforms": False,
+                        "warning_count": validation_result.warning_count
+                        if "validation_result" in locals()
+                        else 0,
+                        "errors": mapping_errors,
+                        "warnings": warnings_out,
+                        "shacl_errors": errors if "errors" in locals() else [],
+                    },
+                    indent=2,
+                )
+            )
+            validation_errors = mapping_errors
+            validation_report = None
+            xpath_warnings = warnings_out
+        else:
+            if _review_rating_dropped(jsonld_raw, mappings, target_type):
+                warnings_out.append("Review ratingValue missing; reviewRating dropped.")
+            mapping_validation_path.write_text(
+                json.dumps(
+                    {
+                        "conforms": validation_result.conforms
+                        if "validation_result" in locals()
+                        else True,
+                        "warning_count": validation_result.warning_count
+                        if "validation_result" in locals()
+                        else 0,
+                        "errors": errors if "errors" in locals() else [],
+                        "warnings": warnings_out,
+                    },
+                    indent=2,
+                )
+            )
+            validation_errors = errors or None
+            validation_report = (
+                validation_result.report_text.splitlines()
+                if validation_result
+                else None
+            )
+            xpath_warnings = warnings_out
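
Both branches leave a machine-readable verdict in mapping.validation.json. From the two write_text calls above, its shape is:

    # {
    #   "conforms": bool,        # always False on the mapping-error path
    #   "warning_count": int,    # taken from the SHACL result when available
    #   "errors": [...],         # mapping errors, or SHACL errors otherwise
    #   "warnings": [...],       # coverage, XPath-evidence, reusability warnings
    #   "shacl_errors": [...]    # written only on the mapping-error path
    # }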
+        if quality_check:
+            quality_score = None
+            quality_feedback = None
+            try:
+                quality_payload = ask_agent_for_quality(
+                    api_key,
+                    url,
+                    cleaned_xhtml,
+                    normalized_jsonld,
+                    property_guides,
+                    target_type,
+                )
+            except RuntimeError:
+                quality_payload = None
+            if isinstance(quality_payload, dict):
+                score = quality_payload.get("score")
+                if isinstance(score, (int, float)):
+                    quality_score = int(score)
+                missing = quality_payload.get("missing_in_jsonld")
+                notes = quality_payload.get("notes")
+                suggested = quality_payload.get("suggested_xpath")
+                feedback: list[str] = []
+                if isinstance(missing, list) and missing:
+                    feedback.append("Missing in JSON-LD (present in XHTML):")
+                    feedback.extend([str(item) for item in missing])
+                if isinstance(suggested, dict) and suggested:
+                    feedback.append("Suggested XPath for missing properties:")
+                    for key, value in suggested.items():
+                        feedback.append(f"- {key}: {value}")
+                if isinstance(notes, list) and notes:
+                    feedback.append("Notes:")
+                    feedback.extend([str(item) for item in notes])
+                if feedback:
+                    feedback.append(
+                        f"Quality score: {quality_score}"
+                        if quality_score is not None
+                        else "Quality score unavailable"
+                    )
+                    quality_feedback = feedback
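
The parsing above implies that the quality agent answers with a JSON object of the following shape; the concrete values here are invented for illustration:

    # quality_payload, as consumed above (values are made up):
    # {
    #   "score": 8,                                          # int or float
    #   "missing_in_jsonld": ["datePublished"],
    #   "suggested_xpath": {"datePublished": "//time/@datetime"},
    #   "notes": ["Author appears only in the byline."]
    # }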
+        if validation_errors is None and (
+            not missing_required or attempt >= max_retries
+        ):
+            if (
+                not quality_check
+                or quality_score is None
+                or quality_score >= 7
+                or attempt >= max_retries
+            ):
+                break
+        previous_yarrml = yarrml
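
The nested conditions above define when the loop stops early:

    # Stop retrying once all of these hold:
    #   1. validation produced no errors, and
    #   2. every required property is mapped (waived on the final attempt), and
    #   3. the quality gate passes: disabled, unscored, or score >= 7
    #      (also waived on the final attempt).
    # Otherwise previous_yarrml carries this attempt's mapping into the next one.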
+
+    if jsonld_raw is None:
+        raise RuntimeError(
+            "Failed to produce JSON-LD from the generated YARRRML mapping."
+        )
+    if validation_errors:
+        logger = logging.getLogger("worai")
+        logger.warning(
+            "YARRRML mapping failed validation after retries; proceeding anyway. "
+            f"See {mapping_validation_path} for details."
+        )
+
+    if normalized_jsonld is None:
+        normalized_jsonld = normalize_jsonld(
+            jsonld_raw, dataset_uri, url, target_type, embed_nodes=False
+        )
+    jsonld = normalized_jsonld
+    return yarrml, jsonld
+
+
+__all__ = [
+    "StructuredDataOptions",
+    "StructuredDataResult",
+    "build_output_basename",
+    "ensure_no_blank_nodes",
+    "generate_from_agent",
+    "get_dataset_uri",
+    "get_dataset_uri_async",
+    "make_reusable_yarrrml",
+    "materialize_yarrrml_jsonld",
+    "normalize_type",
+    "normalize_yarrrml_mappings",
+    "postprocess_jsonld",
+    "shape_specs_for_type",
+]