wordlift-sdk 2.9.0__py3-none-any.whl → 2.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +1 -1
- wordlift_sdk/render/__init__.py +30 -0
- wordlift_sdk/render/browser.py +132 -0
- wordlift_sdk/render/cleanup_options.py +24 -0
- wordlift_sdk/render/html_renderer.py +86 -0
- wordlift_sdk/render/render_options.py +21 -0
- wordlift_sdk/render/rendered_page.py +13 -0
- wordlift_sdk/render/xhtml_cleaner.py +126 -0
- wordlift_sdk/structured_data/__init__.py +27 -0
- wordlift_sdk/structured_data/agent.py +49 -0
- wordlift_sdk/structured_data/agent_generator.py +12 -0
- wordlift_sdk/structured_data/batch.py +220 -0
- wordlift_sdk/structured_data/constants.py +1 -0
- wordlift_sdk/structured_data/dataset_resolver.py +32 -0
- wordlift_sdk/structured_data/debug.py +23 -0
- wordlift_sdk/structured_data/engine.py +2875 -0
- wordlift_sdk/structured_data/inputs.py +58 -0
- wordlift_sdk/structured_data/io.py +44 -0
- wordlift_sdk/structured_data/materialization.py +70 -0
- wordlift_sdk/structured_data/models.py +48 -0
- wordlift_sdk/structured_data/orchestrator.py +194 -0
- wordlift_sdk/structured_data/rendering.py +43 -0
- wordlift_sdk/structured_data/schema_guide.py +17 -0
- wordlift_sdk/structured_data/structured_data_engine.py +58 -0
- wordlift_sdk/structured_data/validation.py +31 -0
- wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
- wordlift_sdk/url_source/__init__.py +7 -2
- wordlift_sdk/validation/__init__.py +7 -0
- wordlift_sdk/validation/generator.py +446 -0
- wordlift_sdk/validation/shacl.py +205 -0
- wordlift_sdk/validation/shacls/__init__.py +1 -0
- wordlift_sdk/validation/shacls/google-article.ttl +148 -0
- wordlift_sdk/validation/shacls/google-book.ttl +660 -0
- wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
- wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
- wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
- wordlift_sdk/validation/shacls/google-course.ttl +43 -0
- wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
- wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
- wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
- wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
- wordlift_sdk/validation/shacls/google-event.ttl +46 -0
- wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
- wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
- wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
- wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
- wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
- wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
- wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
- wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
- wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
- wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
- wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
- wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
- wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
- wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
- wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
- wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
- wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
- wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
- wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
- wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
- wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
- wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
- wordlift_sdk/validation/shacls/google-video.ttl +149 -0
- wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
- {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/METADATA +1 -1
- {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/RECORD +69 -5
- {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/WHEEL +0 -0
wordlift_sdk/validation/generator.py
@@ -0,0 +1,446 @@
+"""SHACL generator utilities."""
+
+from __future__ import annotations
+
+import argparse
+import html as html_lib
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Iterable
+
+import requests
+from rdflib import Graph, Namespace, RDF, RDFS, URIRef
+from tqdm import tqdm
+
+SEARCH_GALLERY_URL = "https://developers.google.com/search/docs/appearance/structured-data/search-gallery"
+FEATURE_URL_RE = re.compile(
+    r'href="(/search/docs/appearance/structured-data/[^"#?]+)"', re.IGNORECASE
+)
+TOKEN_RE = re.compile(
+    r"(<table[^>]*>.*?</table>|<p[^>]*>.*?</p>|<h2[^>]*>.*?</h2>|<h3[^>]*>.*?</h3>)",
+    re.DOTALL | re.IGNORECASE,
+)
+ROW_RE = re.compile(r"<tr[^>]*>.*?</tr>", re.DOTALL | re.IGNORECASE)
+TAG_RE = re.compile(r"<[^>]+>")
+
+SCHEMA_JSONLD_URL = "https://schema.org/version/latest/schemaorg-current-https.jsonld"
+
+SCHEMA_VOCAB = Namespace("https://schema.org/")
+SCHEMA_DATA = Namespace("http://schema.org/")
+SH = Namespace("http://www.w3.org/ns/shacl#")
+XSD = Namespace("http://www.w3.org/2001/XMLSchema#")
+RDF_NS = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+
+
+@dataclass
+class FeatureData:
+    url: str
+    types: dict[str, dict[str, set[str]]]
+
+
+@dataclass
+class PropertyRange:
+    prop: URIRef
+    ranges: list[URIRef]
+
+
+def _strip_tags(text: str) -> str:
+    return html_lib.unescape(TAG_RE.sub("", text)).strip()
+
+
+def _unique(items: Iterable[str]) -> list[str]:
+    seen: set[str] = set()
+    result: list[str] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def _extract_schema_types(fragment: str) -> list[str]:
+    types = []
+    for match in re.findall(r"https?://schema\.org/([A-Za-z0-9]+)", fragment):
+        types.append(match)
+
+    if types:
+        return _unique(types)
+
+    if fragment.lower().startswith("<h"):
+        code_match = re.findall(
+            r"<code[^>]*>(.*?)</code>", fragment, re.DOTALL | re.IGNORECASE
+        )
+        for item in code_match:
+            value = _strip_tags(item)
+            for token in re.findall(r"[A-Z][A-Za-z0-9]*", value):
+                types.append(token)
+        return _unique(types)
+
+    return []
+
+
+def _table_kind(table_html: str) -> str | None:
+    header_match = re.search(r"<th[^>]*>\s*([^<]+)\s*</th>", table_html, re.IGNORECASE)
+    if not header_match:
+        return None
+    header = _strip_tags(header_match.group(1)).lower()
+    if "required properties" in header:
+        return "required"
+    if "recommended properties" in header:
+        return "recommended"
+    return None
+
+
+def _extract_properties(table_html: str) -> list[str]:
+    props: list[str] = []
+    for row in ROW_RE.findall(table_html):
+        td_match = re.search(r"<td[^>]*>(.*?)</td>", row, re.DOTALL | re.IGNORECASE)
+        if not td_match:
+            continue
+        td_html = td_match.group(1)
+        code_match = re.search(
+            r"<code[^>]*>(.*?)</code>", td_html, re.DOTALL | re.IGNORECASE
+        )
+        if not code_match:
+            continue
+        raw = _strip_tags(code_match.group(1))
+        for token in re.findall(
+            r"[A-Za-z][A-Za-z0-9]*(?:\.[A-Za-z][A-Za-z0-9]*)*", raw
+        ):
+            if token.startswith("@"):
+                continue
+            if token[0].isupper() and "." not in token:
+                continue
+            props.append(token)
+    return _unique(props)
+
+
+def _feature_urls_from_gallery(html: str) -> list[str]:
+    urls: list[str] = []
+    for match in FEATURE_URL_RE.findall(html):
+        url = f"https://developers.google.com{match}".rstrip("/")
+        if url.endswith("/search-gallery"):
+            continue
+        if url.endswith("/structured-data"):
+            continue
+        if url not in urls:
+            urls.append(url)
+    return urls
+
+
+def _parse_feature(html: str, url: str) -> FeatureData:
+    current_types: list[str] = []
+    type_data: dict[str, dict[str, set[str]]] = {}
+
+    for token in TOKEN_RE.findall(html):
+        if token.lower().startswith(("<p", "<h2", "<h3")):
+            types = _extract_schema_types(token)
+            if types:
+                current_types = types
+            continue
+
+        if token.lower().startswith("<table"):
+            kind = _table_kind(token)
+            if not kind:
+                continue
+            props = _extract_properties(token)
+            if not props:
+                continue
+            target_types = current_types or ["Thing"]
+            for t in target_types:
+                bucket = type_data.setdefault(
+                    t, {"required": set(), "recommended": set()}
+                )
+                bucket[kind].update(props)
+
+    for t, bucket in type_data.items():
+        bucket["recommended"].difference_update(bucket["required"])
+
+    return FeatureData(url=url, types=type_data)
+
+
+def _prop_path(prop: str) -> str:
+    parts = prop.split(".")
+    if len(parts) == 1:
+        return f"schema:{parts[0]}"
+    seq = " ".join(f"schema:{part}" for part in parts)
+    return f"( {seq} )"
+
+
+def _write_feature(feature: FeatureData, output_path: Path, overwrite: bool) -> bool:
+    if output_path.exists() and not overwrite:
+        return False
+
+    lines: list[str] = []
+    slug = output_path.stem
+    prefix_base = f"https://wordlift.io/shacl/google/{slug}/"
+    lines.append(f"@prefix : <{prefix_base}> .")
+    lines.append("@prefix sh: <http://www.w3.org/ns/shacl#> .")
+    lines.append("@prefix schema: <http://schema.org/> .")
+    lines.append("")
+    lines.append(f"# Source: {feature.url}")
+    lines.append(f"# Generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
+    lines.append(
+        "# Notes: required properties => errors; recommended properties => warnings."
+    )
+    lines.append("")
+
+    for type_name in sorted(feature.types.keys()):
+        bucket = feature.types[type_name]
+        shape_name = f":google_{type_name}Shape"
+        lines.append(shape_name)
+        lines.append(" a sh:NodeShape ;")
+        lines.append(f" sh:targetClass schema:{type_name} ;")
+
+        for prop in sorted(bucket["required"]):
+            path = _prop_path(prop)
+            lines.append(" sh:property [")
+            lines.append(f" sh:path {path} ;")
+            lines.append(" sh:minCount 1 ;")
+            lines.append(" ] ;")
+
+        for prop in sorted(bucket["recommended"]):
+            path = _prop_path(prop)
+            lines.append(" sh:property [")
+            lines.append(f" sh:path {path} ;")
+            lines.append(" sh:minCount 1 ;")
+            lines.append(" sh:severity sh:Warning ;")
+            lines.append(f' sh:message "Recommended by Google: {prop}." ;')
+            lines.append(" ] ;")
+
+        lines.append(".")
+        lines.append("")
+
+    output_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+    return True
+
+
+def generate_google_shacls(
+    output_dir: Path, overwrite: bool, limit: int, only: list[str] | None
+) -> int:
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    gallery_html = requests.get(SEARCH_GALLERY_URL, timeout=30).text
+    feature_urls = _feature_urls_from_gallery(gallery_html)
+
+    if only:
+        wanted = {slug.strip().rstrip("/") for slug in only}
+        feature_urls = [url for url in feature_urls if url.rsplit("/", 1)[-1] in wanted]
+
+    if limit:
+        feature_urls = feature_urls[:limit]
+
+    generated = 0
+    skipped = 0
+
+    for url in tqdm(feature_urls, desc="Generating SHACLs", unit="feature"):
+        slug = url.rsplit("/", 1)[-1]
+        output_name = f"google-{slug}.ttl"
+        output_path = output_dir / output_name
+
+        if slug == "review-snippet" and output_path.exists() is False:
+            curated = output_dir / "review-snippet.ttl"
+            if curated.exists() and not overwrite:
+                skipped += 1
+                continue
+
+        html = requests.get(url, timeout=30).text
+        feature = _parse_feature(html, url)
+        if not feature.types:
+            skipped += 1
+            continue
+
+        if _write_feature(feature, output_path, overwrite):
+            generated += 1
+        else:
+            skipped += 1
+
+    print(f"Generated: {generated}")
+    print(f"Skipped: {skipped}")
+    print(f"Total: {len(feature_urls)}")
+    return 0
+
+
+def _datatype_shapes(datatype: str) -> list[dict[str, str]]:
+    if datatype == "Text":
+        return [
+            {"datatype": str(XSD.string)},
+            {"datatype": str(RDF_NS.langString)},
+        ]
+    if datatype == "URL":
+        return [{"datatype": str(XSD.anyURI)}]
+    if datatype == "Boolean":
+        return [{"datatype": str(XSD.boolean)}]
+    if datatype == "Date":
+        return [{"datatype": str(XSD.date)}]
+    if datatype == "DateTime":
+        return [{"datatype": str(XSD.dateTime)}]
+    if datatype == "Time":
+        return [{"datatype": str(XSD.time)}]
+    if datatype == "Integer":
+        return [{"datatype": str(XSD.integer)}]
+    if datatype == "Float":
+        return [{"datatype": str(XSD.float)}]
+    if datatype == "Number":
+        return [
+            {"datatype": str(XSD.integer)},
+            {"datatype": str(XSD.decimal)},
+            {"datatype": str(XSD.double)},
+        ]
+    return []
+
+
+def _short_name(uri: URIRef) -> str:
+    value = str(uri)
+    if value.startswith(str(SCHEMA_VOCAB)):
+        return value[len(str(SCHEMA_VOCAB)) :]
+    if value.startswith(str(SCHEMA_DATA)):
+        return value[len(str(SCHEMA_DATA)) :]
+    return value.rsplit("/", 1)[-1]
+
+
+def _collect_classes(graph: Graph) -> list[URIRef]:
+    classes = set(graph.subjects(RDF.type, RDFS.Class))
+    classes.update(graph.subjects(RDF.type, SCHEMA_VOCAB.Class))
+    return sorted(classes, key=str)
+
+
+def _collect_properties(graph: Graph) -> list[URIRef]:
+    props = set(graph.subjects(RDF.type, RDF.Property))
+    return sorted(props, key=str)
+
+
+def _collect_domain_ranges(
+    graph: Graph, prop: URIRef
+) -> list[tuple[URIRef, list[URIRef]]]:
+    domains = list(graph.objects(prop, SCHEMA_VOCAB.domainIncludes))
+    ranges = list(graph.objects(prop, SCHEMA_VOCAB.rangeIncludes))
+    if not domains:
+        return []
+    return [(domain, ranges) for domain in domains]
+
+
+def _render_property_shape(prop: URIRef, ranges: list[URIRef]) -> list[str]:
+    lines: list[str] = []
+    lines.append(" sh:property [")
+    lines.append(f" sh:path schema:{_short_name(prop)} ;")
+    lines.append(" sh:severity sh:Warning ;")
+
+    range_constraints: list[str] = []
+    for r in ranges:
+        name = _short_name(r)
+        datatype_shapes = _datatype_shapes(name)
+        if datatype_shapes:
+            for shape in datatype_shapes:
+                range_constraints.append(f"[ sh:datatype <{shape['datatype']}> ]")
+        else:
+            range_constraints.append(f"[ sh:class schema:{name} ]")
+
+    if range_constraints:
+        if len(range_constraints) == 1:
+            lines.append(f" sh:or ( {range_constraints[0]} ) ;")
+        else:
+            lines.append(" sh:or (")
+            for rc in range_constraints:
+                lines.append(f" {rc}")
+            lines.append(" ) ;")
+
+    lines.append(f' sh:message "Schema.org range check: {_short_name(prop)}." ;')
+    lines.append(" ] ;")
+    return lines
+
+
+def generate_schema_shacls(output_file: Path, overwrite: bool) -> int:
+    output_path = output_file
+    if output_path.exists() and not overwrite:
+        print(f"Output exists: {output_path}")
+        return 1
+
+    response = requests.get(SCHEMA_JSONLD_URL, timeout=60)
+    response.raise_for_status()
+
+    graph = Graph()
+    graph.parse(data=response.text, format="json-ld")
+
+    classes = _collect_classes(graph)
+    props = _collect_properties(graph)
+
+    class_props: dict[URIRef, list[PropertyRange]] = {cls: [] for cls in classes}
+
+    for prop in tqdm(props, desc="Collecting properties", unit="prop"):
+        for domain, ranges in _collect_domain_ranges(graph, prop):
+            if domain not in class_props:
+                class_props[domain] = []
+            class_props[domain].append(PropertyRange(prop=prop, ranges=ranges))
+
+    lines: list[str] = []
+    lines.append("@prefix : <https://wordlift.io/shacl/schemaorg-grammar/> .")
+    lines.append(f"@prefix sh: <{SH}> .")
+    lines.append(f"@prefix schema: <{SCHEMA_DATA}> .")
+    lines.append("")
+    lines.append(f"# Source: {SCHEMA_JSONLD_URL}")
+    lines.append(f"# Generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
+    lines.append(
+        "# Notes: schema.org grammar checks only; all constraints are warnings."
+    )
+    lines.append("")
+
+    for cls in tqdm(classes, desc="Writing shapes", unit="class"):
+        props_for_class = class_props.get(cls, [])
+        if not props_for_class:
+            continue
+        shape_name = f":schema_{_short_name(cls)}Shape"
+        lines.append(shape_name)
+        lines.append(" a sh:NodeShape ;")
+        lines.append(f" sh:targetClass schema:{_short_name(cls)} ;")
+
+        for prop_range in props_for_class:
+            lines.extend(_render_property_shape(prop_range.prop, prop_range.ranges))
+
+        lines.append(".")
+        lines.append("")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+    print(f"Wrote {output_path}")
+    return 0
+
+
+def google_main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Generate Google Search Gallery SHACL shapes."
+    )
+    parser.add_argument(
+        "--output-dir",
+        default="worai/validation/shacls",
+        help="Directory for generated SHACL files.",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing files."
+    )
+    parser.add_argument(
+        "--limit", type=int, default=0, help="Limit number of features (0 = all)."
+    )
+    parser.add_argument(
+        "--only", nargs="*", help="Only generate for specified feature slugs."
+    )
+    args = parser.parse_args(argv)
+    return generate_google_shacls(
+        Path(args.output_dir), args.overwrite, args.limit, args.only
+    )
+
+
+def schema_main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Generate Schema.org grammar SHACLs.")
+    parser.add_argument(
+        "--output-file",
+        default="worai/validation/shacls/schemaorg-grammar.ttl",
+        help="Output SHACL file.",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing file."
+    )
+    args = parser.parse_args(argv)
+    return generate_schema_shacls(Path(args.output_file), args.overwrite)
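The two `*_main` entry points above are plain argparse CLIs, so the generators can also be driven from Python. A minimal sketch, not part of the diff, assuming the module is importable as `wordlift_sdk.validation.generator` as packaged (the output paths and feature slugs here are illustrative):

    from wordlift_sdk.validation.generator import google_main, schema_main

    # Regenerate the Google Search Gallery shapes for two features only.
    google_main(["--output-dir", "shapes", "--only", "recipe", "faqpage", "--overwrite"])

    # Rebuild the full schema.org grammar shape file from the latest vocabulary dump.
    schema_main(["--output-file", "shapes/schemaorg-grammar.ttl", "--overwrite"])

Both calls hit the network (developers.google.com and schema.org, respectively) and return 0 on success.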
wordlift_sdk/validation/shacl.py
@@ -0,0 +1,205 @@
+"""SHACL validation helpers."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from importlib import resources
+from pathlib import Path
+from typing import Iterable
+
+from pyshacl import validate
+from rdflib import Graph, Namespace, URIRef
+from rdflib.term import Identifier
+from requests import Response, get
+
+
+@dataclass
+class ValidationResult:
+    conforms: bool
+    report_text: str
+    report_graph: Graph
+    data_graph: Graph
+    shape_source_map: dict[Identifier, str]
+    warning_count: int
+
+
+def _detect_format_from_path(path: Path) -> str | None:
+    if path.suffix.lower() in {".jsonld", ".json-ld"}:
+        return "json-ld"
+    if path.suffix.lower() in {".ttl", ".turtle"}:
+        return "turtle"
+    if path.suffix.lower() in {".nt"}:
+        return "nt"
+    return None
+
+
+def _detect_format_from_response(response: Response) -> str | None:
+    content_type = response.headers.get("content-type", "").lower()
+    if "json" in content_type or "ld+json" in content_type:
+        return "json-ld"
+    if "turtle" in content_type or "ttl" in content_type:
+        return "turtle"
+    if "n-triples" in content_type:
+        return "nt"
+    return None
+
+
+def _load_graph_from_text(data: str, fmt: str | None) -> Graph:
+    graph = Graph()
+    try:
+        graph.parse(data=data, format=fmt)
+        return graph
+    except Exception as exc:
+        if fmt is None:
+            raise
+        raise RuntimeError(f"Failed to parse input as {fmt}: {exc}") from exc
+
+
+def _load_graph(path_or_url: str) -> Graph:
+    if path_or_url.startswith(("http://", "https://")):
+        response = get(path_or_url, timeout=30)
+        if not response.ok:
+            raise RuntimeError(
+                f"Failed to fetch URL ({response.status_code}): {path_or_url}"
+            )
+        fmt = _detect_format_from_response(response)
+        try:
+            return _load_graph_from_text(response.text, fmt)
+        except Exception:
+            for fallback in (None, "json-ld", "turtle", "nt"):
+                if fallback == fmt:
+                    continue
+                try:
+                    return _load_graph_from_text(response.text, fallback)
+                except Exception:
+                    continue
+            raise RuntimeError(f"Failed to parse remote RDF from {path_or_url}")
+
+    path = Path(path_or_url)
+    if not path.exists():
+        raise RuntimeError(f"Input file not found: {path}")
+
+    fmt = _detect_format_from_path(path)
+    graph = Graph()
+    graph.parse(path.as_posix(), format=fmt)
+    return graph
+
+
+def _normalize_schema_org_uris(graph: Graph) -> Graph:
+    schema_http = "http://schema.org/"
+    schema_https = "https://schema.org/"
+    normalized = Graph()
+    for prefix, ns in graph.namespace_manager.namespaces():
+        normalized.namespace_manager.bind(prefix, ns, replace=True)
+    for s, p, o in graph:
+        if isinstance(s, URIRef) and str(s).startswith(schema_https):
+            s = URIRef(schema_http + str(s)[len(schema_https) :])
+        if isinstance(p, URIRef) and str(p).startswith(schema_https):
+            p = URIRef(schema_http + str(p)[len(schema_https) :])
+        if isinstance(o, URIRef) and str(o).startswith(schema_https):
+            o = URIRef(schema_http + str(o)[len(schema_https) :])
+        normalized.add((s, p, o))
+    return normalized
+
+
+def _shape_resource_names() -> list[str]:
+    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+    return sorted(
+        [
+            p.name
+            for p in shapes_dir.iterdir()
+            if p.is_file() and p.name.endswith(".ttl")
+        ]
+    )
+
+
+def list_shape_names() -> list[str]:
+    return _shape_resource_names()
+
+
+def _read_shape_resource(name: str) -> str | None:
+    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+    resource = shapes_dir.joinpath(name)
+    if not resource.is_file():
+        return None
+    return resource.read_text(encoding="utf-8")
+
+
+def _resolve_shape_sources(shape_specs: Iterable[str] | None) -> list[str]:
+    if not shape_specs:
+        return _shape_resource_names()
+
+    resolved: list[str] = []
+    for spec in shape_specs:
+        path = Path(spec)
+        if path.exists():
+            resolved.append(path.as_posix())
+            continue
+
+        name = spec
+        if not name.endswith(".ttl"):
+            name = f"{name}.ttl"
+
+        if _read_shape_resource(name) is None:
+            raise RuntimeError(f"Shape not found: {spec}")
+        resolved.append(name)
+
+    return resolved
+
+
+def _load_shapes_graph(
+    shape_specs: Iterable[str] | None,
+) -> tuple[Graph, dict[Identifier, str]]:
+    shapes_graph = Graph()
+    source_map: dict[Identifier, str] = {}
+    for spec in _resolve_shape_sources(shape_specs):
+        path = Path(spec)
+        if path.exists():
+            temp = Graph()
+            temp.parse(path.as_posix(), format="turtle")
+            shapes_graph += temp
+            label = path.stem
+            for subj in temp.subjects():
+                source_map.setdefault(subj, label)
+            continue
+
+        data = _read_shape_resource(spec)
+        if data is None:
+            raise RuntimeError(f"Shape not found: {spec}")
+        temp = Graph()
+        temp.parse(data=data, format="turtle")
+        shapes_graph += temp
+        label = Path(spec).stem
+        for subj in temp.subjects():
+            source_map.setdefault(subj, label)
+
+    return shapes_graph, source_map
+
+
+def validate_file(
+    input_file: str, shape_specs: Iterable[str] | None = None
+) -> ValidationResult:
+    data_graph = _load_graph(input_file)
+    data_graph = _normalize_schema_org_uris(data_graph)
+    shapes_graph, source_map = _load_shapes_graph(shape_specs)
+
+    conforms, report_graph, report_text = validate(
+        data_graph,
+        shacl_graph=shapes_graph,
+        inference="rdfs",
+        abort_on_first=False,
+        allow_infos=True,
+        allow_warnings=True,
+    )
+
+    sh = Namespace("http://www.w3.org/ns/shacl#")
+    warning_count = sum(1 for _ in report_graph.subjects(sh.resultSeverity, sh.Warning))
+
+    return ValidationResult(
+        conforms=conforms,
+        report_text=report_text,
+        report_graph=report_graph,
+        data_graph=data_graph,
+        shape_source_map=source_map,
+        warning_count=warning_count,
+    )
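A minimal usage sketch for the validator above, not part of the diff (the file name and shape slug are illustrative; per `_resolve_shape_sources`, a shape spec may be a filesystem path or the name of a packaged shape, with the `.ttl` suffix optional):

    from wordlift_sdk.validation.shacl import list_shape_names, validate_file

    print(list_shape_names())  # packaged shapes, e.g. "google-recipe.ttl"

    # Validate a local JSON-LD file (or an http(s) URL) against one packaged shape.
    result = validate_file("recipe.jsonld", shape_specs=["google-recipe"])
    print(result.conforms, result.warning_count)
    print(result.report_text)

    # With shape_specs omitted, every packaged *.ttl shape is applied.
    result = validate_file("recipe.jsonld")

Note that `validate_file` normalizes https://schema.org/ IRIs to http://schema.org/ before validation, matching the `schema:` prefix used by the generated shapes.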
wordlift_sdk/validation/shacls/__init__.py
@@ -0,0 +1 @@
+"""Packaged SHACL shapes."""