wordlift-sdk 2.9.0__py3-none-any.whl → 2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. wordlift_sdk/__init__.py +1 -1
  2. wordlift_sdk/render/__init__.py +30 -0
  3. wordlift_sdk/render/browser.py +132 -0
  4. wordlift_sdk/render/cleanup_options.py +24 -0
  5. wordlift_sdk/render/html_renderer.py +86 -0
  6. wordlift_sdk/render/render_options.py +21 -0
  7. wordlift_sdk/render/rendered_page.py +13 -0
  8. wordlift_sdk/render/xhtml_cleaner.py +126 -0
  9. wordlift_sdk/structured_data/__init__.py +27 -0
  10. wordlift_sdk/structured_data/agent.py +49 -0
  11. wordlift_sdk/structured_data/agent_generator.py +12 -0
  12. wordlift_sdk/structured_data/batch.py +220 -0
  13. wordlift_sdk/structured_data/constants.py +1 -0
  14. wordlift_sdk/structured_data/dataset_resolver.py +32 -0
  15. wordlift_sdk/structured_data/debug.py +23 -0
  16. wordlift_sdk/structured_data/engine.py +2875 -0
  17. wordlift_sdk/structured_data/inputs.py +58 -0
  18. wordlift_sdk/structured_data/io.py +44 -0
  19. wordlift_sdk/structured_data/materialization.py +70 -0
  20. wordlift_sdk/structured_data/models.py +48 -0
  21. wordlift_sdk/structured_data/orchestrator.py +194 -0
  22. wordlift_sdk/structured_data/rendering.py +43 -0
  23. wordlift_sdk/structured_data/schema_guide.py +17 -0
  24. wordlift_sdk/structured_data/structured_data_engine.py +58 -0
  25. wordlift_sdk/structured_data/validation.py +31 -0
  26. wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
  27. wordlift_sdk/url_source/__init__.py +7 -2
  28. wordlift_sdk/validation/__init__.py +7 -0
  29. wordlift_sdk/validation/generator.py +446 -0
  30. wordlift_sdk/validation/shacl.py +205 -0
  31. wordlift_sdk/validation/shacls/__init__.py +1 -0
  32. wordlift_sdk/validation/shacls/google-article.ttl +148 -0
  33. wordlift_sdk/validation/shacls/google-book.ttl +660 -0
  34. wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
  35. wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
  36. wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
  37. wordlift_sdk/validation/shacls/google-course.ttl +43 -0
  38. wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
  39. wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
  40. wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
  41. wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
  42. wordlift_sdk/validation/shacls/google-event.ttl +46 -0
  43. wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
  44. wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
  45. wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
  46. wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
  47. wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
  48. wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
  49. wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
  50. wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
  51. wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
  52. wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
  53. wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
  54. wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
  55. wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
  56. wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
  57. wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
  58. wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
  59. wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
  60. wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
  61. wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
  62. wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
  63. wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
  64. wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
  65. wordlift_sdk/validation/shacls/google-video.ttl +149 -0
  66. wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
  67. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/METADATA +1 -1
  68. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/RECORD +69 -5
  69. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/WHEEL +0 -0
wordlift_sdk/validation/generator.py
@@ -0,0 +1,446 @@
+"""SHACL generator utilities."""
+
+from __future__ import annotations
+
+import argparse
+import html as html_lib
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Iterable
+
+import requests
+from rdflib import Graph, Namespace, RDF, RDFS, URIRef
+from tqdm import tqdm
+
+SEARCH_GALLERY_URL = "https://developers.google.com/search/docs/appearance/structured-data/search-gallery"
+FEATURE_URL_RE = re.compile(
+    r'href="(/search/docs/appearance/structured-data/[^"#?]+)"', re.IGNORECASE
+)
+TOKEN_RE = re.compile(
+    r"(<table[^>]*>.*?</table>|<p[^>]*>.*?</p>|<h2[^>]*>.*?</h2>|<h3[^>]*>.*?</h3>)",
+    re.DOTALL | re.IGNORECASE,
+)
+ROW_RE = re.compile(r"<tr[^>]*>.*?</tr>", re.DOTALL | re.IGNORECASE)
+TAG_RE = re.compile(r"<[^>]+>")
+
+SCHEMA_JSONLD_URL = "https://schema.org/version/latest/schemaorg-current-https.jsonld"
+
+SCHEMA_VOCAB = Namespace("https://schema.org/")
+SCHEMA_DATA = Namespace("http://schema.org/")
+SH = Namespace("http://www.w3.org/ns/shacl#")
+XSD = Namespace("http://www.w3.org/2001/XMLSchema#")
+RDF_NS = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+
+
+@dataclass
+class FeatureData:
+    url: str
+    types: dict[str, dict[str, set[str]]]
+
+
+@dataclass
+class PropertyRange:
+    prop: URIRef
+    ranges: list[URIRef]
+
+
+def _strip_tags(text: str) -> str:
+    return html_lib.unescape(TAG_RE.sub("", text)).strip()
+
+
+def _unique(items: Iterable[str]) -> list[str]:
+    seen: set[str] = set()
+    result: list[str] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def _extract_schema_types(fragment: str) -> list[str]:
+    types = []
+    for match in re.findall(r"https?://schema\.org/([A-Za-z0-9]+)", fragment):
+        types.append(match)
+
+    if types:
+        return _unique(types)
+
+    if fragment.lower().startswith("<h"):
+        code_match = re.findall(
+            r"<code[^>]*>(.*?)</code>", fragment, re.DOTALL | re.IGNORECASE
+        )
+        for item in code_match:
+            value = _strip_tags(item)
+            for token in re.findall(r"[A-Z][A-Za-z0-9]*", value):
+                types.append(token)
+        return _unique(types)
+
+    return []
+
+
+def _table_kind(table_html: str) -> str | None:
+    header_match = re.search(r"<th[^>]*>\s*([^<]+)\s*</th>", table_html, re.IGNORECASE)
+    if not header_match:
+        return None
+    header = _strip_tags(header_match.group(1)).lower()
+    if "required properties" in header:
+        return "required"
+    if "recommended properties" in header:
+        return "recommended"
+    return None
+
+
+def _extract_properties(table_html: str) -> list[str]:
+    props: list[str] = []
+    for row in ROW_RE.findall(table_html):
+        td_match = re.search(r"<td[^>]*>(.*?)</td>", row, re.DOTALL | re.IGNORECASE)
+        if not td_match:
+            continue
+        td_html = td_match.group(1)
+        code_match = re.search(
+            r"<code[^>]*>(.*?)</code>", td_html, re.DOTALL | re.IGNORECASE
+        )
+        if not code_match:
+            continue
+        raw = _strip_tags(code_match.group(1))
+        for token in re.findall(
+            r"[A-Za-z][A-Za-z0-9]*(?:\.[A-Za-z][A-Za-z0-9]*)*", raw
+        ):
+            if token.startswith("@"):
+                continue
+            if token[0].isupper() and "." not in token:
+                continue
+            props.append(token)
+    return _unique(props)
+
+
+def _feature_urls_from_gallery(html: str) -> list[str]:
+    urls: list[str] = []
+    for match in FEATURE_URL_RE.findall(html):
+        url = f"https://developers.google.com{match}".rstrip("/")
+        if url.endswith("/search-gallery"):
+            continue
+        if url.endswith("/structured-data"):
+            continue
+        if url not in urls:
+            urls.append(url)
+    return urls
+
+
+def _parse_feature(html: str, url: str) -> FeatureData:
+    current_types: list[str] = []
+    type_data: dict[str, dict[str, set[str]]] = {}
+
+    for token in TOKEN_RE.findall(html):
+        if token.lower().startswith(("<p", "<h2", "<h3")):
+            types = _extract_schema_types(token)
+            if types:
+                current_types = types
+            continue
+
+        if token.lower().startswith("<table"):
+            kind = _table_kind(token)
+            if not kind:
+                continue
+            props = _extract_properties(token)
+            if not props:
+                continue
+            target_types = current_types or ["Thing"]
+            for t in target_types:
+                bucket = type_data.setdefault(
+                    t, {"required": set(), "recommended": set()}
+                )
+                bucket[kind].update(props)
+
+    for t, bucket in type_data.items():
+        bucket["recommended"].difference_update(bucket["required"])
+
+    return FeatureData(url=url, types=type_data)
+
+
+def _prop_path(prop: str) -> str:
+    parts = prop.split(".")
+    if len(parts) == 1:
+        return f"schema:{parts[0]}"
+    seq = " ".join(f"schema:{part}" for part in parts)
+    return f"( {seq} )"
+
+
+def _write_feature(feature: FeatureData, output_path: Path, overwrite: bool) -> bool:
+    if output_path.exists() and not overwrite:
+        return False
+
+    lines: list[str] = []
+    slug = output_path.stem
+    prefix_base = f"https://wordlift.io/shacl/google/{slug}/"
+    lines.append(f"@prefix : <{prefix_base}> .")
+    lines.append("@prefix sh: <http://www.w3.org/ns/shacl#> .")
+    lines.append("@prefix schema: <http://schema.org/> .")
+    lines.append("")
+    lines.append(f"# Source: {feature.url}")
+    lines.append(f"# Generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
+    lines.append(
+        "# Notes: required properties => errors; recommended properties => warnings."
+    )
+    lines.append("")
+
+    for type_name in sorted(feature.types.keys()):
+        bucket = feature.types[type_name]
+        shape_name = f":google_{type_name}Shape"
+        lines.append(shape_name)
+        lines.append(" a sh:NodeShape ;")
+        lines.append(f" sh:targetClass schema:{type_name} ;")
+
+        for prop in sorted(bucket["required"]):
+            path = _prop_path(prop)
+            lines.append(" sh:property [")
+            lines.append(f" sh:path {path} ;")
+            lines.append(" sh:minCount 1 ;")
+            lines.append(" ] ;")
+
+        for prop in sorted(bucket["recommended"]):
+            path = _prop_path(prop)
+            lines.append(" sh:property [")
+            lines.append(f" sh:path {path} ;")
+            lines.append(" sh:minCount 1 ;")
+            lines.append(" sh:severity sh:Warning ;")
+            lines.append(f' sh:message "Recommended by Google: {prop}." ;')
+            lines.append(" ] ;")
+
+        lines.append(".")
+        lines.append("")
+
+    output_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+    return True
+
+
+def generate_google_shacls(
+    output_dir: Path, overwrite: bool, limit: int, only: list[str] | None
+) -> int:
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    gallery_html = requests.get(SEARCH_GALLERY_URL, timeout=30).text
+    feature_urls = _feature_urls_from_gallery(gallery_html)
+
+    if only:
+        wanted = {slug.strip().rstrip("/") for slug in only}
+        feature_urls = [url for url in feature_urls if url.rsplit("/", 1)[-1] in wanted]
+
+    if limit:
+        feature_urls = feature_urls[:limit]
+
+    generated = 0
+    skipped = 0
+
+    for url in tqdm(feature_urls, desc="Generating SHACLs", unit="feature"):
+        slug = url.rsplit("/", 1)[-1]
+        output_name = f"google-{slug}.ttl"
+        output_path = output_dir / output_name
+
+        if slug == "review-snippet" and output_path.exists() is False:
+            curated = output_dir / "review-snippet.ttl"
+            if curated.exists() and not overwrite:
+                skipped += 1
+                continue
+
+        html = requests.get(url, timeout=30).text
+        feature = _parse_feature(html, url)
+        if not feature.types:
+            skipped += 1
+            continue
+
+        if _write_feature(feature, output_path, overwrite):
+            generated += 1
+        else:
+            skipped += 1
+
+    print(f"Generated: {generated}")
+    print(f"Skipped: {skipped}")
+    print(f"Total: {len(feature_urls)}")
+    return 0
+
+
+def _datatype_shapes(datatype: str) -> list[dict[str, str]]:
+    if datatype == "Text":
+        return [
+            {"datatype": str(XSD.string)},
+            {"datatype": str(RDF_NS.langString)},
+        ]
+    if datatype == "URL":
+        return [{"datatype": str(XSD.anyURI)}]
+    if datatype == "Boolean":
+        return [{"datatype": str(XSD.boolean)}]
+    if datatype == "Date":
+        return [{"datatype": str(XSD.date)}]
+    if datatype == "DateTime":
+        return [{"datatype": str(XSD.dateTime)}]
+    if datatype == "Time":
+        return [{"datatype": str(XSD.time)}]
+    if datatype == "Integer":
+        return [{"datatype": str(XSD.integer)}]
+    if datatype == "Float":
+        return [{"datatype": str(XSD.float)}]
+    if datatype == "Number":
+        return [
+            {"datatype": str(XSD.integer)},
+            {"datatype": str(XSD.decimal)},
+            {"datatype": str(XSD.double)},
+        ]
+    return []
+
+
+def _short_name(uri: URIRef) -> str:
+    value = str(uri)
+    if value.startswith(str(SCHEMA_VOCAB)):
+        return value[len(str(SCHEMA_VOCAB)) :]
+    if value.startswith(str(SCHEMA_DATA)):
+        return value[len(str(SCHEMA_DATA)) :]
+    return value.rsplit("/", 1)[-1]
+
+
+def _collect_classes(graph: Graph) -> list[URIRef]:
+    classes = set(graph.subjects(RDF.type, RDFS.Class))
+    classes.update(graph.subjects(RDF.type, SCHEMA_VOCAB.Class))
+    return sorted(classes, key=str)
+
+
+def _collect_properties(graph: Graph) -> list[URIRef]:
+    props = set(graph.subjects(RDF.type, RDF.Property))
+    return sorted(props, key=str)
+
+
+def _collect_domain_ranges(
+    graph: Graph, prop: URIRef
+) -> list[tuple[URIRef, list[URIRef]]]:
+    domains = list(graph.objects(prop, SCHEMA_VOCAB.domainIncludes))
+    ranges = list(graph.objects(prop, SCHEMA_VOCAB.rangeIncludes))
+    if not domains:
+        return []
+    return [(domain, ranges) for domain in domains]
+
+
+def _render_property_shape(prop: URIRef, ranges: list[URIRef]) -> list[str]:
+    lines: list[str] = []
+    lines.append(" sh:property [")
+    lines.append(f" sh:path schema:{_short_name(prop)} ;")
+    lines.append(" sh:severity sh:Warning ;")
+
+    range_constraints: list[str] = []
+    for r in ranges:
+        name = _short_name(r)
+        datatype_shapes = _datatype_shapes(name)
+        if datatype_shapes:
+            for shape in datatype_shapes:
+                range_constraints.append(f"[ sh:datatype <{shape['datatype']}> ]")
+        else:
+            range_constraints.append(f"[ sh:class schema:{name} ]")
+
+    if range_constraints:
+        if len(range_constraints) == 1:
+            lines.append(f" sh:or ( {range_constraints[0]} ) ;")
+        else:
+            lines.append(" sh:or (")
+            for rc in range_constraints:
+                lines.append(f" {rc}")
+            lines.append(" ) ;")
+
+    lines.append(f' sh:message "Schema.org range check: {_short_name(prop)}." ;')
+    lines.append(" ] ;")
+    return lines
+
+
+def generate_schema_shacls(output_file: Path, overwrite: bool) -> int:
+    output_path = output_file
+    if output_path.exists() and not overwrite:
+        print(f"Output exists: {output_path}")
+        return 1
+
+    response = requests.get(SCHEMA_JSONLD_URL, timeout=60)
+    response.raise_for_status()
+
+    graph = Graph()
+    graph.parse(data=response.text, format="json-ld")
+
+    classes = _collect_classes(graph)
+    props = _collect_properties(graph)
+
+    class_props: dict[URIRef, list[PropertyRange]] = {cls: [] for cls in classes}
+
+    for prop in tqdm(props, desc="Collecting properties", unit="prop"):
+        for domain, ranges in _collect_domain_ranges(graph, prop):
+            if domain not in class_props:
+                class_props[domain] = []
+            class_props[domain].append(PropertyRange(prop=prop, ranges=ranges))
+
+    lines: list[str] = []
+    lines.append("@prefix : <https://wordlift.io/shacl/schemaorg-grammar/> .")
+    lines.append(f"@prefix sh: <{SH}> .")
+    lines.append(f"@prefix schema: <{SCHEMA_DATA}> .")
+    lines.append("")
+    lines.append(f"# Source: {SCHEMA_JSONLD_URL}")
+    lines.append(f"# Generated: {datetime.utcnow().isoformat(timespec='seconds')}Z")
+    lines.append(
+        "# Notes: schema.org grammar checks only; all constraints are warnings."
+    )
+    lines.append("")
+
+    for cls in tqdm(classes, desc="Writing shapes", unit="class"):
+        props_for_class = class_props.get(cls, [])
+        if not props_for_class:
+            continue
+        shape_name = f":schema_{_short_name(cls)}Shape"
+        lines.append(shape_name)
+        lines.append(" a sh:NodeShape ;")
+        lines.append(f" sh:targetClass schema:{_short_name(cls)} ;")
+
+        for prop_range in props_for_class:
+            lines.extend(_render_property_shape(prop_range.prop, prop_range.ranges))
+
+        lines.append(".")
+        lines.append("")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+    print(f"Wrote {output_path}")
+    return 0
+
+
+def google_main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Generate Google Search Gallery SHACL shapes."
+    )
+    parser.add_argument(
+        "--output-dir",
+        default="worai/validation/shacls",
+        help="Directory for generated SHACL files.",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing files."
+    )
+    parser.add_argument(
+        "--limit", type=int, default=0, help="Limit number of features (0 = all)."
+    )
+    parser.add_argument(
+        "--only", nargs="*", help="Only generate for specified feature slugs."
+    )
+    args = parser.parse_args(argv)
+    return generate_google_shacls(
+        Path(args.output_dir), args.overwrite, args.limit, args.only
+    )
+
+
+def schema_main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Generate Schema.org grammar SHACLs.")
+    parser.add_argument(
+        "--output-file",
+        default="worai/validation/shacls/schemaorg-grammar.ttl",
+        help="Output SHACL file.",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing file."
+    )
+    args = parser.parse_args(argv)
+    return generate_schema_shacls(Path(args.output_file), args.overwrite)
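
Note: google_main and schema_main above are plain argparse CLIs, so they can also be driven programmatically. A minimal usage sketch (illustrative, not part of the diff) follows; it assumes the import path wordlift_sdk.validation.generator from the file list above, and the output paths are hypothetical. Both calls fetch pages over the network (Google's documentation and schema.org).

    # Usage sketch (illustrative, not part of the package diff).
    from wordlift_sdk.validation.generator import google_main, schema_main

    # Regenerate only the Recipe and FAQ shapes into a local directory;
    # "recipe" and "faqpage" follow the google-<slug>.ttl naming seen above.
    google_main(["--output-dir", "shacls", "--overwrite", "--only", "recipe", "faqpage"])

    # Rebuild the schema.org grammar shapes (warnings only) alongside them.
    schema_main(["--output-file", "shacls/schemaorg-grammar.ttl", "--overwrite"])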
wordlift_sdk/validation/shacl.py
@@ -0,0 +1,205 @@
+"""SHACL validation helpers."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from importlib import resources
+from pathlib import Path
+from typing import Iterable
+
+from pyshacl import validate
+from rdflib import Graph, Namespace, URIRef
+from rdflib.term import Identifier
+from requests import Response, get
+
+
+@dataclass
+class ValidationResult:
+    conforms: bool
+    report_text: str
+    report_graph: Graph
+    data_graph: Graph
+    shape_source_map: dict[Identifier, str]
+    warning_count: int
+
+
+def _detect_format_from_path(path: Path) -> str | None:
+    if path.suffix.lower() in {".jsonld", ".json-ld"}:
+        return "json-ld"
+    if path.suffix.lower() in {".ttl", ".turtle"}:
+        return "turtle"
+    if path.suffix.lower() in {".nt"}:
+        return "nt"
+    return None
+
+
+def _detect_format_from_response(response: Response) -> str | None:
+    content_type = response.headers.get("content-type", "").lower()
+    if "json" in content_type or "ld+json" in content_type:
+        return "json-ld"
+    if "turtle" in content_type or "ttl" in content_type:
+        return "turtle"
+    if "n-triples" in content_type:
+        return "nt"
+    return None
+
+
+def _load_graph_from_text(data: str, fmt: str | None) -> Graph:
+    graph = Graph()
+    try:
+        graph.parse(data=data, format=fmt)
+        return graph
+    except Exception as exc:
+        if fmt is None:
+            raise
+        raise RuntimeError(f"Failed to parse input as {fmt}: {exc}") from exc
+
+
+def _load_graph(path_or_url: str) -> Graph:
+    if path_or_url.startswith(("http://", "https://")):
+        response = get(path_or_url, timeout=30)
+        if not response.ok:
+            raise RuntimeError(
+                f"Failed to fetch URL ({response.status_code}): {path_or_url}"
+            )
+        fmt = _detect_format_from_response(response)
+        try:
+            return _load_graph_from_text(response.text, fmt)
+        except Exception:
+            for fallback in (None, "json-ld", "turtle", "nt"):
+                if fallback == fmt:
+                    continue
+                try:
+                    return _load_graph_from_text(response.text, fallback)
+                except Exception:
+                    continue
+            raise RuntimeError(f"Failed to parse remote RDF from {path_or_url}")
+
+    path = Path(path_or_url)
+    if not path.exists():
+        raise RuntimeError(f"Input file not found: {path}")
+
+    fmt = _detect_format_from_path(path)
+    graph = Graph()
+    graph.parse(path.as_posix(), format=fmt)
+    return graph
+
+
+def _normalize_schema_org_uris(graph: Graph) -> Graph:
+    schema_http = "http://schema.org/"
+    schema_https = "https://schema.org/"
+    normalized = Graph()
+    for prefix, ns in graph.namespace_manager.namespaces():
+        normalized.namespace_manager.bind(prefix, ns, replace=True)
+    for s, p, o in graph:
+        if isinstance(s, URIRef) and str(s).startswith(schema_https):
+            s = URIRef(schema_http + str(s)[len(schema_https) :])
+        if isinstance(p, URIRef) and str(p).startswith(schema_https):
+            p = URIRef(schema_http + str(p)[len(schema_https) :])
+        if isinstance(o, URIRef) and str(o).startswith(schema_https):
+            o = URIRef(schema_http + str(o)[len(schema_https) :])
+        normalized.add((s, p, o))
+    return normalized
+
+
+def _shape_resource_names() -> list[str]:
+    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+    return sorted(
+        [
+            p.name
+            for p in shapes_dir.iterdir()
+            if p.is_file() and p.name.endswith(".ttl")
+        ]
+    )
+
+
+def list_shape_names() -> list[str]:
+    return _shape_resource_names()
+
+
+def _read_shape_resource(name: str) -> str | None:
+    shapes_dir = resources.files("wordlift_sdk.validation.shacls")
+    resource = shapes_dir.joinpath(name)
+    if not resource.is_file():
+        return None
+    return resource.read_text(encoding="utf-8")
+
+
+def _resolve_shape_sources(shape_specs: Iterable[str] | None) -> list[str]:
+    if not shape_specs:
+        return _shape_resource_names()
+
+    resolved: list[str] = []
+    for spec in shape_specs:
+        path = Path(spec)
+        if path.exists():
+            resolved.append(path.as_posix())
+            continue
+
+        name = spec
+        if not name.endswith(".ttl"):
+            name = f"{name}.ttl"
+
+        if _read_shape_resource(name) is None:
+            raise RuntimeError(f"Shape not found: {spec}")
+        resolved.append(name)
+
+    return resolved
+
+
+def _load_shapes_graph(
+    shape_specs: Iterable[str] | None,
+) -> tuple[Graph, dict[Identifier, str]]:
+    shapes_graph = Graph()
+    source_map: dict[Identifier, str] = {}
+    for spec in _resolve_shape_sources(shape_specs):
+        path = Path(spec)
+        if path.exists():
+            temp = Graph()
+            temp.parse(path.as_posix(), format="turtle")
+            shapes_graph += temp
+            label = path.stem
+            for subj in temp.subjects():
+                source_map.setdefault(subj, label)
+            continue
+
+        data = _read_shape_resource(spec)
+        if data is None:
+            raise RuntimeError(f"Shape not found: {spec}")
+        temp = Graph()
+        temp.parse(data=data, format="turtle")
+        shapes_graph += temp
+        label = Path(spec).stem
+        for subj in temp.subjects():
+            source_map.setdefault(subj, label)
+
+    return shapes_graph, source_map
+
+
+def validate_file(
+    input_file: str, shape_specs: Iterable[str] | None = None
+) -> ValidationResult:
+    data_graph = _load_graph(input_file)
+    data_graph = _normalize_schema_org_uris(data_graph)
+    shapes_graph, source_map = _load_shapes_graph(shape_specs)
+
+    conforms, report_graph, report_text = validate(
+        data_graph,
+        shacl_graph=shapes_graph,
+        inference="rdfs",
+        abort_on_first=False,
+        allow_infos=True,
+        allow_warnings=True,
+    )
+
+    sh = Namespace("http://www.w3.org/ns/shacl#")
+    warning_count = sum(1 for _ in report_graph.subjects(sh.resultSeverity, sh.Warning))
+
+    return ValidationResult(
+        conforms=conforms,
+        report_text=report_text,
+        report_graph=report_graph,
+        data_graph=data_graph,
+        shape_source_map=source_map,
+        warning_count=warning_count,
+    )
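
Note: validate_file accepts a local path or an http(s) URL and, when shape_specs is omitted, validates against every packaged shape; each spec may be a filesystem path or a packaged shape name, with the .ttl suffix optional. A minimal usage sketch (illustrative, not part of the diff; the input file name is hypothetical):

    # Usage sketch (illustrative, not part of the package diff).
    from wordlift_sdk.validation.shacl import list_shape_names, validate_file

    print(list_shape_names())  # packaged shapes, e.g. "google-recipe.ttl"

    # Validate a local JSON-LD file against one packaged shape; the ".ttl"
    # suffix is appended automatically for packaged shape names.
    result = validate_file("product.jsonld", shape_specs=["google-product-snippet"])
    print(f"conforms={result.conforms} warnings={result.warning_count}")
    if not result.conforms:
        print(result.report_text)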
wordlift_sdk/validation/shacls/__init__.py
@@ -0,0 +1 @@
+"""Packaged SHACL shapes."""