wordlift-sdk 2.9.0__py3-none-any.whl → 2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. wordlift_sdk/__init__.py +1 -1
  2. wordlift_sdk/render/__init__.py +30 -0
  3. wordlift_sdk/render/browser.py +132 -0
  4. wordlift_sdk/render/cleanup_options.py +24 -0
  5. wordlift_sdk/render/html_renderer.py +86 -0
  6. wordlift_sdk/render/render_options.py +21 -0
  7. wordlift_sdk/render/rendered_page.py +13 -0
  8. wordlift_sdk/render/xhtml_cleaner.py +126 -0
  9. wordlift_sdk/structured_data/__init__.py +27 -0
  10. wordlift_sdk/structured_data/agent.py +49 -0
  11. wordlift_sdk/structured_data/agent_generator.py +12 -0
  12. wordlift_sdk/structured_data/batch.py +220 -0
  13. wordlift_sdk/structured_data/constants.py +1 -0
  14. wordlift_sdk/structured_data/dataset_resolver.py +32 -0
  15. wordlift_sdk/structured_data/debug.py +23 -0
  16. wordlift_sdk/structured_data/engine.py +2875 -0
  17. wordlift_sdk/structured_data/inputs.py +58 -0
  18. wordlift_sdk/structured_data/io.py +44 -0
  19. wordlift_sdk/structured_data/materialization.py +70 -0
  20. wordlift_sdk/structured_data/models.py +48 -0
  21. wordlift_sdk/structured_data/orchestrator.py +194 -0
  22. wordlift_sdk/structured_data/rendering.py +43 -0
  23. wordlift_sdk/structured_data/schema_guide.py +17 -0
  24. wordlift_sdk/structured_data/structured_data_engine.py +58 -0
  25. wordlift_sdk/structured_data/validation.py +31 -0
  26. wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
  27. wordlift_sdk/url_source/__init__.py +7 -2
  28. wordlift_sdk/validation/__init__.py +7 -0
  29. wordlift_sdk/validation/generator.py +446 -0
  30. wordlift_sdk/validation/shacl.py +205 -0
  31. wordlift_sdk/validation/shacls/__init__.py +1 -0
  32. wordlift_sdk/validation/shacls/google-article.ttl +148 -0
  33. wordlift_sdk/validation/shacls/google-book.ttl +660 -0
  34. wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
  35. wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
  36. wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
  37. wordlift_sdk/validation/shacls/google-course.ttl +43 -0
  38. wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
  39. wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
  40. wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
  41. wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
  42. wordlift_sdk/validation/shacls/google-event.ttl +46 -0
  43. wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
  44. wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
  45. wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
  46. wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
  47. wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
  48. wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
  49. wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
  50. wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
  51. wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
  52. wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
  53. wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
  54. wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
  55. wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
  56. wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
  57. wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
  58. wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
  59. wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
  60. wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
  61. wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
  62. wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
  63. wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
  64. wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
  65. wordlift_sdk/validation/shacls/google-video.ttl +149 -0
  66. wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
  67. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/METADATA +1 -1
  68. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/RECORD +69 -5
  69. {wordlift_sdk-2.9.0.dist-info → wordlift_sdk-2.10.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,58 @@
1
+ """Input resolution helpers for structured data workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from urllib.parse import urlparse
7
+
8
+
9
def is_url(value: str) -> bool:
    """Return True when *value* parses as an http(s) URL."""
    return urlparse(value).scheme in ("http", "https")
12
+
13
+
14
def urls_from_sitemap(source: str) -> list[str]:
    """Read *source* (sitemap URL or path) with advertools and return its URLs.

    Prefers a ``loc`` or ``url`` column, keeping only non-empty values; falls
    back to the first column. Raises RuntimeError when advertools is missing.
    """
    try:
        import advertools as adv
    except ImportError as exc:  # pragma: no cover - runtime dependency
        raise RuntimeError(
            "advertools is required. Install with: pip install advertools"
        ) from exc
    frame = adv.sitemap_to_df(source)
    if frame is None or frame.empty:
        return []
    for candidate in ("loc", "url"):
        if candidate not in frame.columns:
            continue
        entries = frame[candidate].dropna().astype(str).tolist()
        return [entry for entry in entries if entry]
    # No recognized URL column: take the first column as-is.
    return frame.iloc[:, 0].dropna().astype(str).tolist()
29
+
30
+
31
def resolve_input_urls(value: str) -> list[str]:
    """Resolve *value* (sitemap file, sitemap URL, or page URL) to URL list.

    A local file must parse to at least one URL; a remote URL is tried as a
    sitemap best-effort and otherwise treated as a single page URL. Anything
    else raises RuntimeError.
    """
    candidate = Path(value)
    if candidate.exists():
        found = urls_from_sitemap(str(candidate))
        if found:
            return found
        raise RuntimeError("No URLs found in sitemap file.")
    if not is_url(value):
        raise RuntimeError("INPUT must be a sitemap URL/path or a page URL.")
    # Best-effort sitemap parse; on any failure fall back to the page URL.
    try:
        found = urls_from_sitemap(value)
    except Exception:
        found = []
    return found or [value]
47
+
48
+
49
def filter_urls(urls: list[str], regex: str, max_pages: int | None) -> list[str]:
    """Keep URLs matching *regex* (``re.search``), capped at *max_pages*.

    Raises RuntimeError when nothing matches.
    """
    import re

    matcher = re.compile(regex).search
    matched = [url for url in urls if matcher(url)]
    if not matched:
        raise RuntimeError("No URLs matched the provided regex.")
    return matched if max_pages is None else matched[:max_pages]
@@ -0,0 +1,44 @@
1
+ """Output utilities for structured data workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from rdflib import Graph
8
+
9
+
10
# Maps a user-facing output-format key to a pair of
# (rdflib serialization format name, output file extension).
# "jsonld" and "json-ld" are aliases for the same JSON-LD output;
# normalize_output_format() hides the alias when listing supported keys.
_OUTPUT_FORMATS: dict[str, tuple[str, str]] = {
    "ttl": ("turtle", "ttl"),
    "jsonld": ("json-ld", "jsonld"),
    "json-ld": ("json-ld", "jsonld"),
    "rdf": ("xml", "rdf"),
    "nt": ("nt", "nt"),
    "nq": ("nquads", "nq"),
}
18
+
19
+
20
def write_output(path: Path, content: str) -> None:
    """Write *content* to *path*, creating parent directories as needed.

    Text is written as UTF-8 explicitly: ``Path.write_text`` otherwise uses
    the platform's locale encoding, which can corrupt non-ASCII characters in
    JSON-LD/Turtle output on some systems.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
23
+
24
+
25
def default_output_paths(out_dir: Path, base_name: str) -> tuple[Path, Path]:
    """Return the default (JSON-LD path, YARRRML path) under *out_dir*.

    NOTE(review): the mapping extension is ".yarrml" (single "r") — it matches
    the rest of the package's naming; confirm before normalizing it.
    """
    return (
        out_dir / (base_name + ".jsonld"),
        out_dir / (base_name + ".yarrml"),
    )
29
+
30
+
31
def normalize_output_format(value: str) -> tuple[str, str]:
    """Map a user-supplied format name to (rdflib format, file extension).

    Lookup is case-insensitive and whitespace-tolerant; unknown names raise
    RuntimeError listing the supported canonical keys (aliases hidden).
    """
    spec = _OUTPUT_FORMATS.get(value.strip().lower())
    if spec is None:
        canonical = sorted({name for name in _OUTPUT_FORMATS if "-" not in name})
        raise RuntimeError(
            f"Unsupported format '{value}'. Choose from: {', '.join(canonical)}."
        )
    return spec
37
+
38
+
39
def serialize_graph(graph: Graph, output_format: str) -> str:
    """Serialize *graph* as *output_format*, always returning text."""
    rdflib_format, _extension = normalize_output_format(output_format)
    payload = graph.serialize(format=rdflib_format)
    # Older rdflib versions returned bytes from serialize(); normalize to str.
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")
    return payload
@@ -0,0 +1,70 @@
1
+ """Materialization pipeline for YARRRML -> JSON-LD."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
8
+
9
+
10
class MaterializationPipeline:
    """Normalizes mappings, materializes JSON-LD, and post-processes output.

    Thin orchestration wrapper over a YarrrmlPipeline; each stage is exposed
    separately so callers can run or test stages in isolation, and run()
    chains them in order.
    """

    def __init__(self, pipeline: YarrrmlPipeline | None = None) -> None:
        # Falls back to a default YarrrmlPipeline when none is injected.
        self._pipeline = pipeline or YarrrmlPipeline()

    def normalize(
        self, yarrrml: str, url: str, xhtml_path: Path, target_type: str | None
    ) -> tuple[str, list[dict]]:
        """Normalize raw YARRRML; returns (normalized YARRRML, mapping dicts)."""
        return self._pipeline.normalize_mappings(
            yarrrml, url, xhtml_path, target_type=target_type
        )

    def materialize(
        self, normalized_yarrrml: str, xhtml_path: Path, workdir: Path, url: str
    ) -> dict:
        """Materialize raw JSON-LD from normalized YARRRML and the XHTML file."""
        return self._pipeline.materialize_jsonld(
            normalized_yarrrml, xhtml_path, workdir, url=url
        )

    def postprocess(
        self,
        jsonld_raw: dict,
        mappings: list[dict],
        cleaned_xhtml: str,
        dataset_uri: str,
        url: str,
        target_type: str | None,
    ) -> dict:
        """Post-process raw materialized JSON-LD into the final document."""
        return self._pipeline.postprocess_jsonld(
            jsonld_raw,
            mappings,
            cleaned_xhtml,
            dataset_uri,
            url,
            target_type=target_type,
        )

    def run(
        self,
        yarrrml: str,
        url: str,
        cleaned_xhtml: str,
        dataset_uri: str,
        xhtml_path: Path,
        workdir: Path,
        target_type: str | None,
    ) -> tuple[dict, list[dict]]:
        """Run normalize -> materialize -> postprocess; returns (jsonld, mappings)."""
        normalized_yarrrml, mappings = self.normalize(
            yarrrml, url, xhtml_path, target_type=target_type
        )
        jsonld_raw = self.materialize(normalized_yarrrml, xhtml_path, workdir, url=url)
        jsonld = self.postprocess(
            jsonld_raw,
            mappings,
            cleaned_xhtml,
            dataset_uri,
            url,
            target_type=target_type,
        )
        return jsonld, mappings
@@ -0,0 +1,48 @@
1
+ """Request/response models for structured data workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
@dataclass
class CreateRequest:
    """Inputs for the single-URL structured data workflow (CreateWorkflow.run)."""

    url: str  # page to render and extract structured data from
    target_type: str  # type passed to the generation agent and SHACL shape lookup
    output_dir: Path  # root for outputs and the ".structured-data" workdir
    base_name: str  # base filename used when explicit output paths are not given
    jsonld_path: Path | None  # explicit JSON-LD output path (None -> default path)
    yarrml_path: Path | None  # explicit YARRRML output path (None -> default path)
    api_key: str | None  # WordLift API key; the workflow raises when missing
    base_url: str | None  # API base URL override (None -> DEFAULT_BASE_URL)
    debug: bool  # echo agent debug output (agent_debug.json) to the log
    headed: bool  # run the browser headed (workflow passes headless=not headed)
    timeout_ms: int  # render timeout in milliseconds
    max_retries: int  # retry budget forwarded to agent generation
    quality_check: bool  # enable the agent's quality-check pass
    max_xhtml_chars: int  # cap on cleaned-XHTML size fed to the prompt
    max_text_node_chars: int  # cap on individual text-node size during cleanup
    max_nesting_depth: int  # nesting-depth cap forwarded to agent generation
    verbose: bool  # relay extra warnings (e.g. mapping validation notes)
    validate: bool  # run SHACL validation on the JSON-LD output
    wait_until: str  # page-load wait condition used by the renderer
30
+
31
+
32
@dataclass
class GenerateRequest:
    """Inputs for the batch workflow (GenerateWorkflow.run) driven by YARRRML."""

    input_value: str  # sitemap URL/path or a single page URL to expand
    yarrrml_path: Path  # existing YARRRML mapping file to apply to every page
    regex: str  # only URLs matching this pattern (re.search) are processed
    output_dir: Path  # directory for batch outputs
    output_format: str  # output serialization key (see io._OUTPUT_FORMATS)
    concurrency: str  # concurrency setting forwarded to BatchGenerator — TODO confirm expected format
    api_key: str | None  # WordLift API key; required to resolve the dataset URI
    base_url: str | None  # API base URL override (None -> DEFAULT_BASE_URL)
    headed: bool  # run the browser headed
    timeout_ms: int  # render timeout in milliseconds
    wait_until: str  # page-load wait condition used by the renderer
    max_xhtml_chars: int  # cap on cleaned-XHTML size
    max_text_node_chars: int  # cap on individual text-node size during cleanup
    max_pages: int | None  # process at most this many URLs (None = no cap)
    verbose: bool  # extra logging in the batch generator
@@ -0,0 +1,194 @@
1
+ """Orchestration for structured data workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Callable
7
+
8
+ from wordlift_sdk.structured_data.engine import (
9
+ StructuredDataOptions,
10
+ StructuredDataResult,
11
+ )
12
+ from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
13
+ from wordlift_sdk.structured_data.structured_data_engine import StructuredDataEngine
14
+ from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
15
+
16
+ from .agent import AgentGenerator
17
+ from .batch import BatchGenerator
18
+ from .debug import echo_debug
19
+ from .io import default_output_paths, write_output
20
+ from .inputs import filter_urls, resolve_input_urls
21
+ from .models import CreateRequest, GenerateRequest
22
+ from .rendering import RenderPipeline
23
+ from .validation import ValidationService
24
+
25
+
26
class CreateWorkflow:
    """Workflow for generating structured data from a single URL."""

    def __init__(
        self,
        agent: AgentGenerator | None = None,
        renderer: RenderPipeline | None = None,
        validator: ValidationService | None = None,
        engine: StructuredDataEngine | None = None,
    ) -> None:
        # Collaborators are injectable for testing. A renderer of None means
        # "build one per run() from the request's render settings".
        self._agent = agent or AgentGenerator()
        self._renderer = renderer
        self._validator = validator or ValidationService()
        self._engine = engine or StructuredDataEngine()
        self._yarrrml = YarrrmlPipeline()

    def run(
        self, request: CreateRequest, log: Callable[[str], None]
    ) -> StructuredDataResult:
        """Render the page, generate YARRRML + JSON-LD, and write both outputs.

        Returns a StructuredDataResult holding the JSON-LD document, the
        reusable YARRRML mapping, and the filenames both were written to.
        Raises RuntimeError when the API key is missing. *log* receives
        progress messages and, when requested, debug/validation reports.
        """
        if not request.api_key:
            raise RuntimeError(
                "WORDLIFT_KEY is required (or set wordlift.api_key in config)."
            )
        base_url = request.base_url or DEFAULT_BASE_URL
        dataset_uri = self._engine.get_dataset_uri(request.api_key, base_url=base_url)

        # Use the injected renderer when available, else build one from the
        # request's render settings.
        renderer = self._renderer or RenderPipeline(
            headed=request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
        )

        rendered, cleaned_xhtml = renderer.render(request.url, log)

        options = StructuredDataOptions(
            url=request.url,
            target_type=request.target_type,
            dataset_uri=dataset_uri,
            headless=not request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_retries=request.max_retries,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
            max_nesting_depth=request.max_nesting_depth,
            verbose=request.verbose,
        )

        # The agent writes intermediate artifacts (debug dump, mapping
        # validation report) under this hidden workdir.
        workdir = request.output_dir / ".structured-data"
        debug_path = workdir / "agent_debug.json"
        try:
            log("Generating YARRRML mapping and JSON-LD...")
            yarrml, jsonld = self._agent.generate(
                options.url,
                rendered.html,
                rendered.xhtml,
                cleaned_xhtml,
                request.api_key,
                options.dataset_uri,
                options.target_type,
                workdir,
                debug=request.debug,
                max_retries=options.max_retries,
                max_nesting_depth=options.max_nesting_depth,
                quality_check=request.quality_check,
                log=log,
            )
        except Exception:
            # Surface the agent debug dump on failure too, then re-raise.
            if request.debug:
                echo_debug(debug_path, log)
            raise
        if request.debug:
            echo_debug(debug_path, log)

        # Fall back to the default output paths when either path is missing.
        jsonld_path = request.jsonld_path
        yarrml_path = request.yarrml_path
        if jsonld_path is None or yarrml_path is None:
            jsonld_path, yarrml_path = default_output_paths(
                request.output_dir, request.base_name
            )

        write_output(jsonld_path, json.dumps(jsonld, indent=2))
        # Rewrite the mapping so it can be reused against other URLs.
        yarrml = self._yarrrml.make_reusable_yarrrml(yarrml, request.url)
        write_output(yarrml_path, yarrml)

        if request.verbose:
            # Relay "reviewRating dropped" warnings from the agent's mapping
            # validation report, when one was produced.
            mapping_validation_path = workdir / "mapping.validation.json"
            if mapping_validation_path.exists():
                try:
                    validation_payload = json.loads(mapping_validation_path.read_text())
                except Exception:
                    validation_payload = {}
                for warning in validation_payload.get("warnings", []):
                    if "reviewRating dropped" in warning:
                        log(warning)

        if request.validate:
            log("Validating JSON-LD output...")
            report_text = self._validator.validate(
                jsonld_path, request.target_type, workdir
            )
            log(report_text)

        return StructuredDataResult(
            jsonld=jsonld,
            yarrml=yarrml,
            jsonld_filename=str(jsonld_path),
            yarrml_filename=str(yarrml_path),
        )
137
+
138
+
139
class GenerateWorkflow:
    """Workflow for generating structured data in batch from YARRRML."""

    def __init__(self, engine: StructuredDataEngine | None = None) -> None:
        self._engine = engine or StructuredDataEngine()

    def run(
        self, request: GenerateRequest, log: Callable[[str], None]
    ) -> dict[str, object]:
        """Run batch generation over all resolved URLs; returns the summary dict.

        Raises RuntimeError when the API key is missing (or the dataset URI
        cannot be resolved), when the YARRRML file does not exist, or when
        input resolution / regex filtering yields no URLs.
        """
        base_url = request.base_url or DEFAULT_BASE_URL
        # Only resolve a dataset URI when a key is present; either a missing
        # key or an empty resolved URI is reported as a missing-key error.
        dataset_uri = (
            self._engine.get_dataset_uri(request.api_key, base_url=base_url)
            if request.api_key
            else None
        )
        if not dataset_uri:
            raise RuntimeError(
                "WORDLIFT_KEY is required (or set wordlift.api_key in config)."
            )
        if not request.yarrrml_path.exists():
            raise RuntimeError(f"YARRRML file not found: {request.yarrrml_path}")
        yarrrml = request.yarrrml_path.read_text()

        # Expand the input (sitemap URL/path or page URL) into page URLs, then
        # apply the regex filter and the optional max-pages cap.
        urls = resolve_input_urls(request.input_value)
        urls = filter_urls(urls, request.regex, request.max_pages)

        batch = BatchGenerator(
            output_dir=request.output_dir,
            output_format=request.output_format,
            concurrency=request.concurrency,
            headed=request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
            dataset_uri=dataset_uri,
            verbose=request.verbose,
        )
        summary = batch.generate(urls, yarrrml, log)
        # Echo the original input back so callers can correlate summaries.
        summary["input"] = request.input_value
        return summary
180
+
181
+
182
def resolve_api_key_from_context(
    ctx_config: object | None, env_key: str = "WORDLIFT_KEY"
) -> str | None:
    """Resolve the WordLift API key: context config first, then the environment.

    Looks up ``wordlift.api_key`` on *ctx_config* when one is provided; a
    lookup error or an empty value falls through to ``os.environ[env_key]``.
    Returns None when neither source yields a value.
    """
    if ctx_config is not None:
        try:
            value = ctx_config.get("wordlift.api_key")
        except Exception:
            # A broken config object must not mask the environment variable.
            # (Previously an exception here returned None outright, silently
            # skipping the env fallback.)
            value = None
        if value:
            return value
    import os

    return os.environ.get(env_key)
@@ -0,0 +1,43 @@
1
+ """Rendering and cleanup pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable
6
+
7
+ from wordlift_sdk.render import CleanupOptions, RenderOptions, clean_xhtml, render_html
8
+
9
+
10
class RenderPipeline:
    """Renders a page and cleans XHTML for structured data generation."""

    def __init__(
        self,
        headed: bool,
        timeout_ms: int,
        wait_until: str,
        max_xhtml_chars: int,
        max_text_node_chars: int,
    ) -> None:
        self._headed = headed
        self._timeout_ms = timeout_ms
        self._wait_until = wait_until
        self._max_xhtml_chars = max_xhtml_chars
        self._max_text_node_chars = max_text_node_chars

    def render(self, url: str, log: Callable[[str], None]) -> tuple[object, str]:
        """Render *url* and return (rendered page, cleaned XHTML string)."""
        log("Rendering page with Playwright...")
        page = render_html(
            RenderOptions(
                url=url,
                headless=not self._headed,
                timeout_ms=self._timeout_ms,
                wait_until=self._wait_until,
            )
        )

        log("Cleaning XHTML for prompt usage...")
        cleaned = clean_xhtml(
            page.xhtml,
            CleanupOptions(
                max_xhtml_chars=self._max_xhtml_chars,
                max_text_node_chars=self._max_text_node_chars,
            ),
        )
        return page, cleaned
@@ -0,0 +1,17 @@
1
+ """Schema.org shape and property guidance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .engine import normalize_type, shape_specs_for_type
6
+
7
+
8
class SchemaGuide:
    """Builds schema.org property guides and shape specs."""

    def normalize_type(self, value: str) -> str:
        """Delegate to the engine's type normalization."""
        return normalize_type(value)

    def shape_specs_for_type(self, type_name: str | None) -> list[str]:
        """Return shape specs for *type_name*; empty when no type is given."""
        return shape_specs_for_type(type_name) if type_name else []
@@ -0,0 +1,58 @@
1
+ """OOP facade for structured data utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
6
+
7
+ from .agent_generator import AgentGenerator
8
+ from .dataset_resolver import DatasetResolver
9
+ from .schema_guide import SchemaGuide
10
+ from .yarrrml_pipeline import YarrrmlPipeline
11
+
12
+
13
class StructuredDataEngine:
    """OOP facade for structured data utilities.

    Thin delegation layer over DatasetResolver, SchemaGuide, YarrrmlPipeline
    and AgentGenerator; each collaborator can be injected for testing.
    """

    def __init__(
        self,
        dataset: DatasetResolver | None = None,
        schema: SchemaGuide | None = None,
        yarrrml: YarrrmlPipeline | None = None,
        agent: AgentGenerator | None = None,
    ) -> None:
        self.dataset = dataset or DatasetResolver()
        self.schema = schema or SchemaGuide()
        self.yarrrml = yarrrml or YarrrmlPipeline()
        self.agent = agent or AgentGenerator()

    def get_dataset_uri(self, api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
        """Resolve the dataset URI for *api_key* via the dataset resolver."""
        return self.dataset.get_dataset_uri(api_key, base_url)

    async def get_dataset_uri_async(
        self, api_key: str, base_url: str = DEFAULT_BASE_URL
    ) -> str:
        """Async variant of :meth:`get_dataset_uri`."""
        return await self.dataset.get_dataset_uri_async(api_key, base_url)

    # The remaining methods are pure pass-throughs; see the delegate for the
    # actual signature and semantics.

    def generate_from_agent(self, *args, **kwargs):
        """Delegate to ``self.agent.generate_from_agent``."""
        return self.agent.generate_from_agent(*args, **kwargs)

    def normalize_yarrrml_mappings(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.normalize_mappings``."""
        return self.yarrrml.normalize_mappings(*args, **kwargs)

    def materialize_yarrrml_jsonld(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.materialize_jsonld``."""
        return self.yarrrml.materialize_jsonld(*args, **kwargs)

    def postprocess_jsonld(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.postprocess_jsonld``."""
        return self.yarrrml.postprocess_jsonld(*args, **kwargs)

    def make_reusable_yarrrml(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.make_reusable_yarrrml``."""
        return self.yarrrml.make_reusable_yarrrml(*args, **kwargs)

    def ensure_no_blank_nodes(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.ensure_no_blank_nodes``."""
        return self.yarrrml.ensure_no_blank_nodes(*args, **kwargs)

    def build_output_basename(self, *args, **kwargs):
        """Delegate to ``self.yarrrml.build_output_basename``."""
        return self.yarrrml.build_output_basename(*args, **kwargs)

    def shape_specs_for_type(self, *args, **kwargs):
        """Delegate to ``self.schema.shape_specs_for_type``."""
        return self.schema.shape_specs_for_type(*args, **kwargs)
@@ -0,0 +1,31 @@
1
+ """SHACL validation helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from wordlift_sdk.structured_data.schema_guide import SchemaGuide
9
+ from wordlift_sdk.validation.shacl import validate_file
10
+
11
+
12
class ValidationService:
    """Validates JSON-LD outputs with SHACL shapes."""

    def __init__(self, schema: SchemaGuide | None = None) -> None:
        self._schema = schema or SchemaGuide()

    def validate(self, jsonld_path: Path, target_type: str, workdir: Path) -> str:
        """Validate *jsonld_path* against the SHACL shapes for *target_type*.

        Writes a machine-readable summary to ``<workdir>/jsonld.validation.json``
        and returns the human-readable report text.
        """
        shape_specs = self._schema.shape_specs_for_type(target_type)
        result = validate_file(str(jsonld_path), shape_specs=shape_specs)
        report = {
            "conforms": result.conforms,
            "warning_count": result.warning_count,
            "report_text": result.report_text,
        }
        # Explicit UTF-8: the report may contain non-ASCII text, and
        # Path.write_text otherwise uses the platform default encoding.
        (workdir / "jsonld.validation.json").write_text(
            json.dumps(report, indent=2), encoding="utf-8"
        )
        return result.report_text
@@ -0,0 +1,34 @@
1
+ """YARRRML to JSON-LD pipeline helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .engine import (
6
+ build_output_basename,
7
+ ensure_no_blank_nodes,
8
+ make_reusable_yarrrml,
9
+ materialize_yarrrml_jsonld,
10
+ normalize_yarrrml_mappings,
11
+ postprocess_jsonld,
12
+ )
13
+
14
+
15
class YarrrmlPipeline:
    """YARRRML -> JSON-LD pipeline helpers.

    Thin OOP wrapper over the module-level engine functions so the pipeline
    can be injected and replaced in tests.
    """

    def normalize_mappings(self, *args, **kwargs):
        """Delegate to ``engine.normalize_yarrrml_mappings``."""
        return normalize_yarrrml_mappings(*args, **kwargs)

    def materialize_jsonld(self, *args, **kwargs):
        """Delegate to ``engine.materialize_yarrrml_jsonld``."""
        return materialize_yarrrml_jsonld(*args, **kwargs)

    def postprocess_jsonld(self, *args, **kwargs):
        """Delegate to ``engine.postprocess_jsonld``."""
        return postprocess_jsonld(*args, **kwargs)

    def ensure_no_blank_nodes(self, *args, **kwargs):
        """Delegate to ``engine.ensure_no_blank_nodes``."""
        return ensure_no_blank_nodes(*args, **kwargs)

    def make_reusable_yarrrml(self, *args, **kwargs):
        """Delegate to ``engine.make_reusable_yarrrml``."""
        return make_reusable_yarrrml(*args, **kwargs)

    def build_output_basename(self, *args, **kwargs):
        """Delegate to ``engine.build_output_basename``."""
        return build_output_basename(*args, **kwargs)
@@ -1,6 +1,11 @@
1
+ __all__ = [
2
+ "Url",
3
+ "UrlSource",
4
+ "GoogleSheetsUrlSource",
5
+ "ListUrlSource",
6
+ "SitemapUrlSource",
7
+ ]
1
8
  from .google_sheets_url_source import GoogleSheetsUrlSource
2
9
  from .list_url_source import ListUrlSource
3
10
  from .sitemap_url_source import SitemapUrlSource
4
11
  from .url_source import UrlSource, Url
5
-
6
- __all__ = ["Url", "UrlSource", "GoogleSheetsUrlSource", "ListUrlSource", "SitemapUrlSource"]
@@ -0,0 +1,7 @@
1
+ """Validation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .shacl import ValidationResult, list_shape_names, validate_file
6
+
7
+ __all__ = ["ValidationResult", "list_shape_names", "validate_file"]