wordlift-sdk 2.9.1__py3-none-any.whl → 2.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. wordlift_sdk/__init__.py +1 -1
  2. wordlift_sdk/render/__init__.py +30 -0
  3. wordlift_sdk/render/browser.py +132 -0
  4. wordlift_sdk/render/cleanup_options.py +24 -0
  5. wordlift_sdk/render/html_renderer.py +86 -0
  6. wordlift_sdk/render/render_options.py +21 -0
  7. wordlift_sdk/render/rendered_page.py +13 -0
  8. wordlift_sdk/render/xhtml_cleaner.py +126 -0
  9. wordlift_sdk/structured_data/__init__.py +27 -0
  10. wordlift_sdk/structured_data/agent.py +49 -0
  11. wordlift_sdk/structured_data/agent_generator.py +12 -0
  12. wordlift_sdk/structured_data/batch.py +220 -0
  13. wordlift_sdk/structured_data/constants.py +1 -0
  14. wordlift_sdk/structured_data/dataset_resolver.py +32 -0
  15. wordlift_sdk/structured_data/debug.py +23 -0
  16. wordlift_sdk/structured_data/engine.py +2875 -0
  17. wordlift_sdk/structured_data/inputs.py +58 -0
  18. wordlift_sdk/structured_data/io.py +44 -0
  19. wordlift_sdk/structured_data/materialization.py +70 -0
  20. wordlift_sdk/structured_data/models.py +48 -0
  21. wordlift_sdk/structured_data/orchestrator.py +194 -0
  22. wordlift_sdk/structured_data/rendering.py +43 -0
  23. wordlift_sdk/structured_data/schema_guide.py +17 -0
  24. wordlift_sdk/structured_data/structured_data_engine.py +58 -0
  25. wordlift_sdk/structured_data/validation.py +31 -0
  26. wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
  27. wordlift_sdk/url_source/__init__.py +7 -2
  28. wordlift_sdk/validation/__init__.py +7 -0
  29. wordlift_sdk/validation/generator.py +446 -0
  30. wordlift_sdk/validation/shacl.py +205 -0
  31. wordlift_sdk/validation/shacls/__init__.py +1 -0
  32. wordlift_sdk/validation/shacls/google-article.ttl +148 -0
  33. wordlift_sdk/validation/shacls/google-book.ttl +660 -0
  34. wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
  35. wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
  36. wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
  37. wordlift_sdk/validation/shacls/google-course.ttl +43 -0
  38. wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
  39. wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
  40. wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
  41. wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
  42. wordlift_sdk/validation/shacls/google-event.ttl +46 -0
  43. wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
  44. wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
  45. wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
  46. wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
  47. wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
  48. wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
  49. wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
  50. wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
  51. wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
  52. wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
  53. wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
  54. wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
  55. wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
  56. wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
  57. wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
  58. wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
  59. wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
  60. wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
  61. wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
  62. wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
  63. wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
  64. wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
  65. wordlift_sdk/validation/shacls/google-video.ttl +149 -0
  66. wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
  67. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/METADATA +3 -1
  68. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/RECORD +69 -5
  69. {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,220 @@
1
+ """Batch generation for structured data using YARRRML mappings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import tempfile
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from pathlib import Path
10
+ from typing import Callable
11
+
12
+ from rdflib import Graph
13
+
14
+ from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
15
+ from wordlift_sdk.render import CleanupOptions, RenderOptions, clean_xhtml, render_html
16
+
17
+ from .io import normalize_output_format, serialize_graph, write_output
18
+ from .materialization import MaterializationPipeline
19
+
20
+
21
class BatchGenerator:
    """Processes a list of URLs to generate structured data outputs.

    For each URL the pipeline is: render the page to XHTML, clean it,
    normalize the YARRRML mapping, materialize JSON-LD, post-process the
    result, and write the serialized output to ``output_dir``. URLs are
    processed in batches by a thread pool; when ``concurrency`` is
    ``"auto"`` the worker count adapts to the observed HTTP status codes
    (backing off on throttling/server errors, ramping up on success).
    """

    def __init__(
        self,
        output_dir: Path,
        output_format: str,
        concurrency: str,
        headed: bool,
        timeout_ms: int,
        wait_until: str,
        max_xhtml_chars: int,
        max_text_node_chars: int,
        dataset_uri: str,
        verbose: bool,
    ) -> None:
        # Destination directory and serialization format for outputs.
        self._output_dir = output_dir
        self._output_format = output_format
        # Either a positive integer string or "auto" (adaptive pool size).
        self._concurrency = concurrency
        self._dataset_uri = dataset_uri
        self._verbose = verbose
        # Browser rendering settings passed through to RenderOptions.
        self._headed = headed
        self._timeout_ms = timeout_ms
        self._wait_until = wait_until
        # XHTML cleanup limits passed through to CleanupOptions.
        self._max_xhtml_chars = max_xhtml_chars
        self._max_text_node_chars = max_text_node_chars
        self._yarrrml = YarrrmlPipeline()
        self._materializer = MaterializationPipeline(self._yarrrml)

    def _resolve_workers(self) -> tuple[bool, int, int, int]:
        """Parse the configured concurrency setting.

        Returns:
            Tuple of ``(auto, min_workers, max_workers, initial_workers)``.

        Raises:
            RuntimeError: If the setting is neither ``"auto"`` nor a
                positive integer.
        """
        if self._concurrency.strip().lower() == "auto":
            # Adaptive mode: start at 4 workers, bounded to [2, 12].
            return True, 2, 12, min(12, max(2, 4))
        try:
            workers = int(self._concurrency)
        except ValueError as exc:
            raise RuntimeError("Concurrency must be an integer or 'auto'.") from exc
        if workers <= 0:
            raise RuntimeError("Concurrency must be greater than 0.")
        # Fixed mode: min == max == configured value.
        return False, workers, workers, workers

    def generate(
        self, urls: list[str], yarrrml: str, log: Callable[[str], None]
    ) -> dict[str, object]:
        """Run the batch pipeline for ``urls`` with the ``yarrrml`` mapping.

        Args:
            urls: Pages to render and materialize. Must be non-empty.
            yarrrml: YARRRML mapping source applied to every URL.
            log: Callback receiving per-step progress messages.

        Returns:
            Summary dict with ``format``, ``output_dir``, ``total``,
            ``success``, ``failed`` and ``errors`` keys.

        Raises:
            RuntimeError: If ``urls`` is empty or the concurrency
                setting is invalid.
        """
        if not urls:
            raise RuntimeError("No URLs provided for generation.")

        _, extension = normalize_output_format(self._output_format)
        self._output_dir.mkdir(parents=True, exist_ok=True)

        auto_concurrency, min_workers, max_workers, current_workers = (
            self._resolve_workers()
        )

        results: list[dict[str, object]] = []
        errors: list[dict[str, str]] = []

        with tempfile.TemporaryDirectory(prefix="structured-data-generate-") as tmp_dir:
            tmp_root = Path(tmp_dir)
            index = 0
            total = len(urls)
            if self._verbose:
                log(f"Processing {total} URLs...")
            # Imported lazily so tqdm is only required when generating.
            from tqdm import tqdm

            progress = tqdm(total=total, disable=not self._verbose)

            def _process_url(url: str) -> dict[str, object]:
                # Full per-URL pipeline; never raises — failures are
                # returned as {"ok": False, ...} records instead.
                status_code = None
                try:
                    step_start = time.perf_counter()
                    log(f"Start: {url}")
                    render_options = RenderOptions(
                        url=url,
                        headless=not self._headed,
                        timeout_ms=self._timeout_ms,
                        wait_until=self._wait_until,
                    )
                    rendered = render_html(render_options)
                    log(f"Rendered: {url} in {time.perf_counter() - step_start:.2f}s")
                    # status_code may be absent on the rendered page object.
                    status_code = getattr(rendered, "status_code", None)
                    step_start = time.perf_counter()
                    cleanup_options = CleanupOptions(
                        max_xhtml_chars=self._max_xhtml_chars,
                        max_text_node_chars=self._max_text_node_chars,
                    )
                    cleaned_xhtml = clean_xhtml(rendered.xhtml, cleanup_options)
                    log(
                        f"Cleaned XHTML: {url} in {time.perf_counter() - step_start:.2f}s"
                    )
                    basename = self._yarrrml.build_output_basename(url)
                    xhtml_path = tmp_root / f"{basename}.xhtml"
                    # utf-8 explicitly: the locale default encoding can
                    # fail on non-ASCII page content (e.g. on Windows).
                    xhtml_path.write_text(cleaned_xhtml, encoding="utf-8")
                    workdir = tmp_root / f"work-{basename}"
                    step_start = time.perf_counter()
                    normalized_yarrrml, mappings = self._materializer.normalize(
                        yarrrml,
                        url,
                        xhtml_path,
                        target_type=None,
                    )
                    log(
                        f"Normalized YARRRML: {url} in {time.perf_counter() - step_start:.2f}s"
                    )
                    step_start = time.perf_counter()
                    jsonld_raw = self._materializer.materialize(
                        normalized_yarrrml, xhtml_path, workdir, url=url
                    )
                    log(
                        f"Materialized JSON-LD: {url} in {time.perf_counter() - step_start:.2f}s"
                    )
                    step_start = time.perf_counter()
                    jsonld = self._materializer.postprocess(
                        jsonld_raw,
                        mappings,
                        cleaned_xhtml,
                        self._dataset_uri,
                        url,
                        target_type=None,
                    )
                    log(
                        f"Postprocessed JSON-LD: {url} in {time.perf_counter() - step_start:.2f}s"
                    )
                    step_start = time.perf_counter()
                    # Round-trip through rdflib to validate the JSON-LD
                    # and to support non-JSON-LD serializations.
                    graph = Graph()
                    graph.parse(data=json.dumps(jsonld), format="json-ld")
                    self._yarrrml.ensure_no_blank_nodes(graph)
                    output_path = self._output_dir / f"{basename}.{extension}"
                    if self._output_format.lower() in {"jsonld", "json-ld"}:
                        write_output(output_path, json.dumps(jsonld, indent=2))
                    else:
                        serialized = serialize_graph(graph, self._output_format)
                        write_output(output_path, serialized)
                    log(
                        f"Wrote output: {url} in {time.perf_counter() - step_start:.2f}s"
                    )
                    return {
                        "ok": True,
                        "url": url,
                        "status_code": status_code,
                        "output": str(output_path),
                    }
                except Exception as exc:
                    log(f"Failed: {url} with {exc}")
                    return {
                        "ok": False,
                        "url": url,
                        "status_code": status_code,
                        "error": str(exc),
                    }

            try:
                while index < total:
                    batch = urls[index : index + current_workers]
                    if not batch:
                        break
                    batch_results: list[dict[str, object]] = []
                    with ThreadPoolExecutor(max_workers=current_workers) as executor:
                        futures = {
                            executor.submit(_process_url, url): url for url in batch
                        }
                        for future in as_completed(futures):
                            result = future.result()
                            batch_results.append(result)
                            progress.update(1)
                            if not result.get("ok"):
                                errors.append(
                                    {
                                        "url": str(result.get("url")),
                                        "error": str(result.get("error")),
                                    }
                                )
                    results.extend(batch_results)

                    if auto_concurrency:
                        buckets = {
                            self._status_bucket(item.get("status_code"))
                            for item in batch_results
                        }
                        # Back off on throttling/server errors or failures;
                        # ramp up only when the whole batch succeeded.
                        if buckets & {"throttle", "server_error", "error"}:
                            current_workers = max(min_workers, current_workers - 1)
                        elif buckets == {"ok"}:
                            current_workers = min(max_workers, current_workers + 1)
                    index += len(batch)
            finally:
                # Always close the progress bar, even if a worker raised.
                progress.close()

        return {
            "format": self._output_format,
            "output_dir": str(self._output_dir),
            "total": len(urls),
            "success": sum(1 for item in results if item.get("ok")),
            "failed": sum(1 for item in results if not item.get("ok")),
            "errors": errors,
        }

    def _status_bucket(self, status_code: int | None) -> str:
        """Classify an HTTP status for the adaptive-concurrency policy."""
        if status_code is None:
            # No status (render failed before a response) counts as error.
            return "error"
        if status_code == 429:
            return "throttle"
        if 500 <= status_code < 600:
            return "server_error"
        if 200 <= status_code < 400:
            return "ok"
        return "client_error"
@@ -0,0 +1 @@
1
# Default WordLift API endpoint, used when callers do not supply a base_url.
DEFAULT_BASE_URL = "https://api.wordlift.io"
@@ -0,0 +1,32 @@
1
+ """Dataset resolution helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from wordlift_client import ApiClient
6
+
7
+ from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
8
+
9
+ from .engine import (
10
+ _build_agent_client,
11
+ _build_client,
12
+ get_dataset_uri,
13
+ get_dataset_uri_async,
14
+ )
15
+
16
+
17
class DatasetResolver:
    """Resolves dataset URIs and builds API clients.

    A thin facade over the engine's module-level helpers, giving callers
    a stable object interface instead of private functions.
    """

    def build_client(self, api_key: str, base_url: str) -> ApiClient:
        """Build an API client for the given base URL."""
        client = _build_client(api_key, base_url)
        return client

    def build_agent_client(self, api_key: str) -> ApiClient:
        """Build an API client for the agent endpoint."""
        agent_client = _build_agent_client(api_key)
        return agent_client

    async def get_dataset_uri_async(
        self, api_key: str, base_url: str = DEFAULT_BASE_URL
    ) -> str:
        """Asynchronously resolve the dataset URI for ``api_key``."""
        dataset_uri = await get_dataset_uri_async(api_key, base_url)
        return dataset_uri

    def get_dataset_uri(self, api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
        """Resolve the dataset URI for ``api_key``."""
        dataset_uri = get_dataset_uri(api_key, base_url)
        return dataset_uri
@@ -0,0 +1,23 @@
1
+ """Debug output helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Callable
8
+
9
+
10
def echo_debug(debug_path: Path, log: Callable[[str], None]) -> None:
    """Echo the agent debug payload stored at ``debug_path`` via ``log``.

    The file is expected to contain a JSON object with ``prompt`` and
    ``response`` keys. A missing file is silently ignored; a file that
    cannot be read or parsed (or that does not hold a JSON object) falls
    back to logging the file location instead of raising.

    Args:
        debug_path: Location of the JSON debug dump.
        log: Callback receiving one output line per call.
    """
    if not debug_path.exists():
        return
    try:
        # utf-8 explicitly: the locale default encoding is platform-dependent.
        payload = json.loads(debug_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError.
        # Best effort: point at the file rather than fail the run.
        log(f"Debug output written to {debug_path}")
        return
    if not isinstance(payload, dict):
        # A JSON scalar/array has no prompt/response to echo.
        log(f"Debug output written to {debug_path}")
        return
    prompt = payload.get("prompt", "")
    response = payload.get("response")
    log("--- Agent prompt ---")
    log(prompt)
    log("--- Agent response ---")
    log(json.dumps(response, indent=2))