wordlift-sdk 2.9.1-py3-none-any.whl → 2.10.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
- wordlift_sdk/__init__.py +1 -1
- wordlift_sdk/render/__init__.py +30 -0
- wordlift_sdk/render/browser.py +132 -0
- wordlift_sdk/render/cleanup_options.py +24 -0
- wordlift_sdk/render/html_renderer.py +86 -0
- wordlift_sdk/render/render_options.py +21 -0
- wordlift_sdk/render/rendered_page.py +13 -0
- wordlift_sdk/render/xhtml_cleaner.py +126 -0
- wordlift_sdk/structured_data/__init__.py +27 -0
- wordlift_sdk/structured_data/agent.py +49 -0
- wordlift_sdk/structured_data/agent_generator.py +12 -0
- wordlift_sdk/structured_data/batch.py +220 -0
- wordlift_sdk/structured_data/constants.py +1 -0
- wordlift_sdk/structured_data/dataset_resolver.py +32 -0
- wordlift_sdk/structured_data/debug.py +23 -0
- wordlift_sdk/structured_data/engine.py +2875 -0
- wordlift_sdk/structured_data/inputs.py +58 -0
- wordlift_sdk/structured_data/io.py +44 -0
- wordlift_sdk/structured_data/materialization.py +70 -0
- wordlift_sdk/structured_data/models.py +48 -0
- wordlift_sdk/structured_data/orchestrator.py +194 -0
- wordlift_sdk/structured_data/rendering.py +43 -0
- wordlift_sdk/structured_data/schema_guide.py +17 -0
- wordlift_sdk/structured_data/structured_data_engine.py +58 -0
- wordlift_sdk/structured_data/validation.py +31 -0
- wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
- wordlift_sdk/url_source/__init__.py +7 -2
- wordlift_sdk/validation/__init__.py +7 -0
- wordlift_sdk/validation/generator.py +446 -0
- wordlift_sdk/validation/shacl.py +205 -0
- wordlift_sdk/validation/shacls/__init__.py +1 -0
- wordlift_sdk/validation/shacls/google-article.ttl +148 -0
- wordlift_sdk/validation/shacls/google-book.ttl +660 -0
- wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
- wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
- wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
- wordlift_sdk/validation/shacls/google-course.ttl +43 -0
- wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
- wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
- wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
- wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
- wordlift_sdk/validation/shacls/google-event.ttl +46 -0
- wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
- wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
- wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
- wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
- wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
- wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
- wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
- wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
- wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
- wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
- wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
- wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
- wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
- wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
- wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
- wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
- wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
- wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
- wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
- wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
- wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
- wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
- wordlift_sdk/validation/shacls/google-video.ttl +149 -0
- wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/METADATA +3 -1
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/RECORD +69 -5
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/WHEEL +0 -0
wordlift_sdk/structured_data/batch.py

@@ -0,0 +1,220 @@
+"""Batch generation for structured data using YARRRML mappings."""
+
+from __future__ import annotations
+
+import json
+import tempfile
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Callable
+
+from rdflib import Graph
+
+from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
+from wordlift_sdk.render import CleanupOptions, RenderOptions, clean_xhtml, render_html
+
+from .io import normalize_output_format, serialize_graph, write_output
+from .materialization import MaterializationPipeline
+
+
+class BatchGenerator:
+    """Processes a list of URLs to generate structured data outputs."""
+
+    def __init__(
+        self,
+        output_dir: Path,
+        output_format: str,
+        concurrency: str,
+        headed: bool,
+        timeout_ms: int,
+        wait_until: str,
+        max_xhtml_chars: int,
+        max_text_node_chars: int,
+        dataset_uri: str,
+        verbose: bool,
+    ) -> None:
+        self._output_dir = output_dir
+        self._output_format = output_format
+        self._concurrency = concurrency
+        self._dataset_uri = dataset_uri
+        self._verbose = verbose
+        self._headed = headed
+        self._timeout_ms = timeout_ms
+        self._wait_until = wait_until
+        self._max_xhtml_chars = max_xhtml_chars
+        self._max_text_node_chars = max_text_node_chars
+        self._yarrrml = YarrrmlPipeline()
+        self._materializer = MaterializationPipeline(self._yarrrml)
+
+    def generate(
+        self, urls: list[str], yarrrml: str, log: Callable[[str], None]
+    ) -> dict[str, object]:
+        if not urls:
+            raise RuntimeError("No URLs provided for generation.")
+
+        _, extension = normalize_output_format(self._output_format)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+
+        auto_concurrency = self._concurrency.strip().lower() == "auto"
+        if auto_concurrency:
+            min_workers = 2
+            max_workers = 12
+            current_workers = min(max_workers, max(min_workers, 4))
+        else:
+            try:
+                current_workers = int(self._concurrency)
+            except ValueError as exc:
+                raise RuntimeError("Concurrency must be an integer or 'auto'.") from exc
+            if current_workers <= 0:
+                raise RuntimeError("Concurrency must be greater than 0.")
+            min_workers = max_workers = current_workers
+
+        results: list[dict[str, object]] = []
+        errors: list[dict[str, str]] = []
+
+        with tempfile.TemporaryDirectory(prefix="structured-data-generate-") as tmp_dir:
+            tmp_root = Path(tmp_dir)
+            index = 0
+            total = len(urls)
+            if self._verbose:
+                log(f"Processing {total} URLs...")
+            from tqdm import tqdm
+
+            progress = tqdm(total=total, disable=not self._verbose)
+
+            def _process_url(url: str) -> dict[str, object]:
+                status_code = None
+                try:
+                    step_start = time.perf_counter()
+                    log(f"Start: {url}")
+                    render_options = RenderOptions(
+                        url=url,
+                        headless=not self._headed,
+                        timeout_ms=self._timeout_ms,
+                        wait_until=self._wait_until,
+                    )
+                    rendered = render_html(render_options)
+                    log(f"Rendered: {url} in {time.perf_counter() - step_start:.2f}s")
+                    status_code = getattr(rendered, "status_code", None)
+                    step_start = time.perf_counter()
+                    cleanup_options = CleanupOptions(
+                        max_xhtml_chars=self._max_xhtml_chars,
+                        max_text_node_chars=self._max_text_node_chars,
+                    )
+                    cleaned_xhtml = clean_xhtml(rendered.xhtml, cleanup_options)
+                    log(
+                        f"Cleaned XHTML: {url} in {time.perf_counter() - step_start:.2f}s"
+                    )
+                    basename = self._yarrrml.build_output_basename(url)
+                    xhtml_path = tmp_root / f"{basename}.xhtml"
+                    xhtml_path.write_text(cleaned_xhtml)
+                    workdir = tmp_root / f"work-{basename}"
+                    step_start = time.perf_counter()
+                    normalized_yarrrml, mappings = self._materializer.normalize(
+                        yarrrml,
+                        url,
+                        xhtml_path,
+                        target_type=None,
+                    )
+                    log(
+                        f"Normalized YARRRML: {url} in {time.perf_counter() - step_start:.2f}s"
+                    )
+                    step_start = time.perf_counter()
+                    jsonld_raw = self._materializer.materialize(
+                        normalized_yarrrml, xhtml_path, workdir, url=url
+                    )
+                    log(
+                        f"Materialized JSON-LD: {url} in {time.perf_counter() - step_start:.2f}s"
+                    )
+                    step_start = time.perf_counter()
+                    jsonld = self._materializer.postprocess(
+                        jsonld_raw,
+                        mappings,
+                        cleaned_xhtml,
+                        self._dataset_uri,
+                        url,
+                        target_type=None,
+                    )
+                    log(
+                        f"Postprocessed JSON-LD: {url} in {time.perf_counter() - step_start:.2f}s"
+                    )
+                    step_start = time.perf_counter()
+                    graph = Graph()
+                    graph.parse(data=json.dumps(jsonld), format="json-ld")
+                    self._yarrrml.ensure_no_blank_nodes(graph)
+                    output_path = self._output_dir / f"{basename}.{extension}"
+                    if self._output_format.lower() in {"jsonld", "json-ld"}:
+                        write_output(output_path, json.dumps(jsonld, indent=2))
+                    else:
+                        serialized = serialize_graph(graph, self._output_format)
+                        write_output(output_path, serialized)
+                    log(
+                        f"Wrote output: {url} in {time.perf_counter() - step_start:.2f}s"
+                    )
+                    return {
+                        "ok": True,
+                        "url": url,
+                        "status_code": status_code,
+                        "output": str(output_path),
+                    }
+                except Exception as exc:
+                    log(f"Failed: {url} with {exc}")
+                    return {
+                        "ok": False,
+                        "url": url,
+                        "status_code": status_code,
+                        "error": str(exc),
+                    }
+
+            while index < total:
+                batch = urls[index : index + current_workers]
+                if not batch:
+                    break
+                batch_results: list[dict[str, object]] = []
+                with ThreadPoolExecutor(max_workers=current_workers) as executor:
+                    futures = {executor.submit(_process_url, url): url for url in batch}
+                    for future in as_completed(futures):
+                        result = future.result()
+                        batch_results.append(result)
+                        progress.update(1)
+                        if not result.get("ok"):
+                            errors.append(
+                                {
+                                    "url": str(result.get("url")),
+                                    "error": str(result.get("error")),
+                                }
+                            )
+                results.extend(batch_results)
+
+                if auto_concurrency:
+                    buckets = {
+                        self._status_bucket(item.get("status_code"))
+                        for item in batch_results
+                    }
+                    if buckets & {"throttle", "server_error", "error"}:
+                        current_workers = max(min_workers, current_workers - 1)
+                    elif buckets == {"ok"}:
+                        current_workers = min(max_workers, current_workers + 1)
+                index += len(batch)
+            progress.close()
+
+        return {
+            "format": self._output_format,
+            "output_dir": str(self._output_dir),
+            "total": len(urls),
+            "success": sum(1 for item in results if item.get("ok")),
+            "failed": sum(1 for item in results if not item.get("ok")),
+            "errors": errors,
+        }
+
+    def _status_bucket(self, status_code: int | None) -> str:
+        if status_code is None:
+            return "error"
+        if status_code == 429:
+            return "throttle"
+        if 500 <= status_code < 600:
+            return "server_error"
+        if 200 <= status_code < 400:
+            return "ok"
+        return "client_error"
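For orientation, a minimal sketch of how the new BatchGenerator might be driven. The URLs, dataset URI, and YARRRML mapping below are placeholders, print stands in for a real logger, and the wait_until value is an assumption that must match whatever the render layer accepts:

from pathlib import Path

from wordlift_sdk.structured_data.batch import BatchGenerator

generator = BatchGenerator(
    output_dir=Path("out"),
    output_format="jsonld",
    concurrency="auto",  # or a fixed worker count such as "4"
    headed=False,  # render with a headless browser
    timeout_ms=30_000,
    wait_until="load",  # assumed value
    max_xhtml_chars=500_000,
    max_text_node_chars=2_000,
    dataset_uri="https://data.example.org/dataset",  # placeholder
    verbose=True,
)
summary = generator.generate(
    urls=["https://example.org/page"],  # placeholder URL
    yarrrml="...",  # YARRRML mapping document, elided here
    log=print,
)
print(summary["success"], summary["failed"], summary["errors"])

Note the adaptive concurrency: with concurrency="auto" the generator starts at 4 workers and stays between 2 and 12, dropping one worker after any batch that contained a throttled (429), server-error (5xx), or failed request, and adding one only when every result in the batch came back 2xx/3xx.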
wordlift_sdk/structured_data/constants.py

@@ -0,0 +1 @@
+DEFAULT_BASE_URL = "https://api.wordlift.io"
wordlift_sdk/structured_data/dataset_resolver.py

@@ -0,0 +1,32 @@
+"""Dataset resolution helpers."""
+
+from __future__ import annotations
+
+from wordlift_client import ApiClient
+
+from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
+
+from .engine import (
+    _build_agent_client,
+    _build_client,
+    get_dataset_uri,
+    get_dataset_uri_async,
+)
+
+
+class DatasetResolver:
+    """Resolves dataset URIs and builds API clients."""
+
+    def build_client(self, api_key: str, base_url: str) -> ApiClient:
+        return _build_client(api_key, base_url)
+
+    def build_agent_client(self, api_key: str) -> ApiClient:
+        return _build_agent_client(api_key)
+
+    async def get_dataset_uri_async(
+        self, api_key: str, base_url: str = DEFAULT_BASE_URL
+    ) -> str:
+        return await get_dataset_uri_async(api_key, base_url)
+
+    def get_dataset_uri(self, api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
+        return get_dataset_uri(api_key, base_url)
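A minimal sketch of the resolver in use. The API key is a placeholder; base_url defaults to DEFAULT_BASE_URL (https://api.wordlift.io) when omitted:

from wordlift_sdk.structured_data.dataset_resolver import DatasetResolver

resolver = DatasetResolver()
# Synchronous lookup; get_dataset_uri_async offers the same call for async code.
dataset_uri = resolver.get_dataset_uri("YOUR_API_KEY")  # placeholder key
print(dataset_uri)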
wordlift_sdk/structured_data/debug.py

@@ -0,0 +1,23 @@
+"""Debug output helpers."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Callable
+
+
+def echo_debug(debug_path: Path, log: Callable[[str], None]) -> None:
+    if not debug_path.exists():
+        return
+    try:
+        payload = json.loads(debug_path.read_text())
+    except Exception:
+        log(f"Debug output written to {debug_path}")
+        return
+    prompt = payload.get("prompt", "")
+    response = payload.get("response")
+    log("--- Agent prompt ---")
+    log(prompt)
+    log("--- Agent response ---")
+    log(json.dumps(response, indent=2))
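A small sketch of echo_debug in use. The debug file path is an assumption; the function expects a JSON document with "prompt" and "response" keys and falls back to logging the file location when the payload is not valid JSON:

from pathlib import Path

from wordlift_sdk.structured_data.debug import echo_debug

# Assumed path: wherever the engine wrote its agent debug payload.
echo_debug(Path("debug.json"), log=print)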