wordlift-sdk 2.9.1__py3-none-any.whl → 2.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +1 -1
- wordlift_sdk/render/__init__.py +30 -0
- wordlift_sdk/render/browser.py +132 -0
- wordlift_sdk/render/cleanup_options.py +24 -0
- wordlift_sdk/render/html_renderer.py +86 -0
- wordlift_sdk/render/render_options.py +21 -0
- wordlift_sdk/render/rendered_page.py +13 -0
- wordlift_sdk/render/xhtml_cleaner.py +126 -0
- wordlift_sdk/structured_data/__init__.py +27 -0
- wordlift_sdk/structured_data/agent.py +49 -0
- wordlift_sdk/structured_data/agent_generator.py +12 -0
- wordlift_sdk/structured_data/batch.py +220 -0
- wordlift_sdk/structured_data/constants.py +1 -0
- wordlift_sdk/structured_data/dataset_resolver.py +32 -0
- wordlift_sdk/structured_data/debug.py +23 -0
- wordlift_sdk/structured_data/engine.py +2875 -0
- wordlift_sdk/structured_data/inputs.py +58 -0
- wordlift_sdk/structured_data/io.py +44 -0
- wordlift_sdk/structured_data/materialization.py +70 -0
- wordlift_sdk/structured_data/models.py +48 -0
- wordlift_sdk/structured_data/orchestrator.py +194 -0
- wordlift_sdk/structured_data/rendering.py +43 -0
- wordlift_sdk/structured_data/schema_guide.py +17 -0
- wordlift_sdk/structured_data/structured_data_engine.py +58 -0
- wordlift_sdk/structured_data/validation.py +31 -0
- wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
- wordlift_sdk/url_source/__init__.py +7 -2
- wordlift_sdk/validation/__init__.py +7 -0
- wordlift_sdk/validation/generator.py +446 -0
- wordlift_sdk/validation/shacl.py +205 -0
- wordlift_sdk/validation/shacls/__init__.py +1 -0
- wordlift_sdk/validation/shacls/google-article.ttl +148 -0
- wordlift_sdk/validation/shacls/google-book.ttl +660 -0
- wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
- wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
- wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
- wordlift_sdk/validation/shacls/google-course.ttl +43 -0
- wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
- wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
- wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
- wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
- wordlift_sdk/validation/shacls/google-event.ttl +46 -0
- wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
- wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
- wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
- wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
- wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
- wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
- wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
- wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
- wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
- wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
- wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
- wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
- wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
- wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
- wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
- wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
- wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
- wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
- wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
- wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
- wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
- wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
- wordlift_sdk/validation/shacls/google-video.ttl +149 -0
- wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/METADATA +3 -1
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/RECORD +69 -5
- {wordlift_sdk-2.9.1.dist-info → wordlift_sdk-2.10.2.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Input resolution helpers for structured data workflows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def is_url(value: str) -> bool:
    """Return True when *value* parses with an ``http`` or ``https`` scheme."""
    scheme = urlparse(value).scheme
    return scheme == "http" or scheme == "https"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def urls_from_sitemap(source: str) -> list[str]:
    """Return the URLs listed in the sitemap at *source*.

    The sitemap is loaded with ``advertools.sitemap_to_df``. URLs are read
    from the ``loc`` column, else the ``url`` column, else the first column
    of the resulting frame. Missing values and empty strings are dropped in
    every case.

    Raises:
        RuntimeError: if ``advertools`` is not installed.
    """
    try:
        import advertools as adv
    except ImportError as exc:  # pragma: no cover - runtime dependency
        raise RuntimeError(
            "advertools is required. Install with: pip install advertools"
        ) from exc
    df = adv.sitemap_to_df(source)
    if df is None or df.empty:
        return []
    # Prefer a well-known URL column; otherwise fall back to the first one.
    column = next((name for name in ("loc", "url") if name in df.columns), None)
    series = df[column] if column is not None else df.iloc[:, 0]
    # Filter empty strings on the fallback path too, so both paths agree.
    return [value for value in series.dropna().astype(str).tolist() if value]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def resolve_input_urls(value: str) -> list[str]:
    """Resolve *value* into a list of page URLs.

    Resolution order:

    1. If *value* names an existing file, it is parsed as a sitemap file.
    2. If *value* is an http(s) URL, it is first tried as a sitemap; when
       that yields nothing (or fails), the URL itself is returned as a
       single page.

    Raises:
        RuntimeError: if a sitemap file contains no URLs, or if *value* is
            neither an existing file nor an http(s) URL.
    """
    try:
        # is_file() rather than exists(): Path("") is the current directory,
        # which "exists" but can never be a sitemap file.
        is_local_file = Path(value).is_file()
    except OSError:
        # e.g. a very long URL exceeding the filesystem name limit.
        is_local_file = False

    if is_local_file:
        urls = urls_from_sitemap(str(Path(value)))
        if not urls:
            raise RuntimeError("No URLs found in sitemap file.")
        return urls

    if is_url(value):
        try:
            urls = urls_from_sitemap(value)
            if urls:
                return urls
        except Exception:
            # Deliberate best-effort: a URL that cannot be read as a sitemap
            # is treated as a single page URL below.
            pass
        return [value]

    raise RuntimeError("INPUT must be a sitemap URL/path or a page URL.")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def filter_urls(urls: list[str], regex: str, max_pages: int | None) -> list[str]:
    """Keep the URLs that match *regex*, optionally truncated to *max_pages*.

    Matching uses ``re.search``, so the pattern may match anywhere in the URL.

    Raises:
        RuntimeError: if no URL matches the pattern.
    """
    import re

    matches = re.compile(regex).search
    selected = list(filter(matches, urls))
    if not selected:
        raise RuntimeError("No URLs matched the provided regex.")
    return selected if max_pages is None else selected[:max_pages]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Output utilities for structured data workflows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rdflib import Graph
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Maps a user-facing format name to a pair of strings. The first element is
# passed to rdflib as ``format=`` by serialize_graph; the second looks like a
# file extension (assumption - confirm against callers).
_OUTPUT_FORMATS: dict[str, tuple[str, str]] = {
    "ttl": ("turtle", "ttl"),
    "jsonld": ("json-ld", "jsonld"),
    # Hyphenated alias; hidden from the "Choose from" error message.
    "json-ld": ("json-ld", "jsonld"),
    "rdf": ("xml", "rdf"),
    "nt": ("nt", "nt"),
    "nq": ("nquads", "nq"),
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def write_output(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating parent directories.

    The encoding is pinned so output does not depend on the platform's
    locale default (which would fail on non-ASCII content under e.g. cp1252).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def default_output_paths(out_dir: Path, base_name: str) -> tuple[Path, Path]:
    """Return the default ``(<base>.jsonld, <base>.yarrml)`` paths in *out_dir*."""
    return (
        out_dir / f"{base_name}.jsonld",
        out_dir / f"{base_name}.yarrml",
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def normalize_output_format(value: str) -> tuple[str, str]:
    """Look up *value* (trimmed, case-insensitive) in ``_OUTPUT_FORMATS``.

    Raises:
        RuntimeError: if the format is not supported; the message lists the
            supported names, omitting hyphenated aliases.
    """
    entry = _OUTPUT_FORMATS.get(value.strip().lower())
    if entry is not None:
        return entry
    names = sorted(name for name in _OUTPUT_FORMATS if "-" not in name)
    supported = ", ".join(names)
    raise RuntimeError(f"Unsupported format '{value}'. Choose from: {supported}.")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def serialize_graph(graph: Graph, output_format: str) -> str:
    """Serialize *graph* in the requested format and return it as text.

    Bytes returned by ``graph.serialize`` are decoded as UTF-8.
    """
    rdflib_format = normalize_output_format(output_format)[0]
    payload = graph.serialize(format=rdflib_format)
    return payload.decode("utf-8") if isinstance(payload, bytes) else payload
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Materialization pipeline for YARRRML -> JSON-LD."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MaterializationPipeline:
    """Normalizes mappings, materializes JSON-LD, and post-processes output.

    Every step delegates to the wrapped ``YarrrmlPipeline``; ``run`` chains
    the three steps in order.
    """

    def __init__(self, pipeline: YarrrmlPipeline | None = None) -> None:
        """Use *pipeline* if given, otherwise create a ``YarrrmlPipeline``."""
        self._pipeline = pipeline or YarrrmlPipeline()

    def normalize(
        self, yarrrml: str, url: str, xhtml_path: Path, target_type: str | None
    ) -> tuple[str, list[dict]]:
        """Delegate to ``normalize_mappings`` on the wrapped pipeline."""
        delegate = self._pipeline.normalize_mappings
        return delegate(yarrrml, url, xhtml_path, target_type=target_type)

    def materialize(
        self, normalized_yarrrml: str, xhtml_path: Path, workdir: Path, url: str
    ) -> dict:
        """Delegate to ``materialize_jsonld`` on the wrapped pipeline."""
        delegate = self._pipeline.materialize_jsonld
        return delegate(normalized_yarrrml, xhtml_path, workdir, url=url)

    def postprocess(
        self,
        jsonld_raw: dict,
        mappings: list[dict],
        cleaned_xhtml: str,
        dataset_uri: str,
        url: str,
        target_type: str | None,
    ) -> dict:
        """Delegate to ``postprocess_jsonld`` on the wrapped pipeline."""
        delegate = self._pipeline.postprocess_jsonld
        return delegate(
            jsonld_raw,
            mappings,
            cleaned_xhtml,
            dataset_uri,
            url,
            target_type=target_type,
        )

    def run(
        self,
        yarrrml: str,
        url: str,
        cleaned_xhtml: str,
        dataset_uri: str,
        xhtml_path: Path,
        workdir: Path,
        target_type: str | None,
    ) -> tuple[dict, list[dict]]:
        """Run normalize -> materialize -> postprocess.

        Returns:
            The post-processed JSON-LD and the mappings produced by the
            normalization step.
        """
        normalized, mappings = self.normalize(
            yarrrml, url, xhtml_path, target_type=target_type
        )
        raw = self.materialize(normalized, xhtml_path, workdir, url=url)
        final = self.postprocess(
            raw,
            mappings,
            cleaned_xhtml,
            dataset_uri,
            url,
            target_type=target_type,
        )
        return final, mappings
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Request/response models for structured data workflows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class CreateRequest:
    """Inputs for the single-URL ``CreateWorkflow``."""

    # Page to render and generate structured data for.
    url: str
    target_type: str
    # Root for outputs; the workflow also uses ``<output_dir>/.structured-data``.
    output_dir: Path
    # Stem used for the default output file names.
    base_name: str
    # Explicit output locations; ``None`` selects the defaults.
    jsonld_path: Path | None
    yarrml_path: Path | None
    # The workflow raises when the key is missing.
    api_key: str | None
    # ``None`` falls back to ``DEFAULT_BASE_URL``.
    base_url: str | None
    debug: bool
    # Rendering is headless when this is False.
    headed: bool
    timeout_ms: int
    max_retries: int
    quality_check: bool
    max_xhtml_chars: int
    max_text_node_chars: int
    max_nesting_depth: int
    verbose: bool
    # When True the JSON-LD output is validated after being written.
    validate: bool
    wait_until: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class GenerateRequest:
    """Inputs for the batch ``GenerateWorkflow``."""

    # Sitemap path/URL or a single page URL.
    input_value: str
    # YARRRML mapping file; the workflow raises if it does not exist.
    yarrrml_path: Path
    # Pattern used to filter the resolved URLs.
    regex: str
    output_dir: Path
    output_format: str
    concurrency: str
    # The workflow raises when no dataset URI can be resolved from the key.
    api_key: str | None
    # ``None`` falls back to ``DEFAULT_BASE_URL``.
    base_url: str | None
    headed: bool
    timeout_ms: int
    wait_until: str
    max_xhtml_chars: int
    max_text_node_chars: int
    # Upper bound on URLs kept after filtering; ``None`` keeps all.
    max_pages: int | None
    verbose: bool
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Orchestration for structured data workflows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
from wordlift_sdk.structured_data.engine import (
|
|
9
|
+
StructuredDataOptions,
|
|
10
|
+
StructuredDataResult,
|
|
11
|
+
)
|
|
12
|
+
from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
|
|
13
|
+
from wordlift_sdk.structured_data.structured_data_engine import StructuredDataEngine
|
|
14
|
+
from wordlift_sdk.structured_data.yarrrml_pipeline import YarrrmlPipeline
|
|
15
|
+
|
|
16
|
+
from .agent import AgentGenerator
|
|
17
|
+
from .batch import BatchGenerator
|
|
18
|
+
from .debug import echo_debug
|
|
19
|
+
from .io import default_output_paths, write_output
|
|
20
|
+
from .inputs import filter_urls, resolve_input_urls
|
|
21
|
+
from .models import CreateRequest, GenerateRequest
|
|
22
|
+
from .rendering import RenderPipeline
|
|
23
|
+
from .validation import ValidationService
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CreateWorkflow:
    """Workflow for generating structured data from a single URL."""

    def __init__(
        self,
        agent: AgentGenerator | None = None,
        renderer: RenderPipeline | None = None,
        validator: ValidationService | None = None,
        engine: StructuredDataEngine | None = None,
    ) -> None:
        """Store collaborators, creating defaults for those not supplied.

        The renderer is left as ``None`` when not supplied; ``run`` then
        builds one from the request's rendering settings.
        """
        self._agent = agent or AgentGenerator()
        self._renderer = renderer
        self._validator = validator or ValidationService()
        self._engine = engine or StructuredDataEngine()
        self._yarrrml = YarrrmlPipeline()

    def run(
        self, request: CreateRequest, log: Callable[[str], None]
    ) -> StructuredDataResult:
        """Render the page, generate YARRRML + JSON-LD, and write both files.

        Args:
            request: Settings for this run.
            log: Callback receiving progress and report messages.

        Returns:
            The generated JSON-LD and reusable YARRRML with their file names.

        Raises:
            RuntimeError: if ``request.api_key`` is missing.
        """
        if not request.api_key:
            raise RuntimeError(
                "WORDLIFT_KEY is required (or set wordlift.api_key in config)."
            )
        base_url = request.base_url or DEFAULT_BASE_URL
        dataset_uri = self._engine.get_dataset_uri(request.api_key, base_url=base_url)

        renderer = self._renderer or RenderPipeline(
            headed=request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
        )

        rendered, cleaned_xhtml = renderer.render(request.url, log)

        options = StructuredDataOptions(
            url=request.url,
            target_type=request.target_type,
            dataset_uri=dataset_uri,
            headless=not request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_retries=request.max_retries,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
            max_nesting_depth=request.max_nesting_depth,
            verbose=request.verbose,
        )

        workdir = request.output_dir / ".structured-data"
        debug_path = workdir / "agent_debug.json"
        try:
            log("Generating YARRRML mapping and JSON-LD...")
            yarrml, jsonld = self._agent.generate(
                options.url,
                rendered.html,
                rendered.xhtml,
                cleaned_xhtml,
                request.api_key,
                options.dataset_uri,
                options.target_type,
                workdir,
                debug=request.debug,
                max_retries=options.max_retries,
                max_nesting_depth=options.max_nesting_depth,
                quality_check=request.quality_check,
                log=log,
            )
        except Exception:
            # Surface the debug file on failure too, then propagate.
            if request.debug:
                echo_debug(debug_path, log)
            raise
        if request.debug:
            echo_debug(debug_path, log)

        # Fill in only the path(s) the caller did not supply, so an explicit
        # path is never silently replaced by a default.
        jsonld_path = request.jsonld_path
        yarrml_path = request.yarrml_path
        if jsonld_path is None or yarrml_path is None:
            default_jsonld, default_yarrml = default_output_paths(
                request.output_dir, request.base_name
            )
            if jsonld_path is None:
                jsonld_path = default_jsonld
            if yarrml_path is None:
                yarrml_path = default_yarrml

        write_output(jsonld_path, json.dumps(jsonld, indent=2))
        yarrml = self._yarrrml.make_reusable_yarrrml(yarrml, request.url)
        write_output(yarrml_path, yarrml)

        if request.verbose:
            mapping_validation_path = workdir / "mapping.validation.json"
            if mapping_validation_path.exists():
                try:
                    validation_payload = json.loads(mapping_validation_path.read_text())
                except Exception:
                    validation_payload = {}
                # Diagnostic only: tolerate an unexpected payload shape
                # instead of failing after the outputs were written.
                if not isinstance(validation_payload, dict):
                    validation_payload = {}
                for warning in validation_payload.get("warnings", []):
                    if isinstance(warning, str) and "reviewRating dropped" in warning:
                        log(warning)

        if request.validate:
            log("Validating JSON-LD output...")
            report_text = self._validator.validate(
                jsonld_path, request.target_type, workdir
            )
            log(report_text)

        return StructuredDataResult(
            jsonld=jsonld,
            yarrml=yarrml,
            jsonld_filename=str(jsonld_path),
            yarrml_filename=str(yarrml_path),
        )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class GenerateWorkflow:
    """Workflow for generating structured data in batch from YARRRML."""

    def __init__(self, engine: StructuredDataEngine | None = None) -> None:
        """Use *engine* if given, otherwise create a ``StructuredDataEngine``."""
        self._engine = engine or StructuredDataEngine()

    def run(
        self, request: GenerateRequest, log: Callable[[str], None]
    ) -> dict[str, object]:
        """Resolve the input URLs and run the batch generator over them.

        Returns:
            The batch summary, with the original input stored under ``"input"``.

        Raises:
            RuntimeError: if no dataset URI can be resolved (missing API key)
                or the YARRRML file does not exist.
        """
        endpoint = request.base_url or DEFAULT_BASE_URL
        dataset_uri = None
        if request.api_key:
            dataset_uri = self._engine.get_dataset_uri(
                request.api_key, base_url=endpoint
            )
        if not dataset_uri:
            raise RuntimeError(
                "WORDLIFT_KEY is required (or set wordlift.api_key in config)."
            )

        mapping_file = request.yarrrml_path
        if not mapping_file.exists():
            raise RuntimeError(f"YARRRML file not found: {request.yarrrml_path}")
        mapping_text = mapping_file.read_text()

        targets = filter_urls(
            resolve_input_urls(request.input_value),
            request.regex,
            request.max_pages,
        )

        generator = BatchGenerator(
            output_dir=request.output_dir,
            output_format=request.output_format,
            concurrency=request.concurrency,
            headed=request.headed,
            timeout_ms=request.timeout_ms,
            wait_until=request.wait_until,
            max_xhtml_chars=request.max_xhtml_chars,
            max_text_node_chars=request.max_text_node_chars,
            dataset_uri=dataset_uri,
            verbose=request.verbose,
        )
        report = generator.generate(targets, mapping_text, log)
        report["input"] = request.input_value
        return report
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def resolve_api_key_from_context(
    ctx_config: object | None, env_key: str = "WORDLIFT_KEY"
) -> str | None:
    """Return the API key from *ctx_config*, else from the environment.

    The config is queried with ``ctx_config.get("wordlift.api_key")``. If the
    config is absent, has no usable value, or the lookup fails, the
    environment variable named by *env_key* is used instead.

    Returns:
        The API key, or ``None`` when neither source provides one.
    """
    import os

    if ctx_config is not None:
        try:
            value = ctx_config.get("wordlift.api_key")
        except Exception:
            # A config without a working .get() must not block the
            # environment fallback.
            value = None
        if value:
            return value
    return os.environ.get(env_key)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Rendering and cleanup pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
from wordlift_sdk.render import CleanupOptions, RenderOptions, clean_xhtml, render_html
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RenderPipeline:
    """Renders a page and cleans XHTML for structured data generation."""

    def __init__(
        self,
        headed: bool,
        timeout_ms: int,
        wait_until: str,
        max_xhtml_chars: int,
        max_text_node_chars: int,
    ) -> None:
        """Store the rendering and cleanup settings used by ``render``."""
        self._headed = headed
        self._timeout_ms = timeout_ms
        self._wait_until = wait_until
        self._max_xhtml_chars = max_xhtml_chars
        self._max_text_node_chars = max_text_node_chars

    def render(self, url: str, log: Callable[[str], None]) -> tuple[object, str]:
        """Render *url* and return ``(rendered_page, cleaned_xhtml)``.

        Progress messages are reported through *log* before each stage.
        """
        log("Rendering page with Playwright...")
        page = render_html(
            RenderOptions(
                url=url,
                headless=not self._headed,
                timeout_ms=self._timeout_ms,
                wait_until=self._wait_until,
            )
        )

        log("Cleaning XHTML for prompt usage...")
        limits = CleanupOptions(
            max_xhtml_chars=self._max_xhtml_chars,
            max_text_node_chars=self._max_text_node_chars,
        )
        return page, clean_xhtml(page.xhtml, limits)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Schema.org shape and property guidance."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .engine import normalize_type, shape_specs_for_type
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SchemaGuide:
    """Builds schema.org property guides and shape specs."""

    def normalize_type(self, value: str) -> str:
        """Delegate to the engine-level ``normalize_type`` function."""
        normalized = normalize_type(value)
        return normalized

    def shape_specs_for_type(self, type_name: str | None) -> list[str]:
        """Return shape specs for *type_name*; empty list when it is falsy."""
        return shape_specs_for_type(type_name) if type_name else []
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""OOP facade for structured data utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from wordlift_sdk.structured_data.constants import DEFAULT_BASE_URL
|
|
6
|
+
|
|
7
|
+
from .agent_generator import AgentGenerator
|
|
8
|
+
from .dataset_resolver import DatasetResolver
|
|
9
|
+
from .schema_guide import SchemaGuide
|
|
10
|
+
from .yarrrml_pipeline import YarrrmlPipeline
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StructuredDataEngine:
    """OOP facade for structured data utilities.

    Each method forwards to one of four collaborators: the dataset resolver,
    the schema guide, the YARRRML pipeline, or the agent generator.
    """

    def __init__(
        self,
        dataset: DatasetResolver | None = None,
        schema: SchemaGuide | None = None,
        yarrrml: YarrrmlPipeline | None = None,
        agent: AgentGenerator | None = None,
    ) -> None:
        """Store the collaborators, creating defaults for those not supplied."""
        self.dataset = dataset or DatasetResolver()
        self.schema = schema or SchemaGuide()
        self.yarrrml = yarrrml or YarrrmlPipeline()
        self.agent = agent or AgentGenerator()

    # -- dataset resolver ----------------------------------------------------

    def get_dataset_uri(self, api_key: str, base_url: str = DEFAULT_BASE_URL) -> str:
        """Forward to ``self.dataset.get_dataset_uri``."""
        return self.dataset.get_dataset_uri(api_key, base_url)

    async def get_dataset_uri_async(
        self, api_key: str, base_url: str = DEFAULT_BASE_URL
    ) -> str:
        """Forward to ``self.dataset.get_dataset_uri_async`` and await it."""
        return await self.dataset.get_dataset_uri_async(api_key, base_url)

    # -- agent ---------------------------------------------------------------

    def generate_from_agent(self, *args, **kwargs):
        """Forward to ``self.agent.generate_from_agent``."""
        return self.agent.generate_from_agent(*args, **kwargs)

    # -- YARRRML pipeline ----------------------------------------------------

    def normalize_yarrrml_mappings(self, *args, **kwargs):
        """Forward to ``self.yarrrml.normalize_mappings``."""
        return self.yarrrml.normalize_mappings(*args, **kwargs)

    def materialize_yarrrml_jsonld(self, *args, **kwargs):
        """Forward to ``self.yarrrml.materialize_jsonld``."""
        return self.yarrrml.materialize_jsonld(*args, **kwargs)

    def postprocess_jsonld(self, *args, **kwargs):
        """Forward to ``self.yarrrml.postprocess_jsonld``."""
        return self.yarrrml.postprocess_jsonld(*args, **kwargs)

    def make_reusable_yarrrml(self, *args, **kwargs):
        """Forward to ``self.yarrrml.make_reusable_yarrrml``."""
        return self.yarrrml.make_reusable_yarrrml(*args, **kwargs)

    def ensure_no_blank_nodes(self, *args, **kwargs):
        """Forward to ``self.yarrrml.ensure_no_blank_nodes``."""
        return self.yarrrml.ensure_no_blank_nodes(*args, **kwargs)

    def build_output_basename(self, *args, **kwargs):
        """Forward to ``self.yarrrml.build_output_basename``."""
        return self.yarrrml.build_output_basename(*args, **kwargs)

    # -- schema guide --------------------------------------------------------

    def shape_specs_for_type(self, *args, **kwargs):
        """Forward to ``self.schema.shape_specs_for_type``."""
        return self.schema.shape_specs_for_type(*args, **kwargs)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""SHACL validation helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from wordlift_sdk.structured_data.schema_guide import SchemaGuide
|
|
9
|
+
from wordlift_sdk.validation.shacl import validate_file
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ValidationService:
    """Validates JSON-LD outputs with SHACL shapes."""

    def __init__(self, schema: SchemaGuide | None = None) -> None:
        """Use *schema* if given, otherwise create a ``SchemaGuide``."""
        self._schema = schema or SchemaGuide()

    def validate(self, jsonld_path: Path, target_type: str, workdir: Path) -> str:
        """Validate *jsonld_path* and persist a summary of the result.

        The shape specs for *target_type* are passed to ``validate_file``.
        A summary is written to ``<workdir>/jsonld.validation.json``.

        Returns:
            The validation report text.
        """
        shape_specs = self._schema.shape_specs_for_type(target_type)
        result = validate_file(str(jsonld_path), shape_specs=shape_specs)
        summary = {
            "conforms": result.conforms,
            "warning_count": result.warning_count,
            "report_text": result.report_text,
        }
        # Do not rely on an earlier step having created the work directory.
        workdir.mkdir(parents=True, exist_ok=True)
        (workdir / "jsonld.validation.json").write_text(
            json.dumps(summary, indent=2), encoding="utf-8"
        )
        return result.report_text
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""YARRRML to JSON-LD pipeline helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .engine import (
|
|
6
|
+
build_output_basename,
|
|
7
|
+
ensure_no_blank_nodes,
|
|
8
|
+
make_reusable_yarrrml,
|
|
9
|
+
materialize_yarrrml_jsonld,
|
|
10
|
+
normalize_yarrrml_mappings,
|
|
11
|
+
postprocess_jsonld,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class YarrrmlPipeline:
    """YARRRML -> JSON-LD pipeline helpers.

    Each method forwards its arguments unchanged to the engine-level function
    named in its docstring.
    """

    def normalize_mappings(self, *args, **kwargs):
        """Forward to ``normalize_yarrrml_mappings``."""
        return normalize_yarrrml_mappings(*args, **kwargs)

    def materialize_jsonld(self, *args, **kwargs):
        """Forward to ``materialize_yarrrml_jsonld``."""
        return materialize_yarrrml_jsonld(*args, **kwargs)

    def postprocess_jsonld(self, *args, **kwargs):
        """Forward to the engine-level ``postprocess_jsonld``."""
        return postprocess_jsonld(*args, **kwargs)

    def ensure_no_blank_nodes(self, *args, **kwargs):
        """Forward to the engine-level ``ensure_no_blank_nodes``."""
        return ensure_no_blank_nodes(*args, **kwargs)

    def make_reusable_yarrrml(self, *args, **kwargs):
        """Forward to the engine-level ``make_reusable_yarrrml``."""
        return make_reusable_yarrrml(*args, **kwargs)

    def build_output_basename(self, *args, **kwargs):
        """Forward to the engine-level ``build_output_basename``."""
        return build_output_basename(*args, **kwargs)
|
|
@@ -1,6 +1,11 @@
|
|
|
1
|
+
__all__ = [
|
|
2
|
+
"Url",
|
|
3
|
+
"UrlSource",
|
|
4
|
+
"GoogleSheetsUrlSource",
|
|
5
|
+
"ListUrlSource",
|
|
6
|
+
"SitemapUrlSource",
|
|
7
|
+
]
|
|
1
8
|
from .google_sheets_url_source import GoogleSheetsUrlSource
|
|
2
9
|
from .list_url_source import ListUrlSource
|
|
3
10
|
from .sitemap_url_source import SitemapUrlSource
|
|
4
11
|
from .url_source import UrlSource, Url
|
|
5
|
-
|
|
6
|
-
__all__ = ["Url", "UrlSource", "GoogleSheetsUrlSource", "ListUrlSource", "SitemapUrlSource"]
|