wordlift-sdk 2.9.1__tar.gz → 2.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/PKG-INFO +1 -1
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/pyproject.toml +1 -1
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/__init__.py +1 -1
- wordlift_sdk-2.10.1/wordlift_sdk/render/__init__.py +30 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/browser.py +132 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/cleanup_options.py +24 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/html_renderer.py +86 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/render_options.py +21 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/rendered_page.py +13 -0
- wordlift_sdk-2.10.1/wordlift_sdk/render/xhtml_cleaner.py +126 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/__init__.py +27 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/agent.py +49 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/agent_generator.py +12 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/batch.py +220 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/constants.py +1 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/dataset_resolver.py +32 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/debug.py +23 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/engine.py +2875 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/inputs.py +58 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/io.py +44 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/materialization.py +70 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/models.py +48 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/orchestrator.py +194 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/rendering.py +43 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/schema_guide.py +17 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/structured_data_engine.py +58 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/validation.py +31 -0
- wordlift_sdk-2.10.1/wordlift_sdk/structured_data/yarrrml_pipeline.py +34 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/__init__.py +7 -2
- wordlift_sdk-2.10.1/wordlift_sdk/validation/__init__.py +7 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/generator.py +446 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacl.py +205 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/__init__.py +1 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-article.ttl +148 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-book.ttl +660 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-breadcrumb.ttl +33 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-carousel.ttl +37 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-carousels-beta.ttl +291 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-course.ttl +43 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-dataset.ttl +146 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-discussion-forum.ttl +247 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-education-qa.ttl +75 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-employer-rating.ttl +40 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-event.ttl +46 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-factcheck.ttl +86 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-faqpage.ttl +38 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-image-license-metadata.ttl +93 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-job-posting.ttl +74 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-local-business.ttl +483 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-loyalty-program.ttl +61 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-math-solvers.ttl +63 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-merchant-listing.ttl +435 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-movie.ttl +44 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-organization.ttl +180 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-paywalled-content.ttl +34 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-product-snippet.ttl +121 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-product-variants.ttl +64 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-profile-page.ttl +130 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-qapage.ttl +195 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-recipe.ttl +201 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-return-policy.ttl +122 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-review-snippet.ttl +87 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-shipping-policy.ttl +606 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-software-app.ttl +40 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-speakable.ttl +20 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-vacation-rental.ttl +278 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/google-video.ttl +149 -0
- wordlift_sdk-2.10.1/wordlift_sdk/validation/shacls/schemaorg-grammar.ttl +20540 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/README.md +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/client/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/client/client_configuration_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/configuration/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/configuration/configuration_provider.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/configuration/get_config_value.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/container/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/container/application_container.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/deprecated/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/entity/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/entity/enrich.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/entity/patch.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/google_search_console/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/google_sheets/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/google_sheets/google_sheets_lookup.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graph/graph_bag.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graph/ttl_liquid/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/client/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/client/client.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/client/factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/client/gql_client_provider.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_by_type.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_embedding_value.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_top_query.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_url_id.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_url_iri.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/query.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/utils/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/utils/query/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/utils/query/entity_top_query.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/id_generator/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/id_generator/id_generator.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/id_generator/id_generator_interface.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/internal_link/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/internal_link/utils.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/entity.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/entity_store.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/entity_store_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/relation/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/kg/relation/relation_service.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/main.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/namespace/SDO.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/namespace/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/notebook/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/notebook/install_if_missing.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/context.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/entity_patch/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/entity_patch/entity_patch.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/graph/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/graph/graph_queue.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/load_override_class.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/protocol/web_page_import_protocol.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/google_sheets_url_source.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/list_url_source.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/new_or_changed_url_source.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/sitemap_url_source.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/url_source.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/url_source/url_source_input.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/create_dataframe_of_url_iri.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/create_entity_patch_request.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/delayed.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/get_me.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/html_converter.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/utils/import_url.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/entity_gaps/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/create_or_update_entities_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/kg_import_workflow.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/patch_entities_factory.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/url_handler/__init__.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/url_handler/default_url_handler.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/url_handler/url_handler.py +0 -0
- {wordlift_sdk-2.9.1 → wordlift_sdk-2.10.1}/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +0 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Render and XHTML cleanup utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .cleanup_options import CleanupOptions
|
|
6
|
+
from .html_renderer import HtmlRenderer
|
|
7
|
+
from .render_options import RenderOptions
|
|
8
|
+
from .rendered_page import RenderedPage
|
|
9
|
+
from .xhtml_cleaner import XhtmlCleaner
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def render_html(options: RenderOptions) -> RenderedPage:
    """Render ``options.url`` with a fresh ``HtmlRenderer``.

    Function-style entry point kept for backward compatibility; it simply
    delegates to ``HtmlRenderer.render``.

    Args:
        options: Rendering configuration passed through unchanged.

    Returns:
        Whatever ``HtmlRenderer.render`` returns for ``options``.
    """
    renderer = HtmlRenderer()
    return renderer.render(options)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def clean_xhtml(xhtml: str, options: CleanupOptions) -> str:
    """Clean ``xhtml`` with a fresh ``XhtmlCleaner``.

    Function-style entry point kept for backward compatibility; it simply
    delegates to ``XhtmlCleaner.clean``.

    Args:
        xhtml: Markup to clean.
        options: Cleanup configuration passed through unchanged.

    Returns:
        Whatever ``XhtmlCleaner.clean`` returns for the given arguments.
    """
    cleaner = XhtmlCleaner()
    return cleaner.clean(xhtml, options)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"CleanupOptions",
|
|
24
|
+
"HtmlRenderer",
|
|
25
|
+
"RenderOptions",
|
|
26
|
+
"RenderedPage",
|
|
27
|
+
"XhtmlCleaner",
|
|
28
|
+
"clean_xhtml",
|
|
29
|
+
"render_html",
|
|
30
|
+
]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Browser helper for rendering pages with Playwright."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextlib import AbstractContextManager
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from time import perf_counter
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from playwright.sync_api import Error as PlaywrightError
|
|
11
|
+
from playwright.sync_api import sync_playwright
|
|
12
|
+
except ImportError: # pragma: no cover - runtime dependency
|
|
13
|
+
sync_playwright = None
|
|
14
|
+
PlaywrightError = Exception
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class PageFetch:
    """Value object bundling the outcome of a single page navigation."""

    # Navigation response object, or None when no response was obtained.
    response: object | None
    # Navigation duration in milliseconds.
    elapsed_ms: float
    # One dict per sub-resource response observed during the navigation.
    resources: list[dict]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Browser(AbstractContextManager):
    """Context manager owning one Playwright Chromium browser and context.

    Entering starts the Playwright driver, launches Chromium, creates a
    browser context with fixed headers and an init script that masks common
    automation fingerprints. Leaving closes everything again, even when one
    of the close steps fails.
    """

    _DEFAULT_HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://wordlift.io",
        "Upgrade-Insecure-Requests": "1",
        "Sec-CH-UA": '"Not A(Brand";v="99", "Chromium";v="120", "Google Chrome";v="120"',
        "Sec-CH-UA-Mobile": "?0",
        "Sec-CH-UA-Platform": '"macOS"',
    }

    # Script injected into every page of the context before page scripts run.
    # ``permissions.query`` is a native method, so the saved reference must be
    # bound to its receiver; calling it unbound fails with "Illegal invocation"
    # for every query other than 'notifications'.
    _INIT_SCRIPT = """
        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
        window.chrome = window.chrome || { runtime: {} };
        const originalQuery = window.navigator.permissions.query.bind(
            window.navigator.permissions
        );
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications'
                ? Promise.resolve({ state: Notification.permission })
                : originalQuery(parameters)
        );
        """

    def __init__(
        self,
        *,
        headless: bool,
        timeout_ms: int,
        wait_until: str,
        locale: str | None = None,
        user_agent: str | None = None,
        viewport_width: int | None = None,
        viewport_height: int | None = None,
        ignore_https_errors: bool = False,
    ) -> None:
        """Store the launch/navigation settings; nothing is started here.

        Args:
            headless: Passed to ``chromium.launch``.
            timeout_ms: Navigation timeout handed to ``page.goto``.
            wait_until: Load-state name handed to ``page.goto``.
            locale: Context locale; ``"en-US"`` is used when falsy.
            user_agent: Optional user-agent override for the context.
            viewport_width: Viewport width; used only with a height.
            viewport_height: Viewport height; used only with a width.
            ignore_https_errors: Passed to the browser context.
        """
        self.headless = headless
        self.timeout_ms = timeout_ms
        self.wait_until = wait_until
        self.locale = locale
        self.user_agent = user_agent
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.ignore_https_errors = ignore_https_errors
        self._playwright = None
        self._browser = None
        self._context = None

    def __enter__(self) -> "Browser":
        """Start Playwright, launch Chromium and create the context.

        Raises:
            RuntimeError: If Playwright could not be imported.
        """
        if sync_playwright is None:
            raise RuntimeError(
                "Playwright is not installed. Run: uv pip install playwright && playwright install"
            )
        try:
            self._playwright = sync_playwright().start()
            self._browser = self._playwright.chromium.launch(headless=self.headless)
            self._context = self._browser.new_context(**self._context_kwargs())
            self._context.add_init_script(self._INIT_SCRIPT)
        except BaseException:
            # __exit__ is never invoked when __enter__ raises, so release
            # whatever was already started before propagating the error.
            try:
                self._shutdown()
            except Exception:
                pass  # keep the original start-up error as the one reported
            raise
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the context, the browser and the Playwright driver."""
        self._shutdown()

    def _context_kwargs(self) -> dict[str, object]:
        """Build the keyword arguments for ``browser.new_context``."""
        if self.viewport_width and self.viewport_height:
            viewport = {"width": self.viewport_width, "height": self.viewport_height}
        else:
            viewport = {"width": 1365, "height": 768}
        kwargs: dict[str, object] = {
            "locale": self.locale or "en-US",
            "timezone_id": "America/New_York",
            "viewport": viewport,
            "ignore_https_errors": self.ignore_https_errors,
            # Copy so the context never shares the class-level dict.
            "extra_http_headers": dict(self._DEFAULT_HEADERS),
        }
        if self.user_agent:
            kwargs["user_agent"] = self.user_agent
        return kwargs

    def _shutdown(self) -> None:
        """Release context, browser and driver; later steps run even if earlier ones fail."""
        context, browser, playwright = self._context, self._browser, self._playwright
        # Forget the handles first so a second call is a no-op and open()
        # reports "not initialized" instead of using a closed context.
        self._context = self._browser = self._playwright = None
        try:
            if context is not None:
                context.close()
        finally:
            try:
                if browser is not None:
                    browser.close()
            finally:
                if playwright is not None:
                    playwright.stop()

    def open(self, url: str) -> tuple[object | None, object | None, float, list[dict]]:
        """Open ``url`` in a new page and record the responses it triggers.

        Playwright navigation errors are tolerated: the page is still
        returned (with ``response`` set to ``None``) so the caller can read
        whatever was loaded. The caller is responsible for closing the page.

        Args:
            url: Address to navigate to.

        Returns:
            ``(page, response, elapsed_ms, resources)`` where ``resources``
            holds one ``{"url", "status", "resource_type"}`` dict per
            response seen by the page.

        Raises:
            RuntimeError: If the browser context has not been created.
        """
        if self._context is None:
            raise RuntimeError("Browser not initialized")
        page = self._context.new_page()
        resources: list[dict] = []

        def handle_response(resp) -> None:
            # Resource bookkeeping is best-effort; a response that cannot be
            # inspected is skipped rather than failing the navigation.
            try:
                request = resp.request
                resources.append(
                    {
                        "url": resp.url,
                        "status": resp.status,
                        "resource_type": request.resource_type,
                    }
                )
            except Exception:
                return

        response = None
        try:
            page.on("response", handle_response)
            start = perf_counter()
            try:
                response = page.goto(
                    url, wait_until=self.wait_until, timeout=self.timeout_ms
                )
            except PlaywrightError:
                # Deliberate: timeouts and navigation failures still leave a
                # usable page, which is reported with response=None.
                pass
            elapsed_ms = (perf_counter() - start) * 1000
        except BaseException:
            # The caller never receives the page on this path, so close it
            # here instead of leaking it until the context is torn down.
            try:
                page.close()
            except Exception:
                pass
            raise
        return page, response, elapsed_ms, resources
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Cleanup options for XHTML sanitization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class CleanupOptions:
    """Tunable limits and tag blacklist for XHTML cleanup."""

    # Upper bound on document size, in characters.
    max_xhtml_chars: int = 40000
    # Upper bound on the length of a single text node, in characters.
    max_text_node_chars: int = 400
    # Element names to remove from the document.
    remove_tags: tuple[str, ...] = (
        "script", "style", "noscript",
        "svg", "canvas", "iframe",
        "form", "input", "button",
        "nav", "aside",
    )
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""HTML rendering and XHTML conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from wordlift_sdk.utils import HtmlConverter
|
|
10
|
+
|
|
11
|
+
from .browser import Browser
|
|
12
|
+
|
|
13
|
+
from .render_options import RenderOptions
|
|
14
|
+
from .rendered_page import RenderedPage
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HtmlRenderer:
    """Loads a page in a browser and produces its HTML and XHTML forms."""

    def render(self, options: RenderOptions) -> RenderedPage:
        """Load ``options.url`` and return its HTML together with XHTML.

        HTTPS errors are ignored when requested in ``options`` or when the
        URL points at a ``localhost`` host.

        Args:
            options: URL plus browser settings (headless, timeout, locale,
                user agent, viewport, HTTPS handling).

        Returns:
            A ``RenderedPage`` with the page HTML, its XHTML conversion, the
            navigation status code (``None`` when unavailable) and the
            recorded resources.

        Raises:
            RuntimeError: If the browser returns no page.
        """
        skip_tls_checks = options.ignore_https_errors or self._is_localhost_url(
            options.url
        )
        with Browser(
            headless=options.headless,
            timeout_ms=options.timeout_ms,
            wait_until=options.wait_until,
            locale=options.locale,
            user_agent=options.user_agent,
            viewport_width=options.viewport_width,
            viewport_height=options.viewport_height,
            ignore_https_errors=skip_tls_checks,
        ) as browser:
            opened = browser.open(options.url)
            page, response, _elapsed_ms, resources = opened
            if page is None:
                raise RuntimeError("Failed to open page in browser.")
            try:
                html = self._safe_page_content(page, options.timeout_ms)
            finally:
                # The page is always released, whether or not content was read.
                page.close()

        xhtml = HtmlConverter().convert(html)

        # Reading the status is best-effort; any failure yields None.
        try:
            status_code = None if response is None else response.status
        except Exception:
            status_code = None

        return RenderedPage(
            html=html, xhtml=xhtml, status_code=status_code, resources=resources
        )

    def _is_localhost_url(self, url: str) -> bool:
        """Return True when the URL's host is ``localhost`` or a ``*.localhost`` name."""
        try:
            hostname = urlparse(url).hostname
        except Exception:
            # Unparseable URLs are treated as non-local.
            return False
        host = (hostname or "").lower()
        if host == "localhost":
            return True
        return host.endswith(".localhost")

    def _safe_page_content(self, page: Any, timeout_ms: int, retries: int = 3) -> str:
        """Read ``page.content()``, retrying while the page is still settling.

        After a failed read the page is given a chance to reach
        ``networkidle`` (falling back to ``load``), followed by a short
        pause, before the next attempt. The error of the final attempt is
        re-raised.

        Args:
            page: Object exposing ``content()`` and ``wait_for_load_state()``.
            timeout_ms: Timeout for each load-state wait.
            retries: Number of additional attempts after the first one.

        Returns:
            The page's HTML content.
        """
        attempts_left = retries + 1
        while attempts_left > 0:
            attempts_left -= 1
            try:
                return page.content()
            except Exception:
                if attempts_left == 0:
                    raise
                self._wait_for_settle(page, timeout_ms)
                time.sleep(0.2)
        # Only reached when no attempt was made (negative ``retries``).
        return page.content()

    def _wait_for_settle(self, page: Any, timeout_ms: int) -> None:
        """Wait for ``networkidle``; fall back to ``load``; ignore wait failures."""
        for state in ("networkidle", "load"):
            try:
                page.wait_for_load_state(state, timeout=timeout_ms)
            except Exception:
                continue
            return
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Render options for HTML to XHTML pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class RenderOptions:
    """Settings describing which URL to render and how to drive the browser."""

    # Address of the page to render (the only required field).
    url: str
    # Whether the browser runs without a visible window.
    headless: bool = True
    # Timeout in milliseconds.
    timeout_ms: int = 30000
    # Name of the load state to wait for.
    wait_until: str = "networkidle"
    # Browser locale.
    locale: str = "en-US"
    # User-agent string; None leaves it unset.
    user_agent: str | None = (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    # Viewport size in pixels.
    viewport_width: int = 1365
    viewport_height: int = 768
    # Whether HTTPS certificate errors are ignored.
    ignore_https_errors: bool = False
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Rendered page data returned by the renderer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class RenderedPage:
    """Result of rendering a page: markup, status and observed resources."""

    # HTML of the rendered page.
    html: str
    # XHTML form of ``html``.
    xhtml: str
    # Status code of the page; None when not available.
    status_code: int | None = None
    # Resource dicts for the page; each instance gets its own fresh list.
    resources: list[dict] = field(default_factory=list)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""XHTML cleanup utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from .cleanup_options import CleanupOptions
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class XhtmlCleaner:
    """Cleans and optimizes XHTML content."""

    def clean(self, xhtml: str, options: CleanupOptions) -> str:
        """
        Clean an XHTML string based on the provided options.

        The document is parsed with lxml, unwanted tags are removed, text
        nodes are whitespace-compacted and truncated, the overall text is
        capped, and - if the serialized result is still too long - trailing
        elements are dropped until it fits.

        Args:
            xhtml: The XHTML string to clean.
            options: Configuration for cleaning (tags to remove, max chars, etc.).

        Returns:
            The cleaned XHTML string.

        Raises:
            RuntimeError: If lxml cannot be imported.
        """
        try:
            from lxml import html as lxml_html
        except Exception as exc:
            raise RuntimeError(
                "lxml is required for XHTML cleanup. Install with: pip install lxml"
            ) from exc
        parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
        doc = lxml_html.document_fromstring(xhtml, parser=parser)
        self._strip_unwanted_tags(doc, options.remove_tags)
        self._compact_text_nodes(doc, options.max_text_node_chars)
        self._cap_text_content(doc, options.max_xhtml_chars)
        cleaned = lxml_html.tostring(doc, encoding="unicode", method="xml")
        if len(cleaned) > options.max_xhtml_chars:
            # Markup alone can exceed the budget even after the text is capped.
            self._trim_elements_to_size(doc, options.max_xhtml_chars)
            cleaned = lxml_html.tostring(doc, encoding="unicode", method="xml")
        return cleaned

    def _strip_unwanted_tags(self, doc: Any, tags: tuple[str, ...]) -> None:
        """Remove every element named in ``tags`` while keeping the text after it.

        In lxml the text that follows an element is stored as that element's
        ``tail``, so a plain ``parent.remove(element)`` would silently delete
        real page text. The tail is therefore re-attached to the previous
        sibling (or to the parent's text) before the element is removed.
        """
        if not tags:
            return
        tag_expr = " | ".join(f"//{tag}" for tag in tags)
        for element in doc.xpath(tag_expr):
            parent = element.getparent()
            if parent is None:
                continue
            tail = element.tail
            if tail:
                previous = element.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            parent.remove(element)

    def _compact_text(self, value: str | None, max_chars: int) -> str | None:
        """Collapse whitespace in ``value`` and truncate it to ``max_chars``.

        Returns ``None`` for ``None`` or whitespace-only input. When
        ``max_chars`` is positive and the text is longer, it is cut and,
        if there is room (more than 3 characters), suffixed with ``"..."``.
        """
        if value is None:
            return None
        text = re.sub(r"\s+", " ", value).strip()
        if not text:
            return None
        if max_chars > 0 and len(text) > max_chars:
            if max_chars <= 3:
                # No room for an ellipsis; hard cut.
                text = text[:max_chars]
            else:
                text = text[: max_chars - 3].rstrip() + "..."
        return text

    def _compact_text_nodes(self, doc: Any, max_chars: int) -> None:
        """Apply ``_compact_text`` to the text and tail of every node in ``doc``."""
        for element in doc.iter():
            if hasattr(element, "text"):
                element.text = self._compact_text(element.text, max_chars)
            if hasattr(element, "tail"):
                element.tail = self._compact_text(element.tail, max_chars)

    def _cap_text_content(self, doc: Any, max_chars: int) -> None:
        """Limit the total text kept in ``doc`` to ``max_chars`` characters.

        Nodes are visited in ``doc.iter()`` order; each node's text and then
        its tail consume the budget. The string that crosses the limit is
        truncated and everything visited afterwards is cleared. A
        non-positive ``max_chars`` disables the cap.
        """
        if max_chars <= 0:
            return
        remaining = max_chars
        for element in doc.iter():
            text = getattr(element, "text", None)
            if text:
                if len(text) <= remaining:
                    remaining -= len(text)
                else:
                    element.text = text[: max(0, remaining)].rstrip()
                    remaining = 0
                if remaining <= 0:
                    # The element's own tail comes after its text in the
                    # budget order, so it must go as well.
                    if getattr(element, "tail", None):
                        element.tail = None
                    self._clear_text_after(doc, element)
                    break
            tail = getattr(element, "tail", None)
            if tail:
                if len(tail) <= remaining:
                    remaining -= len(tail)
                else:
                    element.tail = tail[: max(0, remaining)].rstrip()
                    remaining = 0
                if remaining <= 0:
                    self._clear_text_after(doc, element)
                    break

    def _clear_text_after(self, doc: Any, stop_element: Any) -> None:
        """Blank the text and tail of every node iterated after ``stop_element``."""
        seen = False
        for element in doc.iter():
            if element is stop_element:
                seen = True
                continue
            if not seen:
                continue
            if hasattr(element, "text") and element.text:
                element.text = None
            if hasattr(element, "tail") and element.tail:
                element.tail = None

    def _trim_elements_to_size(self, doc: Any, max_chars: int) -> None:
        """Drop elements from the end of ``doc`` until it serializes within ``max_chars``.

        Elements are removed one at a time in reverse iteration order and
        the document is re-serialized after each removal. The root is never
        removed. Does nothing when ``max_chars`` is non-positive or lxml is
        unavailable.
        """
        if max_chars <= 0:
            return
        try:
            from lxml import html as lxml_html
        except Exception:
            return
        elements = list(doc.iter())
        for element in reversed(elements):
            parent = element.getparent()
            if parent is None:
                continue
            parent.remove(element)
            current = lxml_html.tostring(doc, encoding="unicode", method="xml")
            if len(current) <= max_chars:
                return
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Structured data workflows and utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .agent_generator import AgentGenerator
|
|
6
|
+
from .dataset_resolver import DatasetResolver
|
|
7
|
+
from .engine import StructuredDataOptions, StructuredDataResult
|
|
8
|
+
from .schema_guide import SchemaGuide
|
|
9
|
+
from .structured_data_engine import StructuredDataEngine
|
|
10
|
+
from .yarrrml_pipeline import YarrrmlPipeline
|
|
11
|
+
from .models import CreateRequest, GenerateRequest
|
|
12
|
+
from .orchestrator import CreateWorkflow, GenerateWorkflow, resolve_api_key_from_context
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"CreateRequest",
|
|
16
|
+
"CreateWorkflow",
|
|
17
|
+
"GenerateRequest",
|
|
18
|
+
"GenerateWorkflow",
|
|
19
|
+
"resolve_api_key_from_context",
|
|
20
|
+
"AgentGenerator",
|
|
21
|
+
"DatasetResolver",
|
|
22
|
+
"SchemaGuide",
|
|
23
|
+
"StructuredDataEngine",
|
|
24
|
+
"StructuredDataOptions",
|
|
25
|
+
"StructuredDataResult",
|
|
26
|
+
"YarrrmlPipeline",
|
|
27
|
+
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Agent-backed structured data generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from wordlift_sdk.structured_data.agent_generator import (
|
|
9
|
+
AgentGenerator as EngineAgentGenerator,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AgentGenerator:
    """Facade that produces YARRRML and JSON-LD by delegating to an engine."""

    def __init__(self, engine: EngineAgentGenerator | None = None) -> None:
        """Use ``engine`` when given (and truthy); otherwise create a default one."""
        self._engine = engine or EngineAgentGenerator()

    def generate(
        self,
        url: str,
        html: str,
        xhtml: str,
        cleaned_xhtml: str,
        api_key: str,
        dataset_uri: str,
        target_type: str,
        workdir: Path,
        debug: bool,
        max_retries: int,
        max_nesting_depth: int,
        quality_check: bool,
        log: Callable[[str], None],
    ) -> tuple[str, dict]:
        """Forward all arguments to the engine's ``generate_from_agent``.

        The first eight arguments are passed positionally, in declaration
        order; the remaining five are passed by keyword. The engine's result
        is returned unchanged.
        """
        positional = (
            url,
            html,
            xhtml,
            cleaned_xhtml,
            api_key,
            dataset_uri,
            target_type,
            workdir,
        )
        keyword = {
            "debug": debug,
            "max_retries": max_retries,
            "max_nesting_depth": max_nesting_depth,
            "quality_check": quality_check,
            "log": log,
        }
        return self._engine.generate_from_agent(*positional, **keyword)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Agent-backed generation utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .engine import generate_from_agent
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AgentGenerator:
    """Object-style wrapper around the module-level ``generate_from_agent``."""

    def generate_from_agent(self, *args, **kwargs):
        """Call the imported ``generate_from_agent`` with the same arguments.

        Positional and keyword arguments are forwarded untouched and the
        function's result is returned as-is.
        """
        outcome = generate_from_agent(*args, **kwargs)
        return outcome
|