wordlift-sdk 2.7.6__tar.gz → 2.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/PKG-INFO +6 -1
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/README.md +5 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/pyproject.toml +1 -1
- wordlift_sdk-2.9.0/wordlift_sdk/google_sheets/__init__.py +3 -0
- wordlift_sdk-2.9.0/wordlift_sdk/google_sheets/google_sheets_lookup.py +121 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/__init__.py +6 -2
- wordlift_sdk-2.9.0/wordlift_sdk/utils/html_converter.py +56 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/client/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/client/client_configuration_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/configuration_provider.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/get_config_value.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/container/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/container/application_container.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/deprecated/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/enrich.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/patch.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/graph_bag.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/ttl_liquid/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/client.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/gql_client_provider.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_by_type.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_embedding_value.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_top_query.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_id.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_iri.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/query.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_top_query.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/id_generator.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/id_generator_interface.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/internal_link/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/internal_link/utils.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity_store.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity_store_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/relation/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/relation/relation_service.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/main.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/namespace/SDO.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/namespace/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/notebook/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/notebook/install_if_missing.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/context.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/graph/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/graph/graph_queue.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/load_override_class.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/web_page_import_protocol.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/google_sheets_url_source.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/list_url_source.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/new_or_changed_url_source.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/sitemap_url_source.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/url_source.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/url_source_input.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_url_iri.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_entity_patch_request.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/delayed.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/get_me.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/import_url.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/create_or_update_entities_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/kg_import_workflow.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/patch_entities_factory.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/__init__.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/default_url_handler.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/url_handler.py +0 -0
- {wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wordlift-sdk
|
|
3
|
-
Version: 2.7.6
|
|
3
|
+
Version: 2.9.0
|
|
4
4
|
Summary:
|
|
5
5
|
Author: David Riccitelli
|
|
6
6
|
Author-email: david@wordlift.io
|
|
@@ -123,3 +123,8 @@ poetry install --with dev
|
|
|
123
123
|
poetry run pytest
|
|
124
124
|
```
|
|
125
125
|
|
|
126
|
+
## Documentation
|
|
127
|
+
|
|
128
|
+
- [Google Sheets Lookup](docs/google_sheets_lookup.md): Utility for O(1) lookups from Google Sheets.
|
|
129
|
+
|
|
130
|
+
|
|
@@ -96,3 +96,8 @@ Add `.ttl.liquid` files under `data/templates`. Templates render with `account`
|
|
|
96
96
|
poetry install --with dev
|
|
97
97
|
poetry run pytest
|
|
98
98
|
```
|
|
99
|
+
|
|
100
|
+
## Documentation
|
|
101
|
+
|
|
102
|
+
- [Google Sheets Lookup](docs/google_sheets_lookup.md): Utility for O(1) lookups from Google Sheets.
|
|
103
|
+
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional, Any, Dict
|
|
4
|
+
|
|
5
|
+
import gspread
|
|
6
|
+
from wordlift_sdk.configuration import ConfigurationProvider
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GoogleSheetsLookup:
    """
    A generic class to look up values from a Google Sheet.

    The worksheet is fetched once during construction and cached in a
    dictionary keyed by the stringified key-column value, so every later
    ``get_value`` call is a plain dictionary lookup with no network access.
    """

    def __init__(
        self,
        spreadsheet_url: str,
        sheet_name: str,
        key_column: str,
        value_column: str,
        configuration_provider: "ConfigurationProvider",
        service_account_file: Optional[str] = None,
    ):
        """
        Initialize the GoogleSheetsLookup and preload the sheet data.

        :param spreadsheet_url: The URL of the Google Sheet.
        :param sheet_name: The name of the specific worksheet (tab).
        :param key_column: The header name of the column to use as keys.
        :param value_column: The header name of the column to use as values.
        :param configuration_provider: The ConfigurationProvider instance.
        :param service_account_file: Optional path to the service account JSON file.
                                     If not provided, it will be looked up in the
                                     configuration and then in the environment.
        :raises Exception: Whatever error occurred while connecting to or reading
                           the sheet; it is logged and re-raised unchanged.
        """
        self.spreadsheet_url = spreadsheet_url
        self.sheet_name = sheet_name
        self.key_column = key_column
        self.value_column = value_column
        self.configuration_provider = configuration_provider
        self.service_account_file = service_account_file

        self._data: Dict[str, Any] = {}
        self._load_data()

    def _resolve_service_account_file(self) -> Optional[str]:
        """
        Resolve the service account file path.

        Priority:
        1. Argument passed to __init__
        2. 'SERVICE_ACCOUNT_FILE' from ConfigurationProvider
        3. 'SERVICE_ACCOUNT_FILE' environment variable

        :return: The first non-empty path found, or None when no source provides one.
        """
        if self.service_account_file:
            return self.service_account_file

        configured = self.configuration_provider.get_value("SERVICE_ACCOUNT_FILE")
        if configured:
            return configured

        # Only reached when the provider returned nothing usable; previously this
        # fallback sat behind an unconditional return and could never run.
        return os.getenv("SERVICE_ACCOUNT_FILE")

    def _load_data(self):
        """
        Connect to Google Sheets and preload the data into the lookup dictionary.

        :raises Exception: Any error raised while authenticating, opening the
                           spreadsheet or reading the worksheet is logged and re-raised.
        """
        credentials_file = self._resolve_service_account_file()

        try:
            if credentials_file:
                logger.info(
                    f"Connecting to Google Sheets using credentials: {credentials_file}"
                )
                gc = gspread.service_account(filename=credentials_file)
            else:
                logger.info(
                    "Connecting to Google Sheets using default environment credentials"
                )
                # No filename: gspread falls back to its own default credentials lookup.
                gc = gspread.service_account()

            # Open spreadsheet by URL
            sh = gc.open_by_url(self.spreadsheet_url)

            # Select worksheet
            worksheet = sh.worksheet(self.sheet_name)

            # Get all records
            records = worksheet.get_all_records()
            logger.info(
                f"Fetched {len(records)} records from sheet '{self.sheet_name}'"
            )

            self._cache_records(records)

            logger.info(f"Successfully loaded {len(self._data)} items into cache.")

        except Exception as e:
            logger.error(f"Failed to load data from Google Sheets: {e}")
            raise

    def _cache_records(self, records) -> None:
        """
        Store each record's value in the lookup dictionary under its stringified key.

        Rows whose key column is absent or blank are skipped with a warning.

        :param records: Iterable of row dictionaries mapping header names to cell values.
        """
        # Reported row numbers start at 2: sheet rows are 1-based and the first
        # row is presumably the header row consumed by get_all_records.
        for row_number, record in enumerate(records, start=2):
            key = record.get(self.key_column)

            # Blank cells arrive as empty strings rather than None, so both must be
            # rejected to honour the "missing or empty" contract. A numeric 0 is a
            # legitimate key and is kept.
            if key is None or key == "":
                logger.warning(
                    f"Row {row_number}: Key column '{self.key_column}' is missing or empty. Skipping."
                )
                continue

            # Keys are stringified so lookups behave the same for mixed cell types.
            self._data[str(key)] = record.get(self.value_column)

    def get_value(self, key: Any) -> Optional[Any]:
        """
        Look up a value by its key.

        :param key: The key to look up; it is stringified before the lookup.
        :return: The corresponding value, or None if not found.
        """
        return self._data.get(str(key))
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from .create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
|
|
2
2
|
from .create_dataframe_of_entities_by_types import create_dataframe_of_entities_by_types
|
|
3
|
-
from .create_dataframe_of_entities_with_embedding_vectors import
|
|
3
|
+
from .create_dataframe_of_entities_with_embedding_vectors import (
|
|
4
|
+
create_dataframe_of_entities_with_embedding_vectors,
|
|
5
|
+
)
|
|
4
6
|
from .create_dataframe_of_url_iri import create_dataframe_of_url_iri
|
|
5
7
|
from .create_entity_patch_request import create_entity_patch_request
|
|
6
8
|
from .delayed import create_delayed
|
|
7
9
|
from .get_me import get_me
|
|
10
|
+
from .html_converter import HtmlConverter
|
|
8
11
|
|
|
9
12
|
__all__ = [
|
|
10
13
|
"create_dataframe_from_google_sheets",
|
|
@@ -13,5 +16,6 @@ __all__ = [
|
|
|
13
16
|
"create_dataframe_of_url_iri",
|
|
14
17
|
"create_entity_patch_request",
|
|
15
18
|
"create_delayed",
|
|
16
|
-
"get_me"
|
|
19
|
+
"get_me",
|
|
20
|
+
"HtmlConverter",
|
|
17
21
|
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""HTML to XHTML conversion utility."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
_INVALID_XML_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
|
|
9
|
+
_XML_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_.:-]*$")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HtmlConverter:
    """Turns an HTML document into an XHTML string."""

    def convert(self, html: str) -> str:
        """
        Produce an XHTML serialization of the given HTML.

        Any DOCTYPE declaration and XML-invalid control characters are removed
        before parsing; the parsed tree then has its attributes sanitized and
        is serialized as XML, with control characters stripped once more.

        Args:
            html: The raw HTML string.

        Returns:
            A sanitized XHTML string.

        Raises:
            ImportError: If lxml is not installed.
            RuntimeError: If parsing, sanitizing or serializing fails.
        """
        without_doctype = re.sub(r"<!DOCTYPE[^>]*>", "", html, flags=re.IGNORECASE)
        cleaned = self._strip_invalid_xml_chars(without_doctype)

        # lxml is imported lazily so the dependency is only needed when converting.
        try:
            from lxml import html as lxml_html
        except ImportError as exc:
            raise ImportError(
                "lxml is required for XHTML output. Install with: pip install lxml"
            ) from exc

        try:
            tree = lxml_html.document_fromstring(
                cleaned,
                parser=lxml_html.HTMLParser(encoding="utf-8", recover=True),
            )
            self._sanitize_xhtml_tree(tree)
            serialized = lxml_html.tostring(tree, encoding="unicode", method="xml")
            return self._strip_invalid_xml_chars(serialized)
        except Exception as exc:
            raise RuntimeError("Failed to convert HTML to XHTML.") from exc

    def _strip_invalid_xml_chars(self, value: str) -> str:
        """Return ``value`` with every match of the invalid-character pattern removed."""
        return _INVALID_XML_CHARS_RE.sub("", value)

    def _sanitize_xhtml_tree(self, doc: Any) -> None:
        """
        Clean the attributes of every node in the tree, in place.

        Attributes whose name does not match the XML-name pattern are deleted;
        string values of the remaining attributes have invalid characters stripped.
        """
        for node in doc.iter():
            if hasattr(node, "attrib"):
                # Iterate over a snapshot because attributes may be deleted below.
                for name in list(node.attrib):
                    if _XML_NAME_RE.match(name):
                        current = node.attrib.get(name)
                        if isinstance(current, str):
                            node.attrib[name] = self._strip_invalid_xml_chars(current)
                    else:
                        del node.attrib[name]
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/client/client_configuration_factory.py
RENAMED
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/configuration_provider.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/gql_client_provider.py
RENAMED
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_by_type.graphql
RENAMED
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_embedding_value.graphql
RENAMED
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_top_query.graphql
RENAMED
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_iri.graphql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_top_query.py
RENAMED
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_with_top_query.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/id_generator_interface.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch.py
RENAMED
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/google_sheets_url_source.py
RENAMED
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/new_or_changed_url_source.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_from_google_sheets.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py
RENAMED
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/create_or_update_entities_factory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wordlift_sdk-2.7.6 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/default_url_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|