wordlift-sdk 2.8.0__tar.gz → 2.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/PKG-INFO +1 -1
  2. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/pyproject.toml +1 -1
  3. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/__init__.py +6 -2
  4. wordlift_sdk-2.9.0/wordlift_sdk/utils/html_converter.py +56 -0
  5. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/README.md +0 -0
  6. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/__init__.py +0 -0
  7. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/client/__init__.py +0 -0
  8. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/client/client_configuration_factory.py +0 -0
  9. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/__init__.py +0 -0
  10. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/configuration_provider.py +0 -0
  11. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/configuration/get_config_value.py +0 -0
  12. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/container/__init__.py +0 -0
  13. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/container/application_container.py +0 -0
  14. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/deprecated/__init__.py +0 -0
  15. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +0 -0
  16. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/__init__.py +0 -0
  17. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/enrich.py +0 -0
  18. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/entity/patch.py +0 -0
  19. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/__init__.py +0 -0
  20. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +0 -0
  21. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +0 -0
  22. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/google_sheets/__init__.py +0 -0
  23. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/google_sheets/google_sheets_lookup.py +0 -0
  24. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/graph_bag.py +0 -0
  25. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/ttl_liquid/__init__.py +0 -0
  26. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +0 -0
  27. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/__init__.py +0 -0
  28. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/__init__.py +0 -0
  29. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/client.py +0 -0
  30. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/factory.py +0 -0
  31. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/client/gql_client_provider.py +0 -0
  32. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_by_type.graphql +0 -0
  33. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_embedding_value.graphql +0 -0
  34. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_top_query.graphql +0 -0
  35. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_id.graphql +0 -0
  36. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_iri.graphql +0 -0
  37. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +0 -0
  38. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/query.py +0 -0
  39. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/__init__.py +0 -0
  40. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/__init__.py +0 -0
  41. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_top_query.py +0 -0
  42. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +0 -0
  43. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/__init__.py +0 -0
  44. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/id_generator.py +0 -0
  45. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/id_generator/id_generator_interface.py +0 -0
  46. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/internal_link/__init__.py +0 -0
  47. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/internal_link/utils.py +0 -0
  48. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/__init__.py +0 -0
  49. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity.py +0 -0
  50. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity_store.py +0 -0
  51. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/entity_store_factory.py +0 -0
  52. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/relation/__init__.py +0 -0
  53. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/kg/relation/relation_service.py +0 -0
  54. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/main.py +0 -0
  55. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/namespace/SDO.py +0 -0
  56. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/namespace/__init__.py +0 -0
  57. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/notebook/__init__.py +0 -0
  58. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/notebook/install_if_missing.py +0 -0
  59. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/__init__.py +0 -0
  60. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/context.py +0 -0
  61. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/__init__.py +0 -0
  62. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch.py +0 -0
  63. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +0 -0
  64. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/graph/__init__.py +0 -0
  65. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/graph/graph_queue.py +0 -0
  66. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/load_override_class.py +0 -0
  67. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/protocol/web_page_import_protocol.py +0 -0
  68. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/__init__.py +0 -0
  69. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/google_sheets_url_source.py +0 -0
  70. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/list_url_source.py +0 -0
  71. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/new_or_changed_url_source.py +0 -0
  72. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/sitemap_url_source.py +0 -0
  73. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/url_source.py +0 -0
  74. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/url_source/url_source_input.py +0 -0
  75. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +0 -0
  76. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +0 -0
  77. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +0 -0
  78. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_dataframe_of_url_iri.py +0 -0
  79. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/create_entity_patch_request.py +0 -0
  80. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/delayed.py +0 -0
  81. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/get_me.py +0 -0
  82. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/utils/import_url.py +0 -0
  83. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/__init__.py +0 -0
  84. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/__init__.py +0 -0
  85. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +0 -0
  86. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +0 -0
  87. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  88. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +0 -0
  89. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +0 -0
  90. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +0 -0
  91. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +0 -0
  92. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +0 -0
  93. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +0 -0
  94. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +0 -0
  95. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/__init__.py +0 -0
  96. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/create_or_update_entities_factory.py +0 -0
  97. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/kg_import_workflow.py +0 -0
  98. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/patch_entities_factory.py +0 -0
  99. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/__init__.py +0 -0
  100. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/default_url_handler.py +0 -0
  101. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +0 -0
  102. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/url_handler.py +0 -0
  103. {wordlift_sdk-2.8.0 → wordlift_sdk-2.9.0}/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wordlift-sdk
3
- Version: 2.8.0
3
+ Version: 2.9.0
4
4
  Summary:
5
5
  Author: David Riccitelli
6
6
  Author-email: david@wordlift.io
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "wordlift-sdk"
3
- version = "2.8.0"
3
+ version = "2.9.0"
4
4
  description = ""
5
5
  authors = ["David Riccitelli <david@wordlift.io>"]
6
6
  readme = "README.md"
@@ -1,10 +1,13 @@
1
1
  from .create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
2
2
  from .create_dataframe_of_entities_by_types import create_dataframe_of_entities_by_types
3
- from .create_dataframe_of_entities_with_embedding_vectors import create_dataframe_of_entities_with_embedding_vectors
3
+ from .create_dataframe_of_entities_with_embedding_vectors import (
4
+ create_dataframe_of_entities_with_embedding_vectors,
5
+ )
4
6
  from .create_dataframe_of_url_iri import create_dataframe_of_url_iri
5
7
  from .create_entity_patch_request import create_entity_patch_request
6
8
  from .delayed import create_delayed
7
9
  from .get_me import get_me
10
+ from .html_converter import HtmlConverter
8
11
 
9
12
  __all__ = [
10
13
  "create_dataframe_from_google_sheets",
@@ -13,5 +16,6 @@ __all__ = [
13
16
  "create_dataframe_of_url_iri",
14
17
  "create_entity_patch_request",
15
18
  "create_delayed",
16
- "get_me"
19
+ "get_me",
20
+ "HtmlConverter",
17
21
  ]
@@ -0,0 +1,56 @@
1
+ """HTML to XHTML conversion utility."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ _INVALID_XML_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
9
+ _XML_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_.:-]*$")
10
+
11
+
12
+ class HtmlConverter:
13
+ """Converts HTML to XHTML."""
14
+
15
+ def convert(self, html: str) -> str:
16
+ """
17
+ Convert an HTML string to a valid XHTML string.
18
+
19
+ Args:
20
+ html: The raw HTML string.
21
+
22
+ Returns:
23
+ A sanitized XHTML string.
24
+ """
25
+ html = re.sub(r"<!DOCTYPE[^>]*>", "", html, flags=re.IGNORECASE)
26
+ html = self._strip_invalid_xml_chars(html)
27
+ try:
28
+ from lxml import html as lxml_html
29
+ except ImportError as exc:
30
+ raise ImportError(
31
+ "lxml is required for XHTML output. Install with: pip install lxml"
32
+ ) from exc
33
+
34
+ try:
35
+ parser = lxml_html.HTMLParser(encoding="utf-8", recover=True)
36
+ doc = lxml_html.document_fromstring(html, parser=parser)
37
+ self._sanitize_xhtml_tree(doc)
38
+ xhtml = lxml_html.tostring(doc, encoding="unicode", method="xml")
39
+ return self._strip_invalid_xml_chars(xhtml)
40
+ except Exception as exc:
41
+ raise RuntimeError("Failed to convert HTML to XHTML.") from exc
42
+
43
+ def _strip_invalid_xml_chars(self, value: str) -> str:
44
+ return _INVALID_XML_CHARS_RE.sub("", value)
45
+
46
+ def _sanitize_xhtml_tree(self, doc: Any) -> None:
47
+ for element in doc.iter():
48
+ if not hasattr(element, "attrib"):
49
+ continue
50
+ for attr in list(element.attrib):
51
+ if not _XML_NAME_RE.match(attr):
52
+ del element.attrib[attr]
53
+ continue
54
+ value = element.attrib.get(attr)
55
+ if isinstance(value, str):
56
+ element.attrib[attr] = self._strip_invalid_xml_chars(value)
File without changes