wordlift-sdk 2.7.5__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/PKG-INFO +7 -2
  2. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/README.md +5 -0
  3. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/pyproject.toml +2 -2
  4. wordlift_sdk-2.8.0/wordlift_sdk/google_sheets/__init__.py +3 -0
  5. wordlift_sdk-2.8.0/wordlift_sdk/google_sheets/google_sheets_lookup.py +121 -0
  6. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/new_or_changed_url_source.py +3 -0
  7. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/__init__.py +0 -0
  8. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/client/__init__.py +0 -0
  9. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/client/client_configuration_factory.py +0 -0
  10. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/configuration/__init__.py +0 -0
  11. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/configuration/configuration_provider.py +0 -0
  12. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/configuration/get_config_value.py +0 -0
  13. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/container/__init__.py +0 -0
  14. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/container/application_container.py +0 -0
  15. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/deprecated/__init__.py +0 -0
  16. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +0 -0
  17. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/entity/__init__.py +0 -0
  18. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/entity/enrich.py +0 -0
  19. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/entity/patch.py +0 -0
  20. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/google_search_console/__init__.py +0 -0
  21. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +0 -0
  22. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +0 -0
  23. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graph/graph_bag.py +0 -0
  24. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graph/ttl_liquid/__init__.py +0 -0
  25. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +0 -0
  26. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/__init__.py +0 -0
  27. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/client/__init__.py +0 -0
  28. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/client/client.py +0 -0
  29. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/client/factory.py +0 -0
  30. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/client/gql_client_provider.py +0 -0
  31. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_by_type.graphql +0 -0
  32. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_embedding_value.graphql +0 -0
  33. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_top_query.graphql +0 -0
  34. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_url_id.graphql +0 -0
  35. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_url_iri.graphql +0 -0
  36. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +0 -0
  37. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/query.py +0 -0
  38. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/utils/__init__.py +0 -0
  39. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/utils/query/__init__.py +0 -0
  40. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/utils/query/entity_top_query.py +0 -0
  41. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +0 -0
  42. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/id_generator/__init__.py +0 -0
  43. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/id_generator/id_generator.py +0 -0
  44. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/id_generator/id_generator_interface.py +0 -0
  45. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/internal_link/__init__.py +0 -0
  46. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/internal_link/utils.py +0 -0
  47. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/__init__.py +0 -0
  48. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/entity.py +0 -0
  49. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/entity_store.py +0 -0
  50. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/entity_store_factory.py +0 -0
  51. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/relation/__init__.py +0 -0
  52. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/kg/relation/relation_service.py +0 -0
  53. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/main.py +0 -0
  54. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/namespace/SDO.py +0 -0
  55. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/namespace/__init__.py +0 -0
  56. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/notebook/__init__.py +0 -0
  57. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/notebook/install_if_missing.py +0 -0
  58. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/__init__.py +0 -0
  59. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/context.py +0 -0
  60. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/entity_patch/__init__.py +0 -0
  61. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/entity_patch/entity_patch.py +0 -0
  62. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +0 -0
  63. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/graph/__init__.py +0 -0
  64. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/graph/graph_queue.py +0 -0
  65. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/load_override_class.py +0 -0
  66. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/protocol/web_page_import_protocol.py +0 -0
  67. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/__init__.py +0 -0
  68. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/google_sheets_url_source.py +0 -0
  69. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/list_url_source.py +0 -0
  70. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/sitemap_url_source.py +0 -0
  71. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/url_source.py +0 -0
  72. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/url_source/url_source_input.py +0 -0
  73. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/__init__.py +0 -0
  74. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +0 -0
  75. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +0 -0
  76. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +0 -0
  77. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/create_dataframe_of_url_iri.py +0 -0
  78. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/create_entity_patch_request.py +0 -0
  79. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/delayed.py +0 -0
  80. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/get_me.py +0 -0
  81. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/utils/import_url.py +0 -0
  82. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/__init__.py +0 -0
  83. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/entity_gaps/__init__.py +0 -0
  84. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +0 -0
  85. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +0 -0
  86. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  87. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +0 -0
  88. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +0 -0
  89. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +0 -0
  90. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +0 -0
  91. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +0 -0
  92. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +0 -0
  93. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +0 -0
  94. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/__init__.py +0 -0
  95. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/create_or_update_entities_factory.py +0 -0
  96. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/kg_import_workflow.py +0 -0
  97. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/patch_entities_factory.py +0 -0
  98. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/url_handler/__init__.py +0 -0
  99. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/url_handler/default_url_handler.py +0 -0
  100. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +0 -0
  101. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/url_handler/url_handler.py +0 -0
  102. {wordlift_sdk-2.7.5 → wordlift_sdk-2.8.0}/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wordlift-sdk
3
- Version: 2.7.5
3
+ Version: 2.8.0
4
4
  Summary:
5
5
  Author: David Riccitelli
6
6
  Author-email: david@wordlift.io
@@ -21,7 +21,7 @@ Requires-Dist: python-liquid (>=2.0.1,<3.0.0)
21
21
  Requires-Dist: rdflib (>=7.0.0,<8.0.0)
22
22
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
23
23
  Requires-Dist: tqdm (>=4.67.1,<5.0.0)
24
- Requires-Dist: wordlift-client (>=1.117.0,<2.0.0)
24
+ Requires-Dist: wordlift-client (>=1.133.0,<2.0.0)
25
25
  Description-Content-Type: text/markdown
26
26
 
27
27
  # WordLift Python SDK
@@ -123,3 +123,8 @@ poetry install --with dev
123
123
  poetry run pytest
124
124
  ```
125
125
 
126
+ ## Documentation
127
+
128
+ - [Google Sheets Lookup](docs/google_sheets_lookup.md): Utility for O(1) lookups from Google Sheets.
129
+
130
+
@@ -96,3 +96,8 @@ Add `.ttl.liquid` files under `data/templates`. Templates render with `account`
96
96
  poetry install --with dev
97
97
  poetry run pytest
98
98
  ```
99
+
100
+ ## Documentation
101
+
102
+ - [Google Sheets Lookup](docs/google_sheets_lookup.md): Utility for O(1) lookups from Google Sheets.
103
+
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "wordlift-sdk"
3
- version = "2.7.5"
3
+ version = "2.8.0"
4
4
  description = ""
5
5
  authors = ["David Riccitelli <david@wordlift.io>"]
6
6
  readme = "README.md"
@@ -15,7 +15,7 @@ aiohttp = "^3.10.5"
15
15
  # Google Colab requires pandas 2.1.4
16
16
  pandas = ">=2.1.4, <2.3.0"
17
17
  rdflib = "^7.0.0"
18
- wordlift-client = "^1.117.0"
18
+ wordlift-client = "^1.133.0"
19
19
  gspread = "^6.1.2"
20
20
  google-auth = "^2.35.0"
21
21
  tqdm = "^4.67.1"
@@ -0,0 +1,3 @@
1
+ from .google_sheets_lookup import GoogleSheetsLookup
2
+
3
+ __all__ = ["GoogleSheetsLookup"]
@@ -0,0 +1,121 @@
1
+ import logging
2
+ import os
3
+ from typing import Optional, Any, Dict
4
+
5
+ import gspread
6
+ from wordlift_sdk.configuration import ConfigurationProvider
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class GoogleSheetsLookup:
    """
    Look up values from a Google Sheet through an in-memory dictionary.

    The worksheet is fetched once, while the instance is being constructed,
    and indexed by the key column. Every later :meth:`get_value` call is a
    plain dictionary lookup and performs no network access.
    """

    def __init__(
        self,
        spreadsheet_url: str,
        sheet_name: str,
        key_column: str,
        value_column: str,
        configuration_provider: "ConfigurationProvider",
        service_account_file: Optional[str] = None,
    ):
        """
        Initialize the GoogleSheetsLookup and preload the worksheet.

        :param spreadsheet_url: The URL of the Google Sheet.
        :param sheet_name: The name of the specific worksheet (tab).
        :param key_column: The header name of the column to use as keys.
        :param value_column: The header name of the column to use as values.
        :param configuration_provider: The ConfigurationProvider instance,
            queried for 'SERVICE_ACCOUNT_FILE' when no explicit file is given.
        :param service_account_file: Optional path to the service account
            JSON file. If not provided, it is looked up in the configuration
            and then in the environment.
        :raises Exception: Whatever the Google Sheets client raises while
            connecting or reading; the error is logged and re-raised.
        """
        self.spreadsheet_url = spreadsheet_url
        self.sheet_name = sheet_name
        self.key_column = key_column
        self.value_column = value_column
        self.configuration_provider = configuration_provider
        self.service_account_file = service_account_file

        # Keys are always stored stringified; see _load_data / get_value.
        self._data: Dict[str, Any] = {}
        self._load_data()

    def _resolve_service_account_file(self) -> Optional[str]:
        """
        Resolve the service account file path.

        Priority:
        1. Argument passed to __init__
        2. 'SERVICE_ACCOUNT_FILE' from ConfigurationProvider
        3. 'SERVICE_ACCOUNT_FILE' environment variable

        :return: The first non-empty path found, or the (possibly None)
            environment value when neither of the first two sources has one.
        """
        if self.service_account_file:
            return self.service_account_file

        # Only stop here when the provider actually produced a value;
        # returning its result unconditionally made the environment
        # fallback below unreachable.
        configured = self.configuration_provider.get_value("SERVICE_ACCOUNT_FILE")
        if configured:
            return configured

        return os.getenv("SERVICE_ACCOUNT_FILE")

    def _load_data(self):
        """
        Connect to Google Sheets and preload the worksheet into ``self._data``.

        Rows whose key cell is missing or blank are skipped with a warning.
        When the same key appears in several rows, the last row wins.

        :raises Exception: Any error raised while connecting to or reading
            the sheet is logged and re-raised unchanged.
        """
        credentials_file = self._resolve_service_account_file()

        try:
            if credentials_file:
                logger.info(
                    "Connecting to Google Sheets using credentials: %s",
                    credentials_file,
                )
                gc = gspread.service_account(filename=credentials_file)
            else:
                # With no filename gspread falls back to its own default
                # service-account file location.
                logger.info(
                    "Connecting to Google Sheets using default environment credentials"
                )
                gc = gspread.service_account()

            sh = gc.open_by_url(self.spreadsheet_url)
            worksheet = sh.worksheet(self.sheet_name)

            records = worksheet.get_all_records()
            logger.info(
                "Fetched %d records from sheet '%s'", len(records), self.sheet_name
            )

            for i, record in enumerate(records):
                key = record.get(self.key_column)

                # get_all_records() reports blank cells as "" by default, so
                # a None check alone never catches an empty key cell.
                if key is None or (isinstance(key, str) and not key.strip()):
                    # i + 2: rows are 1-based and row 1 is the header.
                    logger.warning(
                        "Row %d: Key column '%s' is missing or empty. Skipping.",
                        i + 2,
                        self.key_column,
                    )
                    continue

                # Stringify the key so lookups behave the same whether the
                # sheet value was read as text or as a number.
                self._data[str(key)] = record.get(self.value_column)

            logger.info("Successfully loaded %d items into cache.", len(self._data))

        except Exception as e:
            logger.error("Failed to load data from Google Sheets: %s", e)
            raise

    def get_value(self, key: Any) -> Optional[Any]:
        """
        Look up a value by its key.

        :param key: The key to look up; it is stringified before the lookup,
            mirroring how keys are stored when the sheet is loaded.
        :return: The corresponding value, or None if not found.
        """
        return self._data.get(str(key))
@@ -42,6 +42,9 @@ class NewOrChangedUrlSource(UrlSource):
42
42
  how="left",
43
43
  suffixes=("", "_graphql"),
44
44
  )
45
+ merged_df["date_modified"] = pd.to_datetime(
46
+ merged_df["date_modified"], utc=True, errors="coerce"
47
+ )
45
48
  filtered_df = merged_df[
46
49
  self.overwrite
47
50
  | merged_df["date_imported"].isna()