wordlift-sdk 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. wordlift_sdk/__init__.py +3 -0
  2. wordlift_sdk/client/__init__.py +3 -0
  3. wordlift_sdk/client/client_configuration_factory.py +26 -0
  4. wordlift_sdk/configuration/__init__.py +4 -0
  5. wordlift_sdk/configuration/configuration_provider.py +44 -0
  6. wordlift_sdk/configuration/get_config_value.py +39 -0
  7. wordlift_sdk/container/__init__.py +3 -0
  8. wordlift_sdk/container/application_container.py +234 -0
  9. wordlift_sdk/deprecated/__init__.py +5 -0
  10. wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  11. wordlift_sdk/entity/__init__.py +4 -0
  12. wordlift_sdk/entity/enrich.py +54 -0
  13. wordlift_sdk/entity/patch.py +14 -0
  14. wordlift_sdk/google_search_console/__init__.py +5 -0
  15. wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  16. wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  17. wordlift_sdk/graph/graph_bag.py +7 -0
  18. wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  19. wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  20. wordlift_sdk/graphql/__init__.py +3 -0
  21. wordlift_sdk/graphql/client/__init__.py +5 -0
  22. wordlift_sdk/graphql/client/client.py +69 -0
  23. wordlift_sdk/graphql/client/factory.py +36 -0
  24. wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  25. wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  26. wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  27. wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  28. wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  29. wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  30. wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  31. wordlift_sdk/graphql/query.py +20 -0
  32. wordlift_sdk/graphql/utils/__init__.py +0 -0
  33. wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  34. wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  35. wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  36. wordlift_sdk/id_generator/__init__.py +3 -0
  37. wordlift_sdk/id_generator/id_generator.py +40 -0
  38. wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  39. wordlift_sdk/internal_link/__init__.py +3 -0
  40. wordlift_sdk/internal_link/utils.py +231 -0
  41. wordlift_sdk/kg/__init__.py +5 -0
  42. wordlift_sdk/kg/entity.py +17 -0
  43. wordlift_sdk/kg/entity_store.py +94 -0
  44. wordlift_sdk/kg/entity_store_factory.py +13 -0
  45. wordlift_sdk/kg/relation/__init__.py +0 -0
  46. wordlift_sdk/kg/relation/relation_service.py +78 -0
  47. wordlift_sdk/main.py +7 -0
  48. wordlift_sdk/namespace/SDO.py +3281 -0
  49. wordlift_sdk/namespace/__init__.py +3 -0
  50. wordlift_sdk/notebook/__init__.py +3 -0
  51. wordlift_sdk/notebook/install_if_missing.py +12 -0
  52. wordlift_sdk/protocol/__init__.py +5 -0
  53. wordlift_sdk/protocol/context.py +21 -0
  54. wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  55. wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  56. wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  57. wordlift_sdk/protocol/graph/__init__.py +3 -0
  58. wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  59. wordlift_sdk/protocol/load_override_class.py +30 -0
  60. wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  61. wordlift_sdk/url_source/__init__.py +6 -0
  62. wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  63. wordlift_sdk/url_source/list_url_source.py +28 -0
  64. wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  65. wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  66. wordlift_sdk/url_source/url_source.py +18 -0
  67. wordlift_sdk/url_source/url_source_input.py +6 -0
  68. wordlift_sdk/utils/__init__.py +17 -0
  69. wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  70. wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  71. wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  72. wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  73. wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  74. wordlift_sdk/utils/delayed.py +12 -0
  75. wordlift_sdk/utils/get_me.py +8 -0
  76. wordlift_sdk/utils/import_url.py +35 -0
  77. wordlift_sdk/wordlift/__init__.py +0 -0
  78. wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  79. wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  80. wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  81. wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  82. wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  83. wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  84. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  85. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  86. wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  87. wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  88. wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  89. wordlift_sdk/workflow/__init__.py +3 -0
  90. wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  91. wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  92. wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  93. wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  94. wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  95. wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  96. wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  97. wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
  98. wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
  99. wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
  100. wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,3 @@
1
+ from .main import run_kg_import_workflow
2
+
3
+ __all__ = ['run_kg_import_workflow']
@@ -0,0 +1,3 @@
1
+ from .client_configuration_factory import ClientConfigurationFactory
2
+
3
+ __all__ = ['ClientConfigurationFactory']
@@ -0,0 +1,26 @@
1
+ import wordlift_client
2
+
3
+
4
class ClientConfigurationFactory:
    """Builds a ``wordlift_client.Configuration`` bound to one API key."""

    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io"):
        self._key = key
        self._api_url = api_url

    def create(self):
        """Return a client configuration authenticated via the ``ApiKey`` scheme.

        The WordLift API expects an ``Authorization: Key <key>`` header;
        the prefix below makes the generated client emit exactly that.
        """
        config = wordlift_client.Configuration(
            host=self._api_url,
        )

        # Configure API key authorization: ApiKey
        config.api_key['ApiKey'] = self._key
        config.api_key_prefix['ApiKey'] = 'Key'

        return config
@@ -0,0 +1,4 @@
1
+ from .configuration_provider import ConfigurationProvider
2
+ from .get_config_value import get_config_value
3
+
4
+ __all__ = ['ConfigurationProvider', 'get_config_value']
@@ -0,0 +1,44 @@
1
+ import importlib.util
2
+ import os
3
+
4
+
5
class ConfigurationProvider:
    """Resolves configuration values from several sources in order.

    Lookup order in :meth:`get_value`: module globals, the values loaded
    from an optional Python config file, environment variables, Google
    Colab ``userdata`` secrets, and finally the supplied default.
    """

    # Public names loaded from the config file; empty when the file is absent.
    _config: dict

    @staticmethod
    def create(filepath: str = "config/default.py") -> "ConfigurationProvider":
        """Create a provider reading the conventional ``config/default.py``."""
        return ConfigurationProvider(filepath=filepath)

    def __init__(self, filepath: str):
        # A missing config file is not an error: fall back to an empty
        # mapping. (Previously the code set `{}` but then still tried to
        # load the file, crashing when it did not exist.)
        if not os.path.exists(filepath):
            self._config = {}
            return
        spec = importlib.util.spec_from_file_location("local_config", filepath)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Keep only the module's public names.
        self._config = {k: getattr(module, k) for k in dir(module) if not k.startswith("_")}

    def get_value(self, key: str, default=None):
        """Return the value for *key*, trying each source in order.

        Returns *default* when the key is found nowhere.
        """
        # 1. Check globals
        if key in globals():
            return globals()[key]

        # 2. Check the loaded config file
        if key in self._config:
            return self._config[key]

        # 3. Check environment variables
        if key in os.environ:
            return os.environ[key]

        # 4. Check Google Colab userdata
        try:
            from google.colab import userdata
            secret = userdata.get(key)
            if secret is not None:
                return secret
        except ImportError:
            pass  # Not running in Google Colab

        # 5. Return default if provided
        return default
@@ -0,0 +1,39 @@
1
+ import os
2
+ import importlib.util
3
+
4
+
5
def load_config_py(filepath="config.py"):
    """Load a Python config file and return its public names as a dict.

    Returns an empty dict when the file does not exist.
    """
    if not os.path.exists(filepath):
        return {}
    spec = importlib.util.spec_from_file_location("local_config", filepath)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return {k: getattr(module, k) for k in dir(module) if not k.startswith("_")}


def get_config_value(key, config_py_path=None, default=None):
    """Resolve *key* from globals, a config file, the environment or Colab.

    Returns *default* when the key is found in none of the sources.
    """
    # 1. Check globals
    if key in globals():
        return globals()[key]

    # 2. Check config.py. Passing `None` through to `load_config_py`
    # would crash in `os.path.exists`, so apply the conventional
    # "config.py" default here as well.
    config = load_config_py(config_py_path or "config.py")
    if key in config:
        return config[key]

    # 3. Check environment variables
    if key in os.environ:
        return os.environ[key]

    # 4. Check Google Colab userdata
    try:
        from google.colab import userdata
        secret = userdata.get(key)
        if secret is not None:
            return secret
    except ImportError:
        pass  # Not running in Google Colab

    # 5. Return default if provided
    return default
@@ -0,0 +1,3 @@
1
+ from .application_container import ApplicationContainer
2
+
3
+ __all__ = ['ApplicationContainer']
@@ -0,0 +1,234 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from os import cpu_count
4
+ from typing import Optional, Union
5
+
6
+ import gspread
7
+ from google.auth.credentials import Credentials
8
+ from gspread import Client
9
+ from wordlift_client import Configuration, AccountInfo
10
+
11
+ from ..client.client_configuration_factory import ClientConfigurationFactory
12
+ from ..configuration import ConfigurationProvider
13
+ from ..graphql.client import GraphQlClientFactory, GraphQlClient, GqlClientProvider
14
+ from ..id_generator import IdGenerator
15
+ from ..protocol import Context
16
+ from ..protocol.entity_patch import EntityPatchQueue
17
+ from ..protocol.graph import GraphQueue
18
+ from ..url_source import (
19
+ SitemapUrlSource,
20
+ GoogleSheetsUrlSource,
21
+ ListUrlSource,
22
+ UrlSource,
23
+ )
24
+ from ..url_source.new_or_changed_url_source import NewOrChangedUrlSource
25
+ from ..utils import get_me
26
+ from ..workflow.kg_import_workflow import KgImportWorkflow
27
+ from ..workflow.url_handler import WebPageImportUrlHandler
28
+ from ..workflow.url_handler.default_url_handler import DefaultUrlHandler
29
+ from ..workflow.url_handler.search_console_url_handler import SearchConsoleUrlHandler
30
+ from ..workflow.url_handler.url_handler import UrlHandler
31
+
32
+
33
@dataclass
class UrlSourceInput:
    """
    Input structure for the UrlProviderFactory.

    This class holds all possible parameters needed to create any of the supported URL providers.
    The factory will use these parameters to determine which provider to create based on availability.
    """

    # Location of the XML sitemap to crawl, plus an optional regex pattern
    # (as a string) used to filter the URLs found in it.
    sitemap_url: Optional[str] = None
    sitemap_url_pattern: Optional[str] = None
    # Google Sheets source: spreadsheet URL, worksheet name, and either
    # credentials or an already-authorized gspread client.
    sheets_url: Optional[str] = None
    sheets_name: Optional[str] = None
    sheets_creds_or_client: Optional[Union[Credentials, Client]] = None
    # Explicit list of URLs, used when neither sitemap nor sheet is configured.
    urls: Optional[list[str]] = None
+
49
+
50
class ApplicationContainer:
    """Dependency-injection container wiring the SDK services together.

    Builds (and caches, where appropriate) the API client configuration,
    the GraphQL client, the shared workflow ``Context`` and the URL
    sources used by the KG import workflow. Configuration values come
    from the injected :class:`ConfigurationProvider`.
    """

    _api_url: str
    _client_configuration: Configuration
    _configuration_provider: ConfigurationProvider
    _key: str

    # Lazily-created singletons.
    _context: Context | None = None
    _graphql_client: GraphQlClient | None = None

    def __init__(self, configuration_provider: ConfigurationProvider | None = None):
        self._configuration_provider = (
            configuration_provider or ConfigurationProvider.create()
        )
        self._api_url = self._configuration_provider.get_value(
            "API_URL", "https://api.wordlift.io"
        )
        self._key = self._configuration_provider.get_value("WORDLIFT_KEY")
        self._client_configuration = ClientConfigurationFactory(
            key=self._key,
            api_url=self._api_url,
        ).create()

    async def get_account(self) -> AccountInfo:
        """Return the account bound to the configured WordLift key."""
        return await get_me(configuration=self._client_configuration)

    async def get_context(self) -> Context:
        """Return the shared workflow ``Context``, creating it on first use."""
        if not self._context:
            account = await self.get_account()
            self._context = Context(
                account=account,
                client_configuration=self._client_configuration,
                configuration_provider=self._configuration_provider,
                id_generator=IdGenerator(account=account),
                graph_queue=GraphQueue(client_configuration=self._client_configuration),
                entity_patch_queue=EntityPatchQueue(
                    client_configuration=self._client_configuration
                ),
            )

        return self._context

    async def create_web_page_import_url_handler(self) -> WebPageImportUrlHandler:
        """Create the handler that imports web pages into the knowledge graph."""
        write_strategy = self._configuration_provider.get_value(
            "WEB_PAGE_IMPORT_WRITE_STRATEGY", "createOrUpdateModel"
        )
        return WebPageImportUrlHandler(
            context=await self.get_context(),
            embedding_properties=self._configuration_provider.get_value(
                "EMBEDDING_PROPERTIES",
                [
                    "http://schema.org/headline",
                    "http://schema.org/abstract",
                    "http://schema.org/text",
                ],
            ),
            web_page_types=self._configuration_provider.get_value(
                "WEB_PAGE_TYPES", ["http://schema.org/Article"]
            ),
            write_strategy=write_strategy,
        )

    async def create_search_console_url_handler(self):
        """Create the handler that imports Google Search Console analytics."""
        return SearchConsoleUrlHandler(
            context=await self.get_context(),
            graphql_client=await self.get_graphql_client(),
        )

    async def create_multi_url_handler(self):
        """Compose the enabled URL handlers into a single handler."""
        handlers: list[UrlHandler] = [
            await self.create_web_page_import_url_handler(),
        ]
        # Search Console import is opt-out: enabled unless the setting is
        # explicitly anything other than `True`.
        if (
            self._configuration_provider.get_value("GOOGLE_SEARCH_CONSOLE", True)
            is True
        ):
            handlers.append(await self.create_search_console_url_handler())

        return DefaultUrlHandler(url_handler_list=handlers)

    async def create_kg_import_workflow(self) -> KgImportWorkflow:
        """Assemble the full KG import workflow from its parts."""
        # `os.cpu_count()` may return None when the count cannot be
        # determined; fall back to 1 so `min` always receives an int.
        concurrency = self._configuration_provider.get_value(
            "CONCURRENCY", min(cpu_count() or 1, 4)
        )
        return KgImportWorkflow(
            context=await self.get_context(),
            url_source=await self.create_new_or_changed_source(),
            url_handler=await self.create_multi_url_handler(),
            concurrency=concurrency,
        )

    async def create_graphql_client_factory(self) -> GraphQlClientFactory:
        """Create the factory for GraphQL clients against `<api_url>/graphql`."""
        return GraphQlClientFactory(key=self._key, api_url=self._api_url + "/graphql")

    async def create_gql_client_provider(self) -> GqlClientProvider:
        """Create a provider yielding raw `gql` clients."""
        graphql_client_factory = await self.create_graphql_client_factory()
        return graphql_client_factory.create_provider()

    async def get_graphql_client(self) -> GraphQlClient:
        """Return the GraphQL client, creating it on first use."""
        if self._graphql_client is None:
            graphql_client_factory = await self.create_graphql_client_factory()
            self._graphql_client = graphql_client_factory.create()

        return self._graphql_client

    async def create_url_source(self) -> UrlSource:
        """Create a URL source from the configured parameters.

        Sources are tried in order: sitemap, Google Sheets, explicit URL
        list. Raises ``ValueError`` when none is fully configured.
        """
        # Try to read the configuration from the `config/default.py` file.
        sitemap_url = self._configuration_provider.get_value("SITEMAP_URL")
        sitemap_url_pattern = self._configuration_provider.get_value(
            "SITEMAP_URL_PATTERN", None
        )
        sheets_url = self._configuration_provider.get_value("SHEETS_URL")
        sheets_name = self._configuration_provider.get_value("SHEETS_NAME")
        sheets_service_account = self._configuration_provider.get_value(
            "SHEETS_SERVICE_ACCOUNT"
        )
        urls = self._configuration_provider.get_value("URLS")

        if (
            sitemap_url is None
            and urls is None
            and (
                sheets_url is None
                or sheets_name is None
                or sheets_service_account is None
            )
        ):
            raise ValueError(
                "One of `sitemap_url` or `sheets_url`/`sheets_name`/`sheets_service_account` is required."
            )

        input_params = UrlSourceInput(
            sitemap_url=sitemap_url,
            sitemap_url_pattern=sitemap_url_pattern,
            sheets_url=sheets_url,
            sheets_name=sheets_name,
            sheets_creds_or_client=(
                gspread.service_account(filename=sheets_service_account)
                if sheets_service_account
                else None
            ),
            urls=urls,
        )

        # Try to create a SitemapUrlProvider if sitemap_url is provided
        if input_params.sitemap_url:
            return SitemapUrlSource(
                input_params.sitemap_url,
                re.compile(input_params.sitemap_url_pattern)
                if input_params.sitemap_url_pattern
                else None,
            )

        # Try to create a GoogleSheetsUrlProvider if all required sheets parameters are provided
        if (
            input_params.sheets_url
            and input_params.sheets_name
            and input_params.sheets_creds_or_client
        ):
            return GoogleSheetsUrlSource(
                input_params.sheets_creds_or_client,
                input_params.sheets_url,
                input_params.sheets_name,
            )

        # Try to create a ListUrlProvider if urls is provided
        if input_params.urls:
            return ListUrlSource(input_params.urls)

        # If we get here, none of the required parameters were provided
        raise ValueError(
            "No valid parameters provided to create a URL provider. "
            "Please provide either sitemap_url, all sheets parameters "
            "(sheets_url, sheets_name, sheets_creds_or_client), or urls."
        )

    async def create_new_or_changed_source(self) -> UrlSource:
        """Wrap the configured source so unchanged URLs are skipped unless OVERWRITE is set."""
        overwrite = self._configuration_provider.get_value("OVERWRITE", False)
        return NewOrChangedUrlSource(
            url_provider=await self.create_url_source(),
            graphql_client=await self.get_graphql_client(),
            overwrite=overwrite,
        )

    async def create_url_source_with_overwrite(self) -> UrlSource:
        """Backward-compatible alias for :meth:`create_new_or_changed_source`."""
        return await self.create_new_or_changed_source()
@@ -0,0 +1,5 @@
1
+ from .create_entities_with_top_query_dataframe import (
2
+ create_entities_with_top_query_dataframe,
3
+ )
4
+
5
+ __all__ = ["create_entities_with_top_query_dataframe"]
@@ -0,0 +1,30 @@
1
+ import logging
2
+
3
+ import pandas as pd
4
+ from pandas import DataFrame
5
+ from tqdm.asyncio import tqdm
6
+
7
+ from ..graphql.utils.query import entity_with_top_query_factory
8
+ from ..utils import create_delayed
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
async def create_entities_with_top_query_dataframe(
    key: str, url_list: list[str]
) -> DataFrame:
    """Build one DataFrame with top-query data for every URL in *url_list*.

    Each URL is queried through a `create_delayed`-wrapped callable
    (presumably limiting concurrency to 4 — see `create_delayed`), with a
    tqdm progress bar. URLs that yield no result are skipped.
    """
    logger.info("Loading entities with top query...")

    query_fn = await entity_with_top_query_factory(key)
    throttled = create_delayed(query_fn, 4)

    tasks = [throttled(url) for url in url_list]
    results = await tqdm.gather(*tasks, total=len(url_list))

    # Convert the non-empty results to frames and stack them with a
    # fresh, contiguous index.
    frames = []
    for result in results:
        if result is None:
            continue
        frames.append(result.to_dataframe())

    return pd.concat(frames, ignore_index=True)
@@ -0,0 +1,4 @@
1
+ from .enrich import enrich, ParseHtmlCallback
2
+ from .patch import patch
3
+
4
+ __all__ = ['enrich', 'ParseHtmlCallback', 'patch']
@@ -0,0 +1,54 @@
1
+ import logging
2
+ from typing import Callable, Awaitable, Coroutine
3
+
4
+ from aiohttp import ClientSession
5
+ from pandas import Series
6
+ from tenacity import retry, stop_after_attempt, wait_fixed
7
+ from wordlift_client import EntityPatchRequest, Configuration, WebPagesApi
8
+ import wordlift_client
9
+
10
+ from .patch import patch
11
+ from ..wordlift.sitemap_import.protocol.parse_html_protocol_interface import ParseHtmlInput
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Signature of the callback that turns fetched page HTML into a list of
# entity patch requests.
ParseHtmlCallback = Callable[[ParseHtmlInput], Awaitable[list[EntityPatchRequest]]]

# Browser-like HTTP request headers. NOTE(review): not referenced by
# `enrich` in this module's visible code — presumably used by callers
# fetching pages directly; confirm before removing.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
26
+
27
+
28
def enrich(configuration: Configuration, callback: ParseHtmlCallback) -> Callable[
    [Series], Coroutine[None, None, None]]:
    """Return a coroutine that enriches one entity (a DataFrame row).

    The returned coroutine fetches the web page at ``row['url']``, hands
    the HTML to *callback* to produce patch payloads, and applies them to
    the entity identified by ``row['iri']``. Failures are logged and
    swallowed so one bad URL does not abort a batch; the whole operation
    is retried up to 5 times with a 2 second pause between attempts.
    """
    @retry(
        stop=stop_after_attempt(5),  # Retry up to 5 times
        wait=wait_fixed(2)  # Wait 2 seconds between retries
    )
    async def process(row: Series) -> None:
        entity_url = row['url']
        entity_id = row['iri']

        async with wordlift_client.ApiClient(configuration) as api_client:
            try:
                api_instance = WebPagesApi(api_client=api_client)
                web_page = await api_instance.get_web_page(entity_url)
                html = web_page.html
                parse_html_input = ParseHtmlInput(
                    entity_id=entity_id,
                    entity_url=entity_url,
                    html=html,
                    row=row
                )
                payloads = await callback(parse_html_input)
                await patch(configuration, entity_id, payloads)
            except Exception as e:
                # Best-effort: log and continue with the next entity.
                # Use lazy %-style arguments so formatting only happens
                # when the record is actually emitted.
                logger.error(
                    "Error %s occurred while processing entity %s with url %s",
                    e, entity_id, entity_url,
                )

    return process
@@ -0,0 +1,14 @@
1
+ from typing import List, Optional
2
+
3
+ import wordlift_client
4
+ from wordlift_client import Configuration, EntityPatchRequest
5
+
6
+
7
async def patch(configuration: Configuration, entity_id: str, payloads: List[EntityPatchRequest]) -> Optional[str]:
    """Apply *payloads* as patch requests to the entity *entity_id*.

    Returns the API response, or ``None`` when there is nothing to send.
    """
    if not payloads:
        # Nothing to patch: skip the network round-trip entirely.
        return None

    async with wordlift_client.ApiClient(configuration) as client:
        entities_api = wordlift_client.EntitiesApi(client)
        return await entities_api.patch_entities(entity_id, payloads)
@@ -0,0 +1,5 @@
1
+ from .create_google_search_console_data_import import create_google_search_console_data_import, import_url_analytics_factory
2
+ from .raise_error_if_account_analytics_not_configured import raise_error_if_account_analytics_not_configured
3
+
4
+ __all__ = ['create_google_search_console_data_import', 'import_url_analytics_factory',
5
+ 'raise_error_if_account_analytics_not_configured']
@@ -0,0 +1,59 @@
1
import logging
from datetime import datetime, timedelta
from typing import Awaitable, Callable

import wordlift_client
from pandas import Series
from tenacity import retry, stop_after_attempt, wait_fixed
from tqdm.asyncio import tqdm
# NOTE: `Configuration` was previously imported from
# `twisted.mail.scripts.mailmail` — clearly an accidental auto-import.
# The API client below requires `wordlift_client.Configuration`, which is
# what the rest of the package uses.
from wordlift_client import AnalyticsImportRequest, Configuration

from ..deprecated import create_entities_with_top_query_dataframe
from ..utils import create_delayed

logger = logging.getLogger(__name__)


async def create_google_search_console_data_import(
    configuration: Configuration, key: str, url_list: list[str]
) -> None:
    """Import Google Search Console analytics for URLs with stale data.

    Looks up the top-query data for every URL in *url_list* and requests
    a fresh analytics import for entities whose data is missing or older
    than seven days.
    """
    # Get the entities data with the top query.
    entities_with_top_query_df = await create_entities_with_top_query_dataframe(
        key=key, url_list=url_list
    )

    # Calculate the date 7 days ago from today
    seven_days_ago = datetime.now() - timedelta(days=7)

    # Keep rows whose analytics are missing or older than a week.
    entities_with_stale_data_df = entities_with_top_query_df[
        entities_with_top_query_df["top_query_date_created"].isna()
        | (entities_with_top_query_df["top_query_date_created"] < seven_days_ago)
    ]

    import_url_analytics = await import_url_analytics_factory(
        configuration=configuration
    )
    if len(entities_with_stale_data_df) > 0:
        logger.info("Updating missing or stale Google Search Console data...")
        # We're polite and not making more than 2 concurrent reqs.
        delayed = create_delayed(import_url_analytics, 2)
        await tqdm.gather(
            *[delayed(row) for index, row in entities_with_stale_data_df.iterrows()],
            total=len(entities_with_stale_data_df),
        )


async def import_url_analytics_factory(
    configuration: Configuration,
) -> Callable[[Series], Awaitable[None]]:
    """Return a retrying coroutine importing analytics for one row's URL."""

    @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
    async def import_url_analytics(row: Series) -> None:
        url = row["url"]
        async with wordlift_client.ApiClient(configuration) as api_client:
            api_instance = wordlift_client.AnalyticsImportsApi(api_client)
            request = AnalyticsImportRequest(urls=[url])
            await api_instance.create_analytics_import(request)

    return import_url_analytics
@@ -0,0 +1,20 @@
1
+ from pycountry import countries
2
+ from wordlift_client import AccountInfo
3
+
4
+
5
async def raise_error_if_account_analytics_not_configured(account: AccountInfo) -> bool:
    """Validate that *account* is ready for Search Console analytics.

    Raises ``ValueError`` when the account is not connected to Google
    Search Console, has no country code, or has an invalid country code.
    Returns ``True`` when all checks pass.
    """
    dataset_uri = account.dataset_uri

    if account.google_search_console_site_url is None:
        raise ValueError(
            "%s is not connected to Google Search Console, open https://my.wordlift.io to connect it." % dataset_uri)

    if account.country_code is None:
        raise ValueError(
            "%s country code not configured, open https://my.wordlift.io to configure it." % dataset_uri)

    # Get the country name
    code = account.country_code.upper()
    if countries.get(alpha_2=code) is None:
        raise ValueError(
            "Country code %s is invalid, open https://my.wordlift.io to reconfigure it." % code)

    return True
@@ -0,0 +1,7 @@
1
+ from dataclasses import dataclass
2
+ from rdflib import Graph
3
+
4
+
5
@dataclass
class GraphBag:
    """Simple value object wrapping an rdflib ``Graph``."""

    # The RDF graph being carried.
    graph: Graph
@@ -0,0 +1,3 @@
1
+ from .ttl_liquid_graph_factory import TtlLiquidGraphFactory
2
+
3
+ __all__ = ['TtlLiquidGraphFactory']
@@ -0,0 +1,43 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from liquid import Environment, CachingFileSystemLoader
5
+ from rdflib import Graph
6
+
7
+ from ...protocol import Context
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class TtlLiquidGraphFactory:
    """Renders ``*.ttl.liquid`` templates into RDF graphs and queues them.

    Every template found under ``path`` (recursively) is rendered with
    the current account's data, parsed as Turtle, and put on the
    context's graph queue.
    """

    path: Path
    context: Context

    def __init__(self, context: Context, path: Path):
        self.context = context
        self.path = path

    async def graphs(self) -> None:
        """Render, parse and enqueue every Turtle Liquid template."""
        template_paths = list(self.path.rglob("*.ttl.liquid"))
        env = Environment(
            loader=CachingFileSystemLoader(self.path),
        )

        # Use a distinct name for the rendered template: the original
        # code rebound the path variable to the Template object, so the
        # success log printed the Template repr instead of the file.
        for template_path in template_paths:
            template = env.get_template(str(template_path.absolute()))
            turtle = template.render(account=self.context.account.__dict__)

            try:
                # Create a new RDF graph
                graph = Graph()

                # Parse the Turtle data into the graph
                graph.parse(data=turtle, format="turtle")

                logger.info(
                    f"Successfully loaded {template_path} graph with {len(graph)} triples"
                )

                await self.context.graph_queue.put(graph)
            except Exception as e:
                # A single bad template must not abort the batch. (The
                # previous message said "contact points graph" for every
                # template — a copy-paste artifact.)
                logger.error(f"Error loading graph from {template_path}: {e}")
@@ -0,0 +1,3 @@
1
+ from .query import query
2
+
3
+ __all__ = ['query']
@@ -0,0 +1,5 @@
1
+ from .client import GraphQlClient
2
+ from .factory import GraphQlClientFactory
3
+ from .gql_client_provider import GqlClientProvider
4
+
5
+ __all__ = ["GraphQlClient", "GraphQlClientFactory", "GqlClientProvider"]