wordlift-sdk 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +3 -0
- wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk/graphql/client/client.py +69 -0
- wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk/main.py +7 -0
- wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
- wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
- wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
- wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
wordlift_sdk/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import wordlift_client
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ClientConfigurationFactory:
    """Builds a ``wordlift_client.Configuration`` for a given key and host."""

    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io"):
        self._key = key
        self._api_url = api_url

    def create(self):
        """Return a client configuration with ApiKey authorization set up."""
        # Point the generated client at the configured host.
        configuration = wordlift_client.Configuration(host=self._api_url)

        # The client must configure the authentication and authorization
        # parameters in accordance with the API server security policy.
        # Here we use API key authorization (scheme name: ApiKey); the server
        # expects the `Key <value>` form, hence the prefix below.
        configuration.api_key['ApiKey'] = self._key
        configuration.api_key_prefix['ApiKey'] = 'Key'

        return configuration
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ConfigurationProvider:
    """Layered configuration lookup.

    Values are resolved, in order, from: module globals, a Python
    configuration file, environment variables, and Google Colab secrets.
    """

    # Public attributes loaded from the configuration file
    # (empty when the file is missing).
    _config: dict

    @staticmethod
    def create(filepath: str = "config/default.py") -> "ConfigurationProvider":
        """Create a provider backed by ``config/default.py`` (or a custom path)."""
        return ConfigurationProvider(filepath=filepath)

    def __init__(self, filepath: str):
        # When the configuration file does not exist, fall back to an empty
        # configuration instead of failing. The early `return` is essential:
        # without it we would still try to load the missing file below.
        if not os.path.exists(filepath):
            self._config = {}
            return

        # Load the file as a throwaway module and keep its public attributes.
        spec = importlib.util.spec_from_file_location("local_config", filepath)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        self._config = {
            k: getattr(module, k) for k in dir(module) if not k.startswith("_")
        }

    def get_value(self, key: str, default=None):
        """Return the value for `key` from the first source that defines it.

        Lookup order: module globals, the configuration file, environment
        variables, Google Colab `userdata`; otherwise `default`.
        """
        # 1. Check globals
        if key in globals():
            return globals()[key]

        # 2. Check the configuration file values
        if key in self._config:
            return self._config[key]

        # 3. Check environment variables
        if key in os.environ:
            return os.environ[key]

        # 4. Check Google Colab userdata
        try:
            from google.colab import userdata

            secret = userdata.get(key)
            if secret is not None:
                return secret
        except ImportError:
            pass  # Not running in Google Colab

        # 5. Return default if provided
        return default
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import importlib.util
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def load_config_py(filepath="config.py"):
    """Load `filepath` as a Python module and return its public attributes.

    Returns an empty dict when the file does not exist.
    """
    if not os.path.exists(filepath):
        return {}
    spec = importlib.util.spec_from_file_location("local_config", filepath)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return {k: getattr(module, k) for k in dir(module) if not k.startswith("_")}


def get_config_value(key, config_py_path=None, default=None):
    """Return the value for `key` from the first source that defines it.

    Lookup order: module globals, the configuration file, environment
    variables, Google Colab `userdata`; otherwise `default`.
    """
    # 1. Check globals
    if key in globals():
        return globals()[key]

    # 2. Check config.py. When no explicit path is given, let load_config_py
    # use its own default; passing None through would make os.path.exists
    # raise a TypeError.
    config = load_config_py(config_py_path) if config_py_path else load_config_py()
    if key in config:
        return config[key]

    # 3. Check environment variables
    if key in os.environ:
        return os.environ[key]

    # 4. Check Google Colab userdata
    try:
        from google.colab import userdata

        secret = userdata.get(key)
        if secret is not None:
            return secret
    except ImportError:
        pass  # Not running in Google Colab

    # 5. Return default if provided
    return default
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from os import cpu_count
|
|
4
|
+
from typing import Optional, Union
|
|
5
|
+
|
|
6
|
+
import gspread
|
|
7
|
+
from google.auth.credentials import Credentials
|
|
8
|
+
from gspread import Client
|
|
9
|
+
from wordlift_client import Configuration, AccountInfo
|
|
10
|
+
|
|
11
|
+
from ..client.client_configuration_factory import ClientConfigurationFactory
|
|
12
|
+
from ..configuration import ConfigurationProvider
|
|
13
|
+
from ..graphql.client import GraphQlClientFactory, GraphQlClient, GqlClientProvider
|
|
14
|
+
from ..id_generator import IdGenerator
|
|
15
|
+
from ..protocol import Context
|
|
16
|
+
from ..protocol.entity_patch import EntityPatchQueue
|
|
17
|
+
from ..protocol.graph import GraphQueue
|
|
18
|
+
from ..url_source import (
|
|
19
|
+
SitemapUrlSource,
|
|
20
|
+
GoogleSheetsUrlSource,
|
|
21
|
+
ListUrlSource,
|
|
22
|
+
UrlSource,
|
|
23
|
+
)
|
|
24
|
+
from ..url_source.new_or_changed_url_source import NewOrChangedUrlSource
|
|
25
|
+
from ..utils import get_me
|
|
26
|
+
from ..workflow.kg_import_workflow import KgImportWorkflow
|
|
27
|
+
from ..workflow.url_handler import WebPageImportUrlHandler
|
|
28
|
+
from ..workflow.url_handler.default_url_handler import DefaultUrlHandler
|
|
29
|
+
from ..workflow.url_handler.search_console_url_handler import SearchConsoleUrlHandler
|
|
30
|
+
from ..workflow.url_handler.url_handler import UrlHandler
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class UrlSourceInput:
    """
    Input structure for the UrlProviderFactory.

    This class holds all possible parameters needed to create any of the supported URL providers.
    The factory will use these parameters to determine which provider to create based on availability.
    Only one group needs to be set: `sitemap_url` (optionally with
    `sitemap_url_pattern`), the `sheets_*` trio, or `urls`.
    """

    # URL of an XML sitemap to crawl for page URLs.
    sitemap_url: Optional[str] = None
    # Optional regex source used to filter the sitemap URLs
    # (compiled with `re.compile` by the factory).
    sitemap_url_pattern: Optional[str] = None
    # Google Sheets spreadsheet URL holding the list of page URLs.
    sheets_url: Optional[str] = None
    # Worksheet name inside the spreadsheet.
    sheets_name: Optional[str] = None
    # Either Google credentials or an already-authorized gspread Client.
    sheets_creds_or_client: Optional[Union[Credentials, Client]] = None
    # Explicit list of page URLs.
    urls: Optional[list[str]] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ApplicationContainer:
    """Composition root wiring together the SDK services.

    Reads `API_URL` and `WORDLIFT_KEY` from the configuration provider and
    lazily builds the shared `Context` and GraphQL client used by the factory
    methods below.
    """

    _api_url: str
    _client_configuration: Configuration
    _configuration_provider: ConfigurationProvider
    _key: str

    # Lazily created singletons, see `get_context` and `get_graphql_client`.
    _context: Context | None = None
    _graphql_client: GraphQlClient | None = None

    def __init__(self, configuration_provider: ConfigurationProvider | None = None):
        self._configuration_provider = (
            configuration_provider or ConfigurationProvider.create()
        )
        self._api_url = self._configuration_provider.get_value(
            "API_URL", "https://api.wordlift.io"
        )
        self._key = self._configuration_provider.get_value("WORDLIFT_KEY")
        self._client_configuration = ClientConfigurationFactory(
            key=self._key,
            api_url=self._api_url,
        ).create()

    async def get_account(self) -> AccountInfo:
        """Return the account linked to the configured WordLift key."""
        return await get_me(configuration=self._client_configuration)

    async def get_context(self) -> Context:
        """Return the shared `Context`, creating it on first use."""
        if not self._context:
            account = await self.get_account()
            self._context = Context(
                account=account,
                client_configuration=self._client_configuration,
                configuration_provider=self._configuration_provider,
                id_generator=IdGenerator(account=account),
                graph_queue=GraphQueue(client_configuration=self._client_configuration),
                entity_patch_queue=EntityPatchQueue(
                    client_configuration=self._client_configuration
                ),
            )

        return self._context

    async def create_web_page_import_url_handler(self) -> WebPageImportUrlHandler:
        """Create the web page import handler.

        Configured via `WEB_PAGE_IMPORT_WRITE_STRATEGY`, `EMBEDDING_PROPERTIES`
        and `WEB_PAGE_TYPES` (with schema.org-based defaults).
        """
        write_strategy = self._configuration_provider.get_value(
            "WEB_PAGE_IMPORT_WRITE_STRATEGY", "createOrUpdateModel"
        )
        return WebPageImportUrlHandler(
            context=await self.get_context(),
            embedding_properties=self._configuration_provider.get_value(
                "EMBEDDING_PROPERTIES",
                [
                    "http://schema.org/headline",
                    "http://schema.org/abstract",
                    "http://schema.org/text",
                ],
            ),
            web_page_types=self._configuration_provider.get_value(
                "WEB_PAGE_TYPES", ["http://schema.org/Article"]
            ),
            write_strategy=write_strategy,
        )

    async def create_search_console_url_handler(self):
        """Create the handler backed by Google Search Console data."""
        return SearchConsoleUrlHandler(
            context=await self.get_context(),
            graphql_client=await self.get_graphql_client(),
        )

    async def create_multi_url_handler(self):
        """Create the composite URL handler.

        The Search Console handler is included unless `GOOGLE_SEARCH_CONSOLE`
        is set to something other than True.
        """
        handlers: list[UrlHandler] = [
            await self.create_web_page_import_url_handler(),
        ]
        if (
            self._configuration_provider.get_value("GOOGLE_SEARCH_CONSOLE", True)
            is True
        ):
            handlers.append(await self.create_search_console_url_handler())

        return DefaultUrlHandler(url_handler_list=handlers)

    async def create_kg_import_workflow(self) -> KgImportWorkflow:
        """Create the KG import workflow.

        `CONCURRENCY` defaults to the CPU count capped at 4; `cpu_count()` may
        return None, hence the `or 1` fallback.
        """
        concurrency = self._configuration_provider.get_value(
            "CONCURRENCY", min(cpu_count() or 1, 4)
        )
        return KgImportWorkflow(
            context=await self.get_context(),
            url_source=await self.create_new_or_changed_source(),
            url_handler=await self.create_multi_url_handler(),
            concurrency=concurrency,
        )

    async def create_graphql_client_factory(self) -> GraphQlClientFactory:
        """Create a GraphQL client factory pointing at `<api_url>/graphql`."""
        return GraphQlClientFactory(key=self._key, api_url=self._api_url + "/graphql")

    async def create_gql_client_provider(self) -> GqlClientProvider:
        """Create a provider of raw `gql` clients."""
        graphql_client_factory = await self.create_graphql_client_factory()
        return graphql_client_factory.create_provider()

    async def get_graphql_client(self) -> GraphQlClient:
        """Return the shared GraphQL client, creating it on first use."""
        if self._graphql_client is None:
            graphql_client_factory = await self.create_graphql_client_factory()
            self._graphql_client = graphql_client_factory.create()

        return self._graphql_client

    async def create_url_source(self) -> UrlSource:
        """Create a URL source from configuration.

        Precedence: sitemap, then Google Sheets (URL + worksheet + service
        account file), then an explicit `URLS` list. Raises ValueError when no
        complete group of settings is available.
        """
        # Try to read the configuration from the `config/default.py` file.
        sitemap_url = self._configuration_provider.get_value("SITEMAP_URL")
        sitemap_url_pattern = self._configuration_provider.get_value(
            "SITEMAP_URL_PATTERN", None
        )
        sheets_url = self._configuration_provider.get_value("SHEETS_URL")
        sheets_name = self._configuration_provider.get_value("SHEETS_NAME")
        sheets_service_account = self._configuration_provider.get_value(
            "SHEETS_SERVICE_ACCOUNT"
        )
        urls = self._configuration_provider.get_value("URLS")

        if (
            sitemap_url is None
            and urls is None
            and (
                sheets_url is None
                or sheets_name is None
                or sheets_service_account is None
            )
        ):
            raise ValueError(
                "One of `sitemap_url` or `sheets_url`/`sheets_name`/`sheets_service_account` is required."
            )

        input_params = UrlSourceInput(
            sitemap_url=sitemap_url,
            sitemap_url_pattern=sitemap_url_pattern,
            sheets_url=sheets_url,
            sheets_name=sheets_name,
            sheets_creds_or_client=(
                gspread.service_account(filename=sheets_service_account)
                if sheets_service_account
                else None
            ),
            urls=urls,
        )

        # Try to create a SitemapUrlProvider if sitemap_url is provided
        if input_params.sitemap_url:
            return SitemapUrlSource(
                input_params.sitemap_url,
                re.compile(input_params.sitemap_url_pattern)
                if input_params.sitemap_url_pattern
                else None,
            )

        # Try to create a GoogleSheetsUrlProvider if all required sheets parameters are provided
        if (
            input_params.sheets_url
            and input_params.sheets_name
            and input_params.sheets_creds_or_client
        ):
            return GoogleSheetsUrlSource(
                input_params.sheets_creds_or_client,
                input_params.sheets_url,
                input_params.sheets_name,
            )

        # Try to create a ListUrlProvider if urls is provided
        if input_params.urls:
            return ListUrlSource(input_params.urls)

        # If we get here, none of the required parameters were provided
        raise ValueError(
            "No valid parameters provided to create a URL provider. "
            "Please provide either sitemap_url, all sheets parameters "
            "(sheets_url, sheets_name, sheets_creds_or_client), or urls."
        )

    async def create_new_or_changed_source(self) -> UrlSource:
        """Wrap the configured URL source so only new or changed URLs pass
        through, unless `OVERWRITE` is set."""
        overwrite = self._configuration_provider.get_value("OVERWRITE", False)
        return NewOrChangedUrlSource(
            url_provider=await self.create_url_source(),
            graphql_client=await self.get_graphql_client(),
            overwrite=overwrite,
        )

    async def create_url_source_with_overwrite(self) -> UrlSource:
        """Delegate to `create_new_or_changed_source` (kept for callers using
        the older name)."""
        return await self.create_new_or_changed_source()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas import DataFrame
|
|
5
|
+
from tqdm.asyncio import tqdm
|
|
6
|
+
|
|
7
|
+
from ..graphql.utils.query import entity_with_top_query_factory
|
|
8
|
+
from ..utils import create_delayed
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
async def create_entities_with_top_query_dataframe(
    key: str, url_list: list[str]
) -> DataFrame:
    """Build a DataFrame of entities together with their top query data.

    Queries the entity-with-top-query GraphQL endpoint for each URL (at most
    4 concurrent requests) and concatenates the per-URL results.

    :param key: WordLift key used to create the query function.
    :param url_list: URLs to look up.
    :return: the concatenated DataFrame; empty when no URL produced data.
    """
    # Get the entities data with the top query.
    logger.info("Loading entities with top query...")
    entity_with_top_query = await entity_with_top_query_factory(key)
    # Throttle to at most 4 concurrent requests.
    delayed = create_delayed(entity_with_top_query, 4)
    entities_with_top_query = await tqdm.gather(
        *[delayed(url) for url in url_list], total=len(url_list)
    )

    # Get a list of dataframes, skipping URLs that returned no data.
    dataframes = [
        obj.to_dataframe() for obj in entities_with_top_query if obj is not None
    ]

    # `pd.concat` raises ValueError on an empty sequence (empty `url_list` or
    # all-None results): return an empty frame instead.
    if not dataframes:
        return DataFrame()

    # Concat them together, with a new index.
    return pd.concat(dataframes, ignore_index=True)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Callable, Awaitable, Coroutine
|
|
3
|
+
|
|
4
|
+
from aiohttp import ClientSession
|
|
5
|
+
from pandas import Series
|
|
6
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
7
|
+
from wordlift_client import EntityPatchRequest, Configuration, WebPagesApi
|
|
8
|
+
import wordlift_client
|
|
9
|
+
|
|
10
|
+
from .patch import patch
|
|
11
|
+
from ..wordlift.sitemap_import.protocol.parse_html_protocol_interface import ParseHtmlInput
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Async callback that turns a fetched page (ParseHtmlInput) into a list of
# entity patch requests to apply.
ParseHtmlCallback = Callable[[ParseHtmlInput], Awaitable[list[EntityPatchRequest]]]

# Browser-like request headers.
# NOTE(review): not referenced by `enrich` in this module — presumably used by
# importers of this module; confirm before removing.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def enrich(configuration: Configuration, callback: ParseHtmlCallback) -> Callable[
    [Series], Coroutine[None, None, None]]:
    """Build a coroutine that enriches a single entity row.

    The returned `process` coroutine fetches the stored web page for the row's
    `url`, hands the HTML to `callback` to produce patch requests, and applies
    them to the entity identified by the row's `iri`.

    :param configuration: WordLift client configuration.
    :param callback: async callback turning a `ParseHtmlInput` into a list of
        `EntityPatchRequest` objects.
    :return: an async function accepting a pandas `Series` with `url`/`iri`.
    """
    @retry(
        stop=stop_after_attempt(5),  # Retry up to 5 times
        wait=wait_fixed(2)  # Wait 2 seconds between retries
    )
    async def process(row: Series) -> None:
        # NOTE(review): the broad `except` below swallows every exception, so
        # the @retry decorator above can never observe a failure — confirm
        # whether retries are actually expected to trigger here.
        entity_url = row['url']
        entity_id = row['iri']

        async with wordlift_client.ApiClient(configuration) as api_client:
            try:
                # Fetch the previously imported web page for this URL.
                api_instance = WebPagesApi(api_client=api_client)
                web_page = await api_instance.get_web_page(entity_url)
                html = web_page.html
                parse_html_input = ParseHtmlInput(
                    entity_id=entity_id,
                    entity_url=entity_url,
                    html=html,
                    row=row
                )
                # Delegate HTML parsing to the caller-provided callback and
                # apply whatever patch requests it produced.
                payloads = await callback(parse_html_input)
                await patch(configuration, entity_id, payloads)
            except Exception as e:
                logger.error("Error %s occurred while processing entity %s with url %s" % (e, entity_id, entity_url))

    return process
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
import wordlift_client
|
|
4
|
+
from wordlift_client import Configuration, EntityPatchRequest
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
async def patch(configuration: Configuration, entity_id: str, payloads: List[EntityPatchRequest]) -> Optional[str]:
    """Apply the given patch requests to an entity.

    :param configuration: WordLift client configuration.
    :param entity_id: the id (iri) of the entity to patch.
    :param payloads: the patch requests to apply.
    :return: the API response, or None when there is nothing to patch.
    """
    # Nothing to send: exit early.
    if not payloads:
        return None

    async with wordlift_client.ApiClient(configuration) as api_client:
        entities_api = wordlift_client.EntitiesApi(api_client)
        return await entities_api.patch_entities(entity_id, payloads)
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
from .create_google_search_console_data_import import create_google_search_console_data_import, import_url_analytics_factory
|
|
2
|
+
from .raise_error_if_account_analytics_not_configured import raise_error_if_account_analytics_not_configured
|
|
3
|
+
|
|
4
|
+
__all__ = ['create_google_search_console_data_import', 'import_url_analytics_factory',
|
|
5
|
+
'raise_error_if_account_analytics_not_configured']
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime, timedelta
|
|
3
|
+
from typing import Callable, Awaitable
|
|
4
|
+
|
|
5
|
+
import wordlift_client
|
|
6
|
+
from pandas import Series
|
|
7
|
+
from tenacity import retry, wait_fixed, stop_after_attempt
|
|
8
|
+
from tqdm.asyncio import tqdm
|
|
9
|
+
from twisted.mail.scripts.mailmail import Configuration
|
|
10
|
+
from wordlift_client import AnalyticsImportRequest
|
|
11
|
+
|
|
12
|
+
from ..deprecated import create_entities_with_top_query_dataframe
|
|
13
|
+
from ..utils import create_delayed
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
async def create_google_search_console_data_import(
    configuration: Configuration, key: str, url_list: list[str]
) -> None:
    """Refresh Google Search Console analytics for stale or missing URLs.

    Loads the entities-with-top-query DataFrame, selects rows whose
    `top_query_date_created` is missing or older than seven days, and triggers
    an analytics import for each (at most 2 concurrent requests).

    NOTE(review): this module imports `Configuration` from
    `twisted.mail.scripts.mailmail` — almost certainly a mistaken auto-import;
    it should presumably be `wordlift_client.Configuration`. Confirm and fix
    the import at the top of the file.

    :param configuration: client configuration passed to the analytics API.
    :param key: WordLift key used to query entities with their top query.
    :param url_list: URLs whose analytics should be considered.
    """
    # Get the entities data with the top query.
    entities_with_top_query_df = await create_entities_with_top_query_dataframe(
        key=key, url_list=url_list
    )

    # Calculate the date 7 days ago from today
    seven_days_ago = datetime.now() - timedelta(days=7)

    # Filter the DataFrame: rows with no top-query data, or data older than
    # seven days, are considered stale.
    entities_with_stale_data_df = entities_with_top_query_df[
        entities_with_top_query_df["top_query_date_created"].isna()
        | (entities_with_top_query_df["top_query_date_created"] < seven_days_ago)
    ]

    import_url_analytics = await import_url_analytics_factory(
        configuration=configuration
    )
    if len(entities_with_stale_data_df) > 0:
        logger.info("Updating missing or stale Google Search Console data...")
        # We're polite and not making more than 2 concurrent reqs.
        delayed = create_delayed(import_url_analytics, 2)
        await tqdm.gather(
            *[delayed(row) for index, row in entities_with_stale_data_df.iterrows()],
            total=len(entities_with_stale_data_df),
        )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def import_url_analytics_factory(
    configuration: Configuration,
) -> Callable[[Series], Awaitable[None]]:
    """Build a coroutine that imports analytics for the URL in a row.

    :param configuration: WordLift client configuration.
    :return: an async function taking a pandas `Series` with a `url` value.
    """

    # Retried up to 5 times with a fixed 2-second pause between attempts.
    @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
    async def import_url_analytics(row: Series) -> None:
        target_url = row["url"]
        async with wordlift_client.ApiClient(configuration) as api_client:
            analytics_api = wordlift_client.AnalyticsImportsApi(api_client)
            await analytics_api.create_analytics_import(
                AnalyticsImportRequest(urls=[target_url])
            )

    return import_url_analytics
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from pycountry import countries
|
|
2
|
+
from wordlift_client import AccountInfo
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
async def raise_error_if_account_analytics_not_configured(account: AccountInfo) -> bool:
    """Validate that the account is ready for analytics imports.

    Raises ValueError when Google Search Console is not connected, when the
    country code is missing, or when the country code is not a known
    ISO 3166 alpha-2 code. Returns True otherwise.
    """
    if account.google_search_console_site_url is None:
        raise ValueError(
            "%s is not connected to Google Search Console, open https://my.wordlift.io to connect it."
            % account.dataset_uri
        )

    if account.country_code is None:
        raise ValueError(
            "%s country code not configured, open https://my.wordlift.io to configure it."
            % account.dataset_uri
        )

    # Validate the country code against pycountry's ISO 3166 database.
    alpha_2 = account.country_code.upper()
    if countries.get(alpha_2=alpha_2) is None:
        raise ValueError(
            "Country code %s is invalid, open https://my.wordlift.io to reconfigure it."
            % alpha_2
        )

    return True
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from liquid import Environment, CachingFileSystemLoader
|
|
5
|
+
from rdflib import Graph
|
|
6
|
+
|
|
7
|
+
from ...protocol import Context
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TtlLiquidGraphFactory:
    """Renders `*.ttl.liquid` templates under `path` and enqueues the
    resulting RDF graphs on the context's graph queue."""

    # Root folder scanned recursively for `*.ttl.liquid` templates.
    path: Path
    # Shared context providing the account data and the graph queue.
    context: Context

    def __init__(self, context: Context, path: Path):
        self.context = context
        self.path = path

    async def graphs(self) -> None:
        """Render every template, parse it as Turtle and enqueue the graph.

        Each template is processed inside its own try/except so that one
        broken template does not prevent the others from loading.
        """
        template_paths = list(self.path.rglob("*.ttl.liquid"))
        env = Environment(
            loader=CachingFileSystemLoader(self.path),
        )

        for template_path in template_paths:
            try:
                # Render the Liquid template with the account data.
                template = env.get_template(str(template_path.absolute()))
                turtle = template.render(account=self.context.account.__dict__)

                # Create a new RDF graph and parse the Turtle data into it.
                graph = Graph()
                graph.parse(data=turtle, format="turtle")

                # Log the template *path* (the previous code shadowed the loop
                # variable and logged the Template object instead).
                logger.info(
                    f"Successfully loaded {template_path} graph with {len(graph)} triples"
                )

                await self.context.graph_queue.put(graph)
            except Exception as e:
                logger.error(f"Error loading graph from {template_path}: {e}")
|