wordlift-sdk 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +3 -0
- wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk/graphql/client/client.py +69 -0
- wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk/main.py +7 -0
- wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
- wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
- wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
- wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0

wordlift_sdk/notebook/install_if_missing.py
@@ -0,0 +1,12 @@
+import logging
+import subprocess
+import sys
+
+
+def install_if_missing(package_spec: str, import_name=None):
+    import_name = import_name or package_spec.split()[0]
+    try:
+        __import__(import_name)
+    except ImportError:
+        print(f"{import_name} not found. Installing...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
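
A minimal usage sketch (illustrative, not part of the package diff), assuming wordlift-sdk is installed; the package names are examples only:

import wordlift_sdk.notebook.install_if_missing as m

# Installs "requests" only if it cannot already be imported.
m.install_if_missing("requests")

# When the pip spec and the import name differ, pass the import name explicitly.
m.install_if_missing("beautifulsoup4", import_name="bs4")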

wordlift_sdk/protocol/__init__.py
@@ -0,0 +1,5 @@
+from .context import Context
+from .load_override_class import load_override_class
+from .web_page_import_protocol import WebPageImportProtocolInterface, DefaultWebPageImportProtocol
+
+__all__ = ['Context', 'load_override_class', 'WebPageImportProtocolInterface', 'DefaultWebPageImportProtocol']

wordlift_sdk/protocol/context.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from wordlift_client import AccountInfo, Configuration
+
+from .entity_patch import EntityPatchQueue
+from .graph import GraphQueue
+from ..configuration import ConfigurationProvider
+from ..id_generator import IdGenerator
+
+
+@dataclass
+class Context:
+    account: AccountInfo
+    client_configuration: Configuration
+    id_generator: IdGenerator
+
+    configuration_provider: ConfigurationProvider
+
+    # Queues where clients can append data to be written to the graph.
+    graph_queue: GraphQueue
+    entity_patch_queue: EntityPatchQueue

wordlift_sdk/protocol/entity_patch/entity_patch_queue.py
@@ -0,0 +1,49 @@
+import logging
+import asyncio
+import aiohttp
+import pydantic_core
+import wordlift_client
+from wordlift_client import Configuration
+
+from .entity_patch import EntityPatch
+from tenacity import retry, retry_if_exception_type, wait_fixed, after_log
+
+logger = logging.getLogger(__name__)
+
+
+class EntityPatchQueue:
+    client_configuration: Configuration
+
+    def __init__(self, client_configuration: Configuration):
+        self.client_configuration = client_configuration
+
+    @retry(
+        # stop=stop_after_attempt(5),  # Retry up to 5 times
+        retry=retry_if_exception_type(
+            asyncio.TimeoutError
+            | aiohttp.client_exceptions.ServerDisconnectedError
+            | aiohttp.client_exceptions.ClientConnectorError
+            | aiohttp.client_exceptions.ClientPayloadError
+            | aiohttp.client_exceptions.ClientConnectorDNSError
+            | pydantic_core._pydantic_core.ValidationError
+            | wordlift_client.exceptions.ServiceException
+            | wordlift_client.exceptions.BadRequestException
+            | aiohttp.client_exceptions.ClientOSError
+        ),
+        wait=wait_fixed(2),  # Wait 2 seconds between retries
+        after=after_log(logger, logging.WARNING),
+        reraise=True,
+    )
+    async def put(self, entity_patch: EntityPatch) -> None:
+        async with wordlift_client.ApiClient(
+            configuration=self.client_configuration
+        ) as api_client:
+            api_instance = wordlift_client.EntitiesApi(api_client)
+
+            try:
+                await api_instance.patch_entities(
+                    id=entity_patch.iri, entity_patch_request=entity_patch.requests
+                )
+            except Exception as e:
+                logger.error("Error patching entities", exc_info=e)
+                raise e

wordlift_sdk/protocol/graph/graph_queue.py
@@ -0,0 +1,64 @@
+import hashlib
+import logging
+import aiohttp
+import asyncio
+
+import pydantic_core
+import wordlift_client
+from rdflib import Graph
+from rdflib.compare import to_isomorphic
+from wordlift_client import Configuration
+from tenacity import retry, retry_if_exception_type, wait_fixed, after_log
+
+logger = logging.getLogger(__name__)
+
+
+class GraphQueue:
+    client_configuration: Configuration
+    hashes: set[str]
+
+    def __init__(self, client_configuration: Configuration):
+        self.client_configuration = client_configuration
+        self.hashes = set()
+
+    @retry(
+        # stop=stop_after_attempt(5),  # Retry up to 5 times
+        retry=retry_if_exception_type(
+            asyncio.TimeoutError
+            | aiohttp.client_exceptions.ServerDisconnectedError
+            | aiohttp.client_exceptions.ClientConnectorError
+            | aiohttp.client_exceptions.ClientPayloadError
+            | aiohttp.client_exceptions.ClientConnectorDNSError
+            | pydantic_core._pydantic_core.ValidationError
+            | wordlift_client.exceptions.ServiceException
+            | wordlift_client.exceptions.BadRequestException
+            | aiohttp.client_exceptions.ClientOSError
+        ),
+        wait=wait_fixed(2),  # Wait 2 seconds between retries
+        after=after_log(logger, logging.WARNING),
+        reraise=True,
+    )
+    async def put(self, graph: Graph) -> None:
+        hash = GraphQueue.hash_graph(graph)
+        if hash not in self.hashes:
+            self.hashes.add(hash)
+
+            async with wordlift_client.ApiClient(
+                configuration=self.client_configuration
+            ) as api_client:
+                api_instance = wordlift_client.EntitiesApi(api_client)
+
+                try:
+                    await api_instance.create_or_update_entities(
+                        graph.serialize(format="turtle"),
+                        _content_type="text/turtle",
+                    )
+                except Exception as e:
+                    logger.error(f"Failed to create entities: {e}", exc_info=e)
+                    raise e
+
+    @staticmethod
+    def hash_graph(graph: Graph) -> str:
+        iso = to_isomorphic(graph)
+        canon = iso.serialize(format="nt")
+        return hashlib.sha256(canon.encode("utf-8")).hexdigest()
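
GraphQueue de-duplicates submissions by hashing a canonicalized (isomorphic) serialization of each graph. A small sketch of that behaviour (illustrative, not part of the diff), assuming wordlift-sdk and its dependencies are installed; the triple is an example:

from rdflib import Graph, Literal, URIRef

from wordlift_sdk.protocol.graph.graph_queue import GraphQueue

g1 = Graph()
g1.add((URIRef("http://example.org/a"), URIRef("http://schema.org/name"), Literal("A")))

# A second graph holding the same triple is isomorphic to the first, so it
# hashes to the same value and a later put() of it would be skipped.
g2 = Graph()
g2.parse(data=g1.serialize(format="turtle"), format="turtle")

assert GraphQueue.hash_graph(g1) == GraphQueue.hash_graph(g2)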

wordlift_sdk/protocol/load_override_class.py
@@ -0,0 +1,30 @@
+import os
+import sys
+from importlib.util import spec_from_file_location, module_from_spec
+from pathlib import Path
+from typing import Type, TypeVar
+
+T = TypeVar("T")
+
+
+def load_override_class(
+    name: str, class_name: str, default_class: Type[T], **kwargs
+) -> T:
+    override_dir = os.environ.get("WORDLIFT_OVERRIDE_DIR", "app/overrides")
+    override_path = Path(f"{override_dir}/{name}.py")
+
+    # Ensure the override directory is importable
+    abs_dir = str(Path(override_dir).resolve())
+    if abs_dir not in sys.path:
+        sys.path.insert(0, abs_dir)
+
+    if override_path.exists():
+        spec = spec_from_file_location(name, override_path)
+        mod = module_from_spec(spec)
+        sys.modules[name] = mod
+        spec.loader.exec_module(mod)
+
+        cls = getattr(mod, class_name)
+        return cls(**kwargs)
+
+    return default_class(**kwargs)
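
A usage sketch of the override hook (illustrative, not part of the diff): if $WORDLIFT_OVERRIDE_DIR (default app/overrides) contains greeter.py defining a Greeter class, that class is instantiated instead of the default. The Greeter/DefaultGreeter names below are hypothetical:

from wordlift_sdk.protocol.load_override_class import load_override_class


class DefaultGreeter:
    def __init__(self, name: str):
        self.name = name

    def greet(self) -> str:
        return f"Hello, {self.name}"


# Loads app/overrides/greeter.py and its Greeter class if present,
# otherwise falls back to DefaultGreeter; kwargs go to the constructor.
greeter = load_override_class("greeter", "Greeter", DefaultGreeter, name="WordLift")
print(greeter.greet())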

wordlift_sdk/protocol/web_page_import_protocol.py
@@ -0,0 +1,23 @@
+from abc import abstractmethod
+
+from .context import Context
+
+from typing import Protocol
+from wordlift_client import WebPageImportResponse
+
+
+class WebPageImportProtocolInterface(Protocol):
+    context: Context
+
+    def __init__(self, context: Context):
+        self.context = context
+
+    @abstractmethod
+    async def callback(self, web_page_import_response: WebPageImportResponse) -> None:
+        pass
+
+
+class DefaultWebPageImportProtocol(WebPageImportProtocolInterface):
+
+    async def callback(self, web_page_import_response: WebPageImportResponse) -> None:
+        pass

wordlift_sdk/url_source/__init__.py
@@ -0,0 +1,6 @@
+from .google_sheets_url_source import GoogleSheetsUrlSource
+from .list_url_source import ListUrlSource
+from .sitemap_url_source import SitemapUrlSource
+from .url_source import UrlSource, Url
+
+__all__ = ["Url", "UrlSource", "GoogleSheetsUrlSource", "ListUrlSource", "SitemapUrlSource"]

wordlift_sdk/url_source/google_sheets_url_source.py
@@ -0,0 +1,53 @@
+from typing import AsyncGenerator
+
+import pandas as pd
+from google.auth.credentials import Credentials
+from gspread import Client
+
+from .url_source import UrlSource, Url
+from ..utils.create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
+
+
+class GoogleSheetsUrlSource(UrlSource):
+    """
+    A URL provider that extracts URLs from a Google Sheet.
+
+    This class implements the UrlProvider interface to provide URLs from a Google Sheet.
+    It uses the create_dataframe_from_google_sheets function to fetch the sheet data
+    and extracts the URLs from the 'url' column.
+    """
+
+    def __init__(self, creds_or_client: Credentials | Client, url: str, sheet: str):
+        """
+        Initialize the GoogleSheetsUrlProvider with Google Sheets credentials and URL.
+
+        Args:
+            creds_or_client (Credentials | Client): Google Auth Credentials or gspread Client
+            url (str): The URL of the Google Sheet
+            sheet (str): The name of the worksheet
+        """
+        self.creds_or_client = creds_or_client
+        self.url = url
+        self.sheet = sheet
+
+    async def urls(self) -> AsyncGenerator[Url, None]:
+        """
+        Asynchronously yield URLs from the Google Sheet.
+
+        This method fetches the Google Sheet data using the create_dataframe_from_google_sheets function,
+        and yields each URL from the 'url' column as a Url object.
+
+        Returns:
+            AsyncGenerator[Url, None]: An asynchronous generator that yields Url objects.
+        """
+        # Get the dataframe from the Google Sheet
+        df = create_dataframe_from_google_sheets(self.creds_or_client, self.url, self.sheet)
+
+        # Check if 'url' column exists
+        if 'url' not in df.columns:
+            raise ValueError("The Google Sheet must contain a 'url' column")
+
+        # Yield each URL from the 'url' column
+        for url in df['url']:
+            if pd.notna(url) and url.strip():  # Skip empty or NaN values
+                yield Url(value=url.strip())

wordlift_sdk/url_source/list_url_source.py
@@ -0,0 +1,28 @@
+from typing import AsyncGenerator
+
+from .url_source import UrlSource, Url
+
+
+class ListUrlSource(UrlSource):
+    """A URL provider that yields URLs from a predefined list.
+
+    This provider takes a list of URL strings and provides them one by one
+    through the async generator method `urls()`.
+    """
+
+    def __init__(self, urls: list[str]):
+        """Initialize the ListUrlProvider with a list of URLs.
+
+        Args:
+            urls: A list of URL strings to be provided.
+        """
+        self._url_list = urls
+
+    async def urls(self) -> AsyncGenerator[Url, None]:
+        """Asynchronously yield Url objects from the predefined list.
+
+        Yields:
+            Url: A Url object for each URL string in the list.
+        """
+        for url in self._url_list:
+            yield Url(value=url)

wordlift_sdk/url_source/new_or_changed_url_source.py
@@ -0,0 +1,57 @@
+from dataclasses import asdict
+from typing import AsyncGenerator
+
+import pandas as pd
+
+from . import UrlSource, Url
+from ..graphql.client import GraphQlClient
+
+
+class NewOrChangedUrlSource(UrlSource):
+    graphql_client: GraphQlClient
+    url_provider: UrlSource
+    overwrite: bool
+
+    def __init__(
+        self, url_provider: UrlSource, graphql_client: GraphQlClient, overwrite: bool
+    ):
+        self.graphql_client = graphql_client
+        self.url_provider = url_provider
+        self.overwrite = overwrite
+
+    async def urls(self) -> AsyncGenerator[Url, None]:
+        # Get the list of URLs from the underlying provider.
+        url_df = pd.DataFrame([asdict(url) async for url in self.url_provider.urls()])
+        # Get the list of URLs from GraphQL.
+        list_records = await self.graphql_client.run(
+            "entities_url_iri_with_source_equal_to_web_page_import",
+            {"urls": url_df["value"].tolist() if "value" in url_df.columns else []},
+        )
+        graphql_df = pd.DataFrame.from_records(
+            data=[record for record in list_records],
+            columns=("url", "iri", "date_imported"),
+        )
+        graphql_df["date_imported"] = pd.to_datetime(
+            graphql_df["date_imported"], utc=True, errors="coerce"
+        )
+        merged_df = pd.merge(
+            url_df,
+            graphql_df,
+            left_on="value",
+            right_on="url",
+            how="left",
+            suffixes=("", "_graphql"),
+        )
+        filtered_df = merged_df[
+            self.overwrite
+            | merged_df["date_imported"].isna()
+            | (merged_df["date_imported"] < merged_df["date_modified"])
+        ]
+        for _, row in filtered_df.iterrows():
+            yield Url(
+                value=row["value"],
+                iri=None if pd.isna(row["iri_graphql"]) else row["iri_graphql"],
+                date_modified=None
+                if pd.isna(row["date_modified"])
+                else row["date_modified"],
+            )

wordlift_sdk/url_source/sitemap_url_source.py
@@ -0,0 +1,36 @@
+import re
+from typing import AsyncGenerator, Optional
+
+import advertools as adv
+import pandas as pd
+
+from .url_source import UrlSource, Url
+
+
+class SitemapUrlSource(UrlSource):
+    sitemap_url: str
+    pattern: re.Pattern | None
+
+    def __init__(self, sitemap_url: str, pattern: Optional[re.Pattern] = None):
+        self.pattern = pattern
+        self.sitemap_url = sitemap_url
+
+    async def urls(self) -> AsyncGenerator[Url, None]:
+        sitemap_df = adv.sitemaps.sitemap_to_df(sitemap_url=self.sitemap_url)
+        # Ensure 'lastmod' column exists
+        if "lastmod" not in sitemap_df.columns:
+            sitemap_df["lastmod"] = None
+        sitemap_df["lastmod_as_datetime"] = pd.to_datetime(
+            sitemap_df["lastmod"], errors="coerce"
+        )
+
+        for _, row in sitemap_df.iterrows():
+            url = row["loc"]
+            last_mod_as_datetime = row["lastmod_as_datetime"]
+            if self.pattern is None or self.pattern.search(url):
+                yield Url(
+                    value=url,
+                    date_modified=None
+                    if pd.isna(last_mod_as_datetime)
+                    else last_mod_as_datetime.to_pydatetime(),
+                )

wordlift_sdk/url_source/url_source.py
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from datetime import datetime
+from typing import AsyncGenerator
+
+
+@dataclass
+class Url:
+    value: str
+    iri: str | None = None
+    date_modified: datetime | None = None
+
+
+class UrlSource(ABC):
+    @abstractmethod
+    async def urls(self) -> AsyncGenerator[Url, None]:
+        """Asynchronously yields Url objects."""
+        pass
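
All sources implement the same async-generator contract shown above. A minimal sketch draining the simplest one, ListUrlSource (illustrative, not part of the diff; the URLs are placeholders); SitemapUrlSource and GoogleSheetsUrlSource are consumed the same way:

import asyncio

from wordlift_sdk.url_source import ListUrlSource


async def main() -> None:
    source = ListUrlSource(["https://example.org/a", "https://example.org/b"])
    async for url in source.urls():
        print(url.value, url.iri, url.date_modified)


asyncio.run(main())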

wordlift_sdk/utils/__init__.py
@@ -0,0 +1,17 @@
+from .create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
+from .create_dataframe_of_entities_by_types import create_dataframe_of_entities_by_types
+from .create_dataframe_of_entities_with_embedding_vectors import create_dataframe_of_entities_with_embedding_vectors
+from .create_dataframe_of_url_iri import create_dataframe_of_url_iri
+from .create_entity_patch_request import create_entity_patch_request
+from .delayed import create_delayed
+from .get_me import get_me
+
+__all__ = [
+    "create_dataframe_from_google_sheets",
+    "create_dataframe_of_entities_by_types",
+    "create_dataframe_of_entities_with_embedding_vectors",
+    "create_dataframe_of_url_iri",
+    "create_entity_patch_request",
+    "create_delayed",
+    "get_me"
+]

wordlift_sdk/utils/create_dataframe_from_google_sheets.py
@@ -0,0 +1,26 @@
+import gspread
+import pandas as pd
+from google.auth.credentials import Credentials
+from gspread import Client
+
+
+def create_dataframe_from_google_sheets(creds_or_client: Credentials | Client, url: str, sheet: str) -> pd.DataFrame:
+    if isinstance(creds_or_client, Credentials):
+        return create_dataframe_from_google_sheets_using_credentials(creds_or_client, url, sheet)
+    elif isinstance(creds_or_client, Client):
+        return create_dataframe_from_google_sheets_using_client(creds_or_client, url, sheet)
+    else:
+        raise TypeError("Expected creds_or_client to be of type Credentials or Client")
+
+
+def create_dataframe_from_google_sheets_using_credentials(creds: Credentials, url: str, sheet: str) -> pd.DataFrame:
+    gc = gspread.authorize(creds)
+
+    return create_dataframe_from_google_sheets_using_client(gc, url, sheet)
+
+
+def create_dataframe_from_google_sheets_using_client(gc: Client, url: str, sheet: str) -> pd.DataFrame:
+    sheet = gc.open_by_url(url).worksheet(sheet)
+    data = sheet.get_all_records()
+
+    return pd.DataFrame([{k.strip(): v for k, v in row.items()} for row in data])
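
A usage sketch (illustrative, not part of the diff), assuming a gspread service-account credential is available; the key file path, spreadsheet URL, and worksheet name are placeholders:

import gspread

from wordlift_sdk.utils import create_dataframe_from_google_sheets

# A gspread Client can be passed directly; google.auth Credentials are also
# accepted and wrapped with gspread.authorize() by the helper.
client = gspread.service_account(filename="service_account.json")
df = create_dataframe_from_google_sheets(
    client,
    "https://docs.google.com/spreadsheets/d/<sheet-id>",
    "Sheet1",
)
print(df.head())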

wordlift_sdk/utils/create_dataframe_of_entities_by_types.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+from wordlift_sdk import graphql
+
+
+async def create_dataframe_of_entities_by_types(key: str, types: set[str]) -> pd.DataFrame:
+    return await graphql.query(
+        key=key,
+        query_string="""
+            query getEntities($types: [String]!) {
+              entities(
+                query: { typeConstraint: { in: $types } }
+              ) {
+                iri
+                keywords: string(name: "schema:keywords")
+                url: string(name: "schema:url")
+              }
+            }
+        """,
+        root_element="entities",
+        columns=['iri', 'keywords', 'url'],
+        variable_values={
+            # `set` cannot be serialized in Python, so we convert to `list`
+            "types": list(types)
+        }
+    )

wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py
@@ -0,0 +1,28 @@
+import pandas as pd
+from tenacity import retry, stop_after_attempt, wait_fixed
+
+from wordlift_sdk import graphql
+
+
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_fixed(2)
+)
+async def create_dataframe_of_entities_with_embedding_vectors(key: str) -> pd.DataFrame:
+    return await graphql.query(
+        key=key,
+        query_string="""
+            query {
+              entities(
+                query: {
+                  embeddingValueConstraint: { exists: { exists: true, excludeEmpty: true } }
+                }
+              ) {
+                iri
+                url: string(name: "schema:url")
+              }
+            }
+        """,
+        root_element='entities',
+        columns=['iri', 'url'],
+    )

wordlift_sdk/utils/create_dataframe_of_url_iri.py
@@ -0,0 +1,14 @@
+from typing import List
+
+import pandas as pd
+
+from ..graphql.client.factory import GraphQlClientFactory
+from wordlift_sdk.kg import EntityStoreFactory
+
+
+async def create_dataframe_of_url_iri(key: str, url_list: List[str]) -> pd.DataFrame:
+    graphql_client_factory = GraphQlClientFactory(key)
+    graphql_client = graphql_client_factory.create_gql_client()
+    entity_store_factory = EntityStoreFactory(graphql_client)
+    entity_store = entity_store_factory.create()
+    return await entity_store.url_iri_as_dataframe(url_list)

wordlift_sdk/utils/create_entity_patch_request.py
@@ -0,0 +1,14 @@
+from rdflib import Graph, URIRef, Node
+import wordlift_client
+from wordlift_client.models.entity_patch_request import EntityPatchRequest
+
+def create_entity_patch_request(resource: URIRef, prop: URIRef, value: Node) -> EntityPatchRequest:
+    g = Graph()
+    g.bind('schema', 'http://schema.org/')
+    g.add((resource, prop, value))
+
+    return wordlift_client.EntityPatchRequest(
+        op='add',
+        path='/' + str(prop),
+        value=g.serialize(format='json-ld', auto_compact=True)
+    )
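
A sketch building a single 'add' patch for schema:headline (illustrative, not part of the diff; the entity IRI and value are placeholders):

from rdflib import Literal, URIRef

from wordlift_sdk.utils import create_entity_patch_request

# The returned request carries op='add', a path derived from the property IRI,
# and the value serialized as JSON-LD; it can then be sent via the Entities API.
request = create_entity_patch_request(
    resource=URIRef("https://data.example.org/entity/1"),
    prop=URIRef("http://schema.org/headline"),
    value=Literal("An example headline"),
)
print(request.path)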

wordlift_sdk/utils/delayed.py
@@ -0,0 +1,12 @@
+import asyncio
+from multiprocessing import cpu_count
+
+
+def create_delayed(callback, concurrency=cpu_count() + 1):
+    sem = asyncio.Semaphore(concurrency)
+
+    async def callback_with_semaphore(*args, **kwargs):
+        async with sem:
+            return await callback(*args, **kwargs)
+
+    return callback_with_semaphore
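
create_delayed wraps an async callback with a semaphore so that at most `concurrency` calls run at once. A self-contained sketch (illustrative, not part of the diff):

import asyncio

from wordlift_sdk.utils import create_delayed


async def fetch(i: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for real I/O
    return i


async def main() -> None:
    # At most 4 fetches run concurrently; the remaining calls wait on the semaphore.
    limited_fetch = create_delayed(fetch, concurrency=4)
    results = await asyncio.gather(*(limited_fetch(i) for i in range(10)))
    print(results)


asyncio.run(main())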

wordlift_sdk/utils/get_me.py
@@ -0,0 +1,8 @@
+import wordlift_client
+from wordlift_client import Configuration, AccountInfo, AccountApi
+
+
+async def get_me(configuration: Configuration) -> AccountInfo:
+    async with wordlift_client.ApiClient(configuration) as api_client:
+        api = AccountApi(api_client)
+        return await api.get_me()

wordlift_sdk/utils/import_url.py
@@ -0,0 +1,35 @@
+import logging
+from typing import Callable, Awaitable
+
+from tenacity import retry, stop_after_attempt, wait_fixed
+from wordlift_client import Configuration, SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
+
+logger = logging.getLogger(__name__)
+
+
+async def import_url_factory(configuration: Configuration, types: set[str]) -> Callable[[set[str]], Awaitable[None]]:
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_fixed(2)
+    )
+    async def import_url(url_list: set[str]) -> None:
+        import wordlift_client
+
+        async with wordlift_client.ApiClient(configuration) as api_client:
+            imports_api = SitemapImportsApi(api_client)
+            request = SitemapImportRequest(
+                embedding=EmbeddingRequest(
+                    properties=["http://schema.org/headline", "http://schema.org/abstract", "http://schema.org/text"]
+                ),
+                output_types=list(types),
+                urls=list(url_list),
+                overwrite=True,
+                id_generator="headline-with-url-hash"
+            )
+
+            try:
+                await imports_api.create_sitemap_import(sitemap_import_request=request)
+            except Exception as e:
+                logger.error("Error importing URLs: %s", e)
+
+    return import_url
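
A sketch of the factory (illustrative, not part of the diff), assuming a wordlift_client Configuration has been populated elsewhere; how the host and API key are set depends on the generated client and is not shown in this diff, and the type and URLs below are placeholders:

import asyncio

from wordlift_client import Configuration

from wordlift_sdk.utils.import_url import import_url_factory


async def main() -> None:
    configuration = Configuration()  # host/credentials omitted; configure as required by wordlift_client
    # The factory is async and returns an async callable that triggers the import.
    import_url = await import_url_factory(configuration, types={"http://schema.org/Article"})
    await import_url({"https://example.org/post-1", "https://example.org/post-2"})


asyncio.run(main())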
File without changes