wordlift-sdk 2.7.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (100)
  1. wordlift_sdk/__init__.py +3 -0
  2. wordlift_sdk/client/__init__.py +3 -0
  3. wordlift_sdk/client/client_configuration_factory.py +26 -0
  4. wordlift_sdk/configuration/__init__.py +4 -0
  5. wordlift_sdk/configuration/configuration_provider.py +44 -0
  6. wordlift_sdk/configuration/get_config_value.py +39 -0
  7. wordlift_sdk/container/__init__.py +3 -0
  8. wordlift_sdk/container/application_container.py +234 -0
  9. wordlift_sdk/deprecated/__init__.py +5 -0
  10. wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  11. wordlift_sdk/entity/__init__.py +4 -0
  12. wordlift_sdk/entity/enrich.py +54 -0
  13. wordlift_sdk/entity/patch.py +14 -0
  14. wordlift_sdk/google_search_console/__init__.py +5 -0
  15. wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  16. wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  17. wordlift_sdk/graph/graph_bag.py +7 -0
  18. wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  19. wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  20. wordlift_sdk/graphql/__init__.py +3 -0
  21. wordlift_sdk/graphql/client/__init__.py +5 -0
  22. wordlift_sdk/graphql/client/client.py +69 -0
  23. wordlift_sdk/graphql/client/factory.py +36 -0
  24. wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  25. wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  26. wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  27. wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  28. wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  29. wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  30. wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  31. wordlift_sdk/graphql/query.py +20 -0
  32. wordlift_sdk/graphql/utils/__init__.py +0 -0
  33. wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  34. wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  35. wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  36. wordlift_sdk/id_generator/__init__.py +3 -0
  37. wordlift_sdk/id_generator/id_generator.py +40 -0
  38. wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  39. wordlift_sdk/internal_link/__init__.py +3 -0
  40. wordlift_sdk/internal_link/utils.py +231 -0
  41. wordlift_sdk/kg/__init__.py +5 -0
  42. wordlift_sdk/kg/entity.py +17 -0
  43. wordlift_sdk/kg/entity_store.py +94 -0
  44. wordlift_sdk/kg/entity_store_factory.py +13 -0
  45. wordlift_sdk/kg/relation/__init__.py +0 -0
  46. wordlift_sdk/kg/relation/relation_service.py +78 -0
  47. wordlift_sdk/main.py +7 -0
  48. wordlift_sdk/namespace/SDO.py +3281 -0
  49. wordlift_sdk/namespace/__init__.py +3 -0
  50. wordlift_sdk/notebook/__init__.py +3 -0
  51. wordlift_sdk/notebook/install_if_missing.py +12 -0
  52. wordlift_sdk/protocol/__init__.py +5 -0
  53. wordlift_sdk/protocol/context.py +21 -0
  54. wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  55. wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  56. wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  57. wordlift_sdk/protocol/graph/__init__.py +3 -0
  58. wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  59. wordlift_sdk/protocol/load_override_class.py +30 -0
  60. wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  61. wordlift_sdk/url_source/__init__.py +6 -0
  62. wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  63. wordlift_sdk/url_source/list_url_source.py +28 -0
  64. wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  65. wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  66. wordlift_sdk/url_source/url_source.py +18 -0
  67. wordlift_sdk/url_source/url_source_input.py +6 -0
  68. wordlift_sdk/utils/__init__.py +17 -0
  69. wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  70. wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  71. wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  72. wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  73. wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  74. wordlift_sdk/utils/delayed.py +12 -0
  75. wordlift_sdk/utils/get_me.py +8 -0
  76. wordlift_sdk/utils/import_url.py +35 -0
  77. wordlift_sdk/wordlift/__init__.py +0 -0
  78. wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  79. wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  80. wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  81. wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  82. wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  83. wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  84. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  85. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  86. wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  87. wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  88. wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  89. wordlift_sdk/workflow/__init__.py +3 -0
  90. wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  91. wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  92. wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  93. wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  94. wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  95. wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  96. wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  97. wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
  98. wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
  99. wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
  100. wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
wordlift_sdk/namespace/__init__.py
@@ -0,0 +1,3 @@
+ from .SDO import SDO
+
+ __all__ = ["SDO"]
wordlift_sdk/notebook/__init__.py
@@ -0,0 +1,3 @@
+ from .install_if_missing import install_if_missing
+
+ __all__ = ['install_if_missing']
wordlift_sdk/notebook/install_if_missing.py
@@ -0,0 +1,12 @@
+ import logging
+ import subprocess
+ import sys
+
+
+ def install_if_missing(package_spec: str, import_name=None):
+     import_name = import_name or package_spec.split()[0]
+     try:
+         __import__(import_name)
+     except ImportError:
+         print(f"{import_name} not found. Installing...")
+         subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
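
For illustration, a minimal sketch of calling install_if_missing from a notebook; the package names below are examples only:

    from wordlift_sdk.notebook import install_if_missing

    # Install advertools only when it cannot be imported yet.
    install_if_missing("advertools")

    # When the pip spec and the import name differ, pass both explicitly
    # (example spec; any pinned requirement works the same way).
    install_if_missing("pandas==2.2.2", import_name="pandas")
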
wordlift_sdk/protocol/__init__.py
@@ -0,0 +1,5 @@
+ from .context import Context
+ from .load_override_class import load_override_class
+ from .web_page_import_protocol import WebPageImportProtocolInterface, DefaultWebPageImportProtocol
+
+ __all__ = ['Context', 'load_override_class', 'WebPageImportProtocolInterface', 'DefaultWebPageImportProtocol']
wordlift_sdk/protocol/context.py
@@ -0,0 +1,21 @@
+ from dataclasses import dataclass
+
+ from wordlift_client import AccountInfo, Configuration
+
+ from .entity_patch import EntityPatchQueue
+ from .graph import GraphQueue
+ from ..configuration import ConfigurationProvider
+ from ..id_generator import IdGenerator
+
+
+ @dataclass
+ class Context:
+     account: AccountInfo
+     client_configuration: Configuration
+     id_generator: IdGenerator
+
+     configuration_provider: ConfigurationProvider
+
+     # Queues where clients can append data to be written to the graph.
+     graph_queue: GraphQueue
+     entity_patch_queue: EntityPatchQueue
wordlift_sdk/protocol/entity_patch/__init__.py
@@ -0,0 +1,4 @@
+ from .entity_patch import EntityPatch
+ from .entity_patch_queue import EntityPatchQueue
+
+ __all__ = ["EntityPatch", "EntityPatchQueue"]
wordlift_sdk/protocol/entity_patch/entity_patch.py
@@ -0,0 +1,8 @@
+ from dataclasses import dataclass
+ from wordlift_client import EntityPatchRequest
+
+
+ @dataclass
+ class EntityPatch:
+     iri: str
+     requests: list[EntityPatchRequest]
wordlift_sdk/protocol/entity_patch/entity_patch_queue.py
@@ -0,0 +1,49 @@
+ import logging
+ import asyncio
+ import aiohttp
+ import pydantic_core
+ import wordlift_client
+ from wordlift_client import Configuration
+
+ from .entity_patch import EntityPatch
+ from tenacity import retry, retry_if_exception_type, wait_fixed, after_log
+
+ logger = logging.getLogger(__name__)
+
+
+ class EntityPatchQueue:
+     client_configuration: Configuration
+
+     def __init__(self, client_configuration: Configuration):
+         self.client_configuration = client_configuration
+
+     @retry(
+         # stop=stop_after_attempt(5),  # Retry up to 5 times
+         retry=retry_if_exception_type(
+             asyncio.TimeoutError
+             | aiohttp.client_exceptions.ServerDisconnectedError
+             | aiohttp.client_exceptions.ClientConnectorError
+             | aiohttp.client_exceptions.ClientPayloadError
+             | aiohttp.client_exceptions.ClientConnectorDNSError
+             | pydantic_core._pydantic_core.ValidationError
+             | wordlift_client.exceptions.ServiceException
+             | wordlift_client.exceptions.BadRequestException
+             | aiohttp.client_exceptions.ClientOSError
+         ),
+         wait=wait_fixed(2),  # Wait 2 seconds between retries
+         after=after_log(logger, logging.WARNING),
+         reraise=True,
+     )
+     async def put(self, entity_patch: EntityPatch) -> None:
+         async with wordlift_client.ApiClient(
+             configuration=self.client_configuration
+         ) as api_client:
+             api_instance = wordlift_client.EntitiesApi(api_client)
+
+             try:
+                 await api_instance.patch_entities(
+                     id=entity_patch.iri, entity_patch_request=entity_patch.requests
+                 )
+             except Exception as e:
+                 logger.error("Error patching entities", exc_info=e)
+                 raise e
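
A hedged usage sketch of EntityPatchQueue; the host, IRI and patch payload below are placeholders, and real authentication setup on Configuration depends on the generated wordlift_client:

    import asyncio

    from wordlift_client import Configuration, EntityPatchRequest
    from wordlift_sdk.protocol.entity_patch import EntityPatch, EntityPatchQueue

    async def main() -> None:
        configuration = Configuration(host="https://api.wordlift.io")  # placeholder; auth omitted
        queue = EntityPatchQueue(client_configuration=configuration)
        patch = EntityPatch(
            iri="https://data.example.org/entity/1",  # placeholder IRI
            requests=[EntityPatchRequest(op="add", path="/http://schema.org/name", value='"Example"')],
        )
        # put() retries automatically on the transient errors listed above.
        await queue.put(patch)

    asyncio.run(main())
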
wordlift_sdk/protocol/graph/__init__.py
@@ -0,0 +1,3 @@
+ from .graph_queue import GraphQueue
+
+ __all__ = ["GraphQueue"]
wordlift_sdk/protocol/graph/graph_queue.py
@@ -0,0 +1,64 @@
+ import hashlib
+ import logging
+ import aiohttp
+ import asyncio
+
+ import pydantic_core
+ import wordlift_client
+ from rdflib import Graph
+ from rdflib.compare import to_isomorphic
+ from wordlift_client import Configuration
+ from tenacity import retry, retry_if_exception_type, wait_fixed, after_log
+
+ logger = logging.getLogger(__name__)
+
+
+ class GraphQueue:
+     client_configuration: Configuration
+     hashes: set[str]
+
+     def __init__(self, client_configuration: Configuration):
+         self.client_configuration = client_configuration
+         self.hashes = set()
+
+     @retry(
+         # stop=stop_after_attempt(5),  # Retry up to 5 times
+         retry=retry_if_exception_type(
+             asyncio.TimeoutError
+             | aiohttp.client_exceptions.ServerDisconnectedError
+             | aiohttp.client_exceptions.ClientConnectorError
+             | aiohttp.client_exceptions.ClientPayloadError
+             | aiohttp.client_exceptions.ClientConnectorDNSError
+             | pydantic_core._pydantic_core.ValidationError
+             | wordlift_client.exceptions.ServiceException
+             | wordlift_client.exceptions.BadRequestException
+             | aiohttp.client_exceptions.ClientOSError
+         ),
+         wait=wait_fixed(2),  # Wait 2 seconds between retries
+         after=after_log(logger, logging.WARNING),
+         reraise=True,
+     )
+     async def put(self, graph: Graph) -> None:
+         hash = GraphQueue.hash_graph(graph)
+         if hash not in self.hashes:
+             self.hashes.add(hash)
+
+             async with wordlift_client.ApiClient(
+                 configuration=self.client_configuration
+             ) as api_client:
+                 api_instance = wordlift_client.EntitiesApi(api_client)
+
+                 try:
+                     await api_instance.create_or_update_entities(
+                         graph.serialize(format="turtle"),
+                         _content_type="text/turtle",
+                     )
+                 except Exception as e:
+                     logger.error(f"Failed to create entities: {e}", exc_info=e)
+                     raise e
+
+     @staticmethod
+     def hash_graph(graph: Graph) -> str:
+         iso = to_isomorphic(graph)
+         canon = iso.serialize(format="nt")
+         return hashlib.sha256(canon.encode("utf-8")).hexdigest()
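
Deduplication in GraphQueue.put relies on canonical hashing: to_isomorphic canonicalizes blank nodes before serializing to N-Triples, so graphs that differ only in blank-node labels or triple order hash the same. A small self-contained check:

    from rdflib import BNode, Graph, Literal, Namespace
    from wordlift_sdk.protocol.graph import GraphQueue

    SDO = Namespace("http://schema.org/")

    # Same triples, different blank-node identities.
    g1, g2 = Graph(), Graph()
    g1.add((BNode(), SDO.name, Literal("Example")))
    g2.add((BNode(), SDO.name, Literal("Example")))

    # Canonicalization makes the hashes equal, so a second put() is a no-op.
    assert GraphQueue.hash_graph(g1) == GraphQueue.hash_graph(g2)
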
wordlift_sdk/protocol/load_override_class.py
@@ -0,0 +1,30 @@
+ import os
+ import sys
+ from importlib.util import spec_from_file_location, module_from_spec
+ from pathlib import Path
+ from typing import Type, TypeVar
+
+ T = TypeVar("T")
+
+
+ def load_override_class(
+     name: str, class_name: str, default_class: Type[T], **kwargs
+ ) -> T:
+     override_dir = os.environ.get("WORDLIFT_OVERRIDE_DIR", "app/overrides")
+     override_path = Path(f"{override_dir}/{name}.py")
+
+     # Ensure the override directory is importable
+     abs_dir = str(Path(override_dir).resolve())
+     if abs_dir not in sys.path:
+         sys.path.insert(0, abs_dir)
+
+     if override_path.exists():
+         spec = spec_from_file_location(name, override_path)
+         mod = module_from_spec(spec)
+         sys.modules[name] = mod
+         spec.loader.exec_module(mod)
+
+         cls = getattr(mod, class_name)
+         return cls(**kwargs)
+
+     return default_class(**kwargs)
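
A sketch of the override mechanism, assuming a hypothetical my_handler.py placed in the override directory; when the file is absent, the default class is constructed instead:

    from wordlift_sdk.protocol import load_override_class

    class DefaultHandler:
        def __init__(self, name: str):
            self.name = name

    # Looks for a hypothetical app/overrides/my_handler.py defining MyHandler;
    # if it is missing, DefaultHandler is instantiated with the same kwargs.
    handler = load_override_class("my_handler", "MyHandler", DefaultHandler, name="demo")
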
wordlift_sdk/protocol/web_page_import_protocol.py
@@ -0,0 +1,23 @@
+ from abc import abstractmethod
+
+ from .context import Context
+
+ from typing import Protocol
+ from wordlift_client import WebPageImportResponse
+
+
+ class WebPageImportProtocolInterface(Protocol):
+     context: Context
+
+     def __init__(self, context: Context):
+         self.context = context
+
+     @abstractmethod
+     async def callback(self, web_page_import_response: WebPageImportResponse) -> None:
+         pass
+
+
+ class DefaultWebPageImportProtocol(WebPageImportProtocolInterface):
+
+     async def callback(self, web_page_import_response: WebPageImportResponse) -> None:
+         pass
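
Concrete protocols subclass the interface and override callback; a minimal illustrative subclass that just logs each response:

    from wordlift_client import WebPageImportResponse
    from wordlift_sdk.protocol import WebPageImportProtocolInterface

    class LoggingWebPageImportProtocol(WebPageImportProtocolInterface):
        # Invoked once per imported web page; the default implementation is a no-op.
        async def callback(self, web_page_import_response: WebPageImportResponse) -> None:
            print("imported:", web_page_import_response)
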
wordlift_sdk/url_source/__init__.py
@@ -0,0 +1,6 @@
+ from .google_sheets_url_source import GoogleSheetsUrlSource
+ from .list_url_source import ListUrlSource
+ from .sitemap_url_source import SitemapUrlSource
+ from .url_source import UrlSource, Url
+
+ __all__ = ["Url", "UrlSource", "GoogleSheetsUrlSource", "ListUrlSource", "SitemapUrlSource"]
wordlift_sdk/url_source/google_sheets_url_source.py
@@ -0,0 +1,53 @@
+ from typing import AsyncGenerator
+
+ import pandas as pd
+ from google.auth.credentials import Credentials
+ from gspread import Client
+
+ from .url_source import UrlSource, Url
+ from ..utils.create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
+
+
+ class GoogleSheetsUrlSource(UrlSource):
+     """
+     A URL provider that extracts URLs from a Google Sheet.
+
+     This class implements the UrlProvider interface to provide URLs from a Google Sheet.
+     It uses the create_dataframe_from_google_sheets function to fetch the sheet data
+     and extracts the URLs from the 'url' column.
+     """
+
+     def __init__(self, creds_or_client: Credentials | Client, url: str, sheet: str):
+         """
+         Initialize the GoogleSheetsUrlProvider with Google Sheets credentials and URL.
+
+         Args:
+             creds_or_client (Credentials | Client): Google Auth Credentials or gspread Client
+             url (str): The URL of the Google Sheet
+             sheet (str): The name of the worksheet
+         """
+         self.creds_or_client = creds_or_client
+         self.url = url
+         self.sheet = sheet
+
+     async def urls(self) -> AsyncGenerator[Url, None]:
+         """
+         Asynchronously yield URLs from the Google Sheet.
+
+         This method fetches the Google Sheet data using the create_dataframe_from_google_sheets function,
+         and yields each URL from the 'url' column as a Url object.
+
+         Returns:
+             AsyncGenerator[Url, None]: An asynchronous generator that yields Url objects.
+         """
+         # Get the dataframe from the Google Sheet
+         df = create_dataframe_from_google_sheets(self.creds_or_client, self.url, self.sheet)
+
+         # Check if 'url' column exists
+         if 'url' not in df.columns:
+             raise ValueError("The Google Sheet must contain a 'url' column")
+
+         # Yield each URL from the 'url' column
+         for url in df['url']:
+             if pd.notna(url) and url.strip():  # Skip empty or NaN values
+                 yield Url(value=url.strip())
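
A usage sketch, assuming a gspread service account and a worksheet with a 'url' column; the sheet URL is a placeholder:

    import asyncio

    import gspread
    from wordlift_sdk.url_source import GoogleSheetsUrlSource

    async def main() -> None:
        gc = gspread.service_account()  # reads the default service-account file
        source = GoogleSheetsUrlSource(
            gc, "https://docs.google.com/spreadsheets/d/<id>", "Sheet1"
        )
        async for url in source.urls():
            print(url.value)

    asyncio.run(main())
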
wordlift_sdk/url_source/list_url_source.py
@@ -0,0 +1,28 @@
+ from typing import AsyncGenerator
+
+ from .url_source import UrlSource, Url
+
+
+ class ListUrlSource(UrlSource):
+     """A URL provider that yields URLs from a predefined list.
+
+     This provider takes a list of URL strings and provides them one by one
+     through the async generator method `urls()`.
+     """
+
+     def __init__(self, urls: list[str]):
+         """Initialize the ListUrlProvider with a list of URLs.
+
+         Args:
+             urls: A list of URL strings to be provided.
+         """
+         self._url_list = urls
+
+     async def urls(self) -> AsyncGenerator[Url, None]:
+         """Asynchronously yield Url objects from the predefined list.
+
+         Yields:
+             Url: A Url object for each URL string in the list.
+         """
+         for url in self._url_list:
+             yield Url(value=url)
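
Consuming any UrlSource is the same async-for loop; with ListUrlSource no external service is needed (the URLs are placeholders):

    import asyncio

    from wordlift_sdk.url_source import ListUrlSource

    async def main() -> None:
        source = ListUrlSource(["https://example.org/a", "https://example.org/b"])
        async for url in source.urls():
            print(url.value)

    asyncio.run(main())
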
wordlift_sdk/url_source/new_or_changed_url_source.py
@@ -0,0 +1,57 @@
+ from dataclasses import asdict
+ from typing import AsyncGenerator
+
+ import pandas as pd
+
+ from . import UrlSource, Url
+ from ..graphql.client import GraphQlClient
+
+
+ class NewOrChangedUrlSource(UrlSource):
+     graphql_client: GraphQlClient
+     url_provider: UrlSource
+     overwrite: bool
+
+     def __init__(
+         self, url_provider: UrlSource, graphql_client: GraphQlClient, overwrite: bool
+     ):
+         self.graphql_client = graphql_client
+         self.url_provider = url_provider
+         self.overwrite = overwrite
+
+     async def urls(self) -> AsyncGenerator[Url, None]:
+         # Get the list of URLs from the underlying provider.
+         url_df = pd.DataFrame([asdict(url) async for url in self.url_provider.urls()])
+         # Get the list of URLs from GraphQL.
+         list_records = await self.graphql_client.run(
+             "entities_url_iri_with_source_equal_to_web_page_import",
+             {"urls": url_df["value"].tolist() if "value" in url_df.columns else []},
+         )
+         graphql_df = pd.DataFrame.from_records(
+             data=[record for record in list_records],
+             columns=("url", "iri", "date_imported"),
+         )
+         graphql_df["date_imported"] = pd.to_datetime(
+             graphql_df["date_imported"], utc=True, errors="coerce"
+         )
+         merged_df = pd.merge(
+             url_df,
+             graphql_df,
+             left_on="value",
+             right_on="url",
+             how="left",
+             suffixes=("", "_graphql"),
+         )
+         filtered_df = merged_df[
+             self.overwrite
+             | merged_df["date_imported"].isna()
+             | (merged_df["date_imported"] < merged_df["date_modified"])
+         ]
+         for _, row in filtered_df.iterrows():
+             yield Url(
+                 value=row["value"],
+                 iri=None if pd.isna(row["iri_graphql"]) else row["iri_graphql"],
+                 date_modified=None
+                 if pd.isna(row["date_modified"])
+                 else row["date_modified"],
+             )
wordlift_sdk/url_source/sitemap_url_source.py
@@ -0,0 +1,36 @@
+ import re
+ from typing import AsyncGenerator, Optional
+
+ import advertools as adv
+ import pandas as pd
+
+ from .url_source import UrlSource, Url
+
+
+ class SitemapUrlSource(UrlSource):
+     sitemap_url: str
+     pattern: re.Pattern | None
+
+     def __init__(self, sitemap_url: str, pattern: Optional[re.Pattern] = None):
+         self.pattern = pattern
+         self.sitemap_url = sitemap_url
+
+     async def urls(self) -> AsyncGenerator[Url, None]:
+         sitemap_df = adv.sitemaps.sitemap_to_df(sitemap_url=self.sitemap_url)
+         # Ensure 'lastmod' column exists
+         if "lastmod" not in sitemap_df.columns:
+             sitemap_df["lastmod"] = None
+         sitemap_df["lastmod_as_datetime"] = pd.to_datetime(
+             sitemap_df["lastmod"], errors="coerce"
+         )
+
+         for _, row in sitemap_df.iterrows():
+             url = row["loc"]
+             last_mod_as_datetime = row["lastmod_as_datetime"]
+             if self.pattern is None or self.pattern.search(url):
+                 yield Url(
+                     value=url,
+                     date_modified=None
+                     if pd.isna(last_mod_as_datetime)
+                     else last_mod_as_datetime.to_pydatetime(),
+                 )
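
A sketch with a placeholder sitemap, using the optional pattern to keep only matching URLs:

    import asyncio
    import re

    from wordlift_sdk.url_source import SitemapUrlSource

    async def main() -> None:
        source = SitemapUrlSource(
            "https://example.org/sitemap.xml",  # placeholder sitemap
            pattern=re.compile(r"/blog/"),      # keep only /blog/ URLs
        )
        async for url in source.urls():
            print(url.value, url.date_modified)

    asyncio.run(main())
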
wordlift_sdk/url_source/url_source.py
@@ -0,0 +1,18 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import AsyncGenerator
+
+
+ @dataclass
+ class Url:
+     value: str
+     iri: str | None = None
+     date_modified: datetime | None = None
+
+
+ class UrlSource(ABC):
+     @abstractmethod
+     async def urls(self) -> AsyncGenerator[Url, None]:
+         """Asynchronously yields Url objects."""
+         pass
wordlift_sdk/url_source/url_source_input.py
@@ -0,0 +1,6 @@
+ from dataclasses import dataclass
+ from typing import Optional, List, Union
+
+ from google.auth.credentials import Credentials
+ from gspread import Client
+
wordlift_sdk/utils/__init__.py
@@ -0,0 +1,17 @@
+ from .create_dataframe_from_google_sheets import create_dataframe_from_google_sheets
+ from .create_dataframe_of_entities_by_types import create_dataframe_of_entities_by_types
+ from .create_dataframe_of_entities_with_embedding_vectors import create_dataframe_of_entities_with_embedding_vectors
+ from .create_dataframe_of_url_iri import create_dataframe_of_url_iri
+ from .create_entity_patch_request import create_entity_patch_request
+ from .delayed import create_delayed
+ from .get_me import get_me
+
+ __all__ = [
+     "create_dataframe_from_google_sheets",
+     "create_dataframe_of_entities_by_types",
+     "create_dataframe_of_entities_with_embedding_vectors",
+     "create_dataframe_of_url_iri",
+     "create_entity_patch_request",
+     "create_delayed",
+     "get_me"
+ ]
wordlift_sdk/utils/create_dataframe_from_google_sheets.py
@@ -0,0 +1,26 @@
+ import gspread
+ import pandas as pd
+ from google.auth.credentials import Credentials
+ from gspread import Client
+
+
+ def create_dataframe_from_google_sheets(creds_or_client: Credentials | Client, url: str, sheet: str) -> pd.DataFrame:
+     if isinstance(creds_or_client, Credentials):
+         return create_dataframe_from_google_sheets_using_credentials(creds_or_client, url, sheet)
+     elif isinstance(creds_or_client, Client):
+         return create_dataframe_from_google_sheets_using_client(creds_or_client, url, sheet)
+     else:
+         raise TypeError("Expected creds_or_client to be of type Credentials or Client")
+
+
+ def create_dataframe_from_google_sheets_using_credentials(creds: Credentials, url: str, sheet: str) -> pd.DataFrame:
+     gc = gspread.authorize(creds)
+
+     return create_dataframe_from_google_sheets_using_client(gc, url, sheet)
+
+
+ def create_dataframe_from_google_sheets_using_client(gc: Client, url: str, sheet: str) -> pd.DataFrame:
+     sheet = gc.open_by_url(url).worksheet(sheet)
+     data = sheet.get_all_records()
+
+     return pd.DataFrame([{k.strip(): v for k, v in row.items()} for row in data])
wordlift_sdk/utils/create_dataframe_of_entities_by_types.py
@@ -0,0 +1,26 @@
+ import pandas as pd
+
+ from wordlift_sdk import graphql
+
+
+ async def create_dataframe_of_entities_by_types(key: str, types: set[str]) -> pd.DataFrame:
+     return await graphql.query(
+         key=key,
+         query_string="""
+             query getEntities($types: [String]!) {
+                 entities(
+                     query: { typeConstraint: { in: $types } }
+                 ) {
+                     iri
+                     keywords: string(name: "schema:keywords")
+                     url: string(name: "schema:url")
+                 }
+             }
+         """,
+         root_element="entities",
+         columns=['iri', 'keywords', 'url'],
+         variable_values={
+             # `set` cannot be serialized in Python, so we convert to `list`
+             "types": list(types)
+         }
+     )
wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py
@@ -0,0 +1,28 @@
+ import pandas as pd
+ from tenacity import retry, stop_after_attempt, wait_fixed
+
+ from wordlift_sdk import graphql
+
+
+ @retry(
+     stop=stop_after_attempt(5),
+     wait=wait_fixed(2)
+ )
+ async def create_dataframe_of_entities_with_embedding_vectors(key: str) -> pd.DataFrame:
+     return await graphql.query(
+         key=key,
+         query_string="""
+             query {
+                 entities(
+                     query: {
+                         embeddingValueConstraint: { exists: { exists: true, excludeEmpty: true } }
+                     }
+                 ) {
+                     iri
+                     url: string(name: "schema:url")
+                 }
+             }
+         """,
+         root_element='entities',
+         columns=['iri', 'url'],
+     )
wordlift_sdk/utils/create_dataframe_of_url_iri.py
@@ -0,0 +1,14 @@
+ from typing import List
+
+ import pandas as pd
+
+ from ..graphql.client.factory import GraphQlClientFactory
+ from wordlift_sdk.kg import EntityStoreFactory
+
+
+ async def create_dataframe_of_url_iri(key: str, url_list: List[str]) -> pd.DataFrame:
+     graphql_client_factory = GraphQlClientFactory(key)
+     graphql_client = graphql_client_factory.create_gql_client()
+     entity_store_factory = EntityStoreFactory(graphql_client)
+     entity_store = entity_store_factory.create()
+     return await entity_store.url_iri_as_dataframe(url_list)
wordlift_sdk/utils/create_entity_patch_request.py
@@ -0,0 +1,14 @@
+ from rdflib import Graph, URIRef, Node
+ import wordlift_client
+ from wordlift_client.models.entity_patch_request import EntityPatchRequest
+
+ def create_entity_patch_request(resource: URIRef, prop: URIRef, value: Node) -> EntityPatchRequest:
+     g = Graph()
+     g.bind('schema', 'http://schema.org/')
+     g.add((resource, prop, value))
+
+     return wordlift_client.EntityPatchRequest(
+         op='add',
+         path='/' + str(prop),
+         value=g.serialize(format='json-ld', auto_compact=True)
+     )
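
A sketch of building a single patch request with rdflib terms; the resource IRI is a placeholder:

    from rdflib import Literal, URIRef
    from wordlift_sdk.utils import create_entity_patch_request

    request = create_entity_patch_request(
        URIRef("https://data.example.org/entity/1"),  # placeholder resource IRI
        URIRef("http://schema.org/name"),
        Literal("Example Entity"),
    )
    # request.path is "/http://schema.org/name"; request.value holds the JSON-LD.
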
wordlift_sdk/utils/delayed.py
@@ -0,0 +1,12 @@
+ import asyncio
+ from multiprocessing import cpu_count
+
+
+ def create_delayed(callback, concurrency=cpu_count() + 1):
+     sem = asyncio.Semaphore(concurrency)
+
+     async def callback_with_semaphore(*args, **kwargs):
+         async with sem:
+             return await callback(*args, **kwargs)
+
+     return callback_with_semaphore
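
create_delayed caps concurrency with a semaphore rather than batching: every coroutine can be scheduled at once, but only `concurrency` of them execute the wrapped callback at any moment. A toy sketch:

    import asyncio

    from wordlift_sdk.utils import create_delayed

    async def fetch(i: int) -> int:
        await asyncio.sleep(0.1)  # stand-in for real I/O
        return i

    async def main() -> None:
        limited_fetch = create_delayed(fetch, concurrency=4)
        # All 100 coroutines are scheduled, but at most 4 run fetch() at once.
        print(sum(await asyncio.gather(*(limited_fetch(i) for i in range(100)))))

    asyncio.run(main())
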
wordlift_sdk/utils/get_me.py
@@ -0,0 +1,8 @@
+ import wordlift_client
+ from wordlift_client import Configuration, AccountInfo, AccountApi
+
+
+ async def get_me(configuration: Configuration) -> AccountInfo:
+     async with wordlift_client.ApiClient(configuration) as api_client:
+         api = AccountApi(api_client)
+         return await api.get_me()
wordlift_sdk/utils/import_url.py
@@ -0,0 +1,35 @@
+ import logging
+ from typing import Callable, Awaitable
+
+ from tenacity import retry, stop_after_attempt, wait_fixed
+ from wordlift_client import Configuration, SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
+
+ logger = logging.getLogger(__name__)
+
+
+ async def import_url_factory(configuration: Configuration, types: set[str]) -> Callable[[set[str]], Awaitable[None]]:
+     @retry(
+         stop=stop_after_attempt(5),
+         wait=wait_fixed(2)
+     )
+     async def import_url(url_list: set[str]) -> None:
+         import wordlift_client
+
+         async with wordlift_client.ApiClient(configuration) as api_client:
+             imports_api = SitemapImportsApi(api_client)
+             request = SitemapImportRequest(
+                 embedding=EmbeddingRequest(
+                     properties=["http://schema.org/headline", "http://schema.org/abstract", "http://schema.org/text"]
+                 ),
+                 output_types=list(types),
+                 urls=list(url_list),
+                 overwrite=True,
+                 id_generator="headline-with-url-hash"
+             )
+
+             try:
+                 await imports_api.create_sitemap_import(sitemap_import_request=request)
+             except Exception as e:
+                 logger.error("Error importing URLs: %s", e)
+
+     return import_url
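
A sketch of the factory: awaiting import_url_factory builds the retrying import_url callable, which is then called per batch of URLs (host, auth handling and URLs are placeholders):

    import asyncio

    from wordlift_client import Configuration
    from wordlift_sdk.utils.import_url import import_url_factory

    async def main() -> None:
        configuration = Configuration(host="https://api.wordlift.io")  # placeholder; auth omitted
        import_url = await import_url_factory(
            configuration, types={"http://schema.org/Article"}
        )
        await import_url({"https://example.org/a", "https://example.org/b"})

    asyncio.run(main())
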
wordlift_sdk/wordlift/__init__.py (file without changes)
wordlift_sdk/wordlift/entity_gaps/__init__.py
@@ -0,0 +1,3 @@
+ from .create_entity_gaps_factory import create_entity_gaps_factory
+
+ __all__ = ["create_entity_gaps_factory"]