wordlift_sdk-2.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. wordlift_sdk/__init__.py +3 -0
  2. wordlift_sdk/client/__init__.py +3 -0
  3. wordlift_sdk/client/client_configuration_factory.py +26 -0
  4. wordlift_sdk/configuration/__init__.py +4 -0
  5. wordlift_sdk/configuration/configuration_provider.py +44 -0
  6. wordlift_sdk/configuration/get_config_value.py +39 -0
  7. wordlift_sdk/container/__init__.py +3 -0
  8. wordlift_sdk/container/application_container.py +234 -0
  9. wordlift_sdk/deprecated/__init__.py +5 -0
  10. wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  11. wordlift_sdk/entity/__init__.py +4 -0
  12. wordlift_sdk/entity/enrich.py +54 -0
  13. wordlift_sdk/entity/patch.py +14 -0
  14. wordlift_sdk/google_search_console/__init__.py +5 -0
  15. wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  16. wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  17. wordlift_sdk/graph/graph_bag.py +7 -0
  18. wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  19. wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  20. wordlift_sdk/graphql/__init__.py +3 -0
  21. wordlift_sdk/graphql/client/__init__.py +5 -0
  22. wordlift_sdk/graphql/client/client.py +69 -0
  23. wordlift_sdk/graphql/client/factory.py +36 -0
  24. wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  25. wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  26. wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  27. wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  28. wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  29. wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  30. wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  31. wordlift_sdk/graphql/query.py +20 -0
  32. wordlift_sdk/graphql/utils/__init__.py +0 -0
  33. wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  34. wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  35. wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  36. wordlift_sdk/id_generator/__init__.py +3 -0
  37. wordlift_sdk/id_generator/id_generator.py +40 -0
  38. wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  39. wordlift_sdk/internal_link/__init__.py +3 -0
  40. wordlift_sdk/internal_link/utils.py +231 -0
  41. wordlift_sdk/kg/__init__.py +5 -0
  42. wordlift_sdk/kg/entity.py +17 -0
  43. wordlift_sdk/kg/entity_store.py +94 -0
  44. wordlift_sdk/kg/entity_store_factory.py +13 -0
  45. wordlift_sdk/kg/relation/__init__.py +0 -0
  46. wordlift_sdk/kg/relation/relation_service.py +78 -0
  47. wordlift_sdk/main.py +7 -0
  48. wordlift_sdk/namespace/SDO.py +3281 -0
  49. wordlift_sdk/namespace/__init__.py +3 -0
  50. wordlift_sdk/notebook/__init__.py +3 -0
  51. wordlift_sdk/notebook/install_if_missing.py +12 -0
  52. wordlift_sdk/protocol/__init__.py +5 -0
  53. wordlift_sdk/protocol/context.py +21 -0
  54. wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  55. wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  56. wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  57. wordlift_sdk/protocol/graph/__init__.py +3 -0
  58. wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  59. wordlift_sdk/protocol/load_override_class.py +30 -0
  60. wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  61. wordlift_sdk/url_source/__init__.py +6 -0
  62. wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  63. wordlift_sdk/url_source/list_url_source.py +28 -0
  64. wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  65. wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  66. wordlift_sdk/url_source/url_source.py +18 -0
  67. wordlift_sdk/url_source/url_source_input.py +6 -0
  68. wordlift_sdk/utils/__init__.py +17 -0
  69. wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  70. wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  71. wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  72. wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  73. wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  74. wordlift_sdk/utils/delayed.py +12 -0
  75. wordlift_sdk/utils/get_me.py +8 -0
  76. wordlift_sdk/utils/import_url.py +35 -0
  77. wordlift_sdk/wordlift/__init__.py +0 -0
  78. wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  79. wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  80. wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  81. wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  82. wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  83. wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  84. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  85. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  86. wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  87. wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  88. wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  89. wordlift_sdk/workflow/__init__.py +3 -0
  90. wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  91. wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  92. wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  93. wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  94. wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  95. wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  96. wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  97. wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
  98. wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
  99. wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
  100. wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
wordlift_sdk/graphql/client/client.py
@@ -0,0 +1,69 @@
+import os.path
+from typing import Any, Optional, Dict, List
+
+from gql import gql
+from graphql import parse, OperationDefinitionNode, FieldNode, DocumentNode
+
+from .gql_client_provider import GqlClientProvider
+
+
+class GraphQlQuery:
+    query: DocumentNode
+    fields: List[str]
+
+    def __init__(self, query: str):
+        self.query = gql(query)
+        self.fields = self.extract_field_names(query)
+
+    def get_query(self) -> DocumentNode:
+        return self.query
+
+    def get_fields(self) -> List[str]:
+        return self.fields
+
+    def extract_field_names(self, query_str):
+        parsed = parse(query_str)
+        for definition in parsed.definitions:
+            if isinstance(definition, OperationDefinitionNode):
+                for selection in definition.selection_set.selections:
+                    if selection.name.value == "entities":
+                        return [
+                            field.alias.value if field.alias else field.name.value
+                            for field in selection.selection_set.selections
+                            if isinstance(field, FieldNode)
+                        ]
+        return []
+
+
+file_contents = {}
+
+filenames = [
+    "entities_top_query.graphql",
+    "entities_url_id.graphql",
+    "entities_url_iri.graphql",
+]
+base_dir = os.path.dirname(os.path.abspath(__file__))
+
+for filename in filenames:
+    filepath = os.path.join(base_dir, "../data", filename)
+    with open(filepath, "r", encoding="utf-8") as f:
+        query = f.read()
+        file_contents[filename] = GraphQlQuery(query)
+
+
+class GraphQlClient:
+    _client_provider: GqlClientProvider
+
+    def __init__(self, client_provider: GqlClientProvider):
+        self._client_provider = client_provider
+
+    async def run(
+        self, graphql: str, variables: Optional[Dict[str, Any]] = None
+    ) -> list[dict[str, Any]]:
+        query = file_contents[graphql + ".graphql"]
+
+        # Asynchronous function to execute the query
+        async with self._client_provider.create() as session:
+            response = await session.execute(query.query, variable_values=variables)
+
+        return response["entities"]
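
A quick sketch of what GraphQlQuery does with a query string: gql() pre-parses it once, and extract_field_names collects the aliases (or bare field names) selected under the top-level entities field. The query below is the bundled entities_url_id query; the expected output is an assumption based on the code above.

    q = GraphQlQuery("""
        query entities_url_id($urls: [String]!) {
            entities(query: { urlConstraint: { in: $urls } }) {
                url: string(name: "schema:url")
                id: iri
            }
        }
    """)
    print(q.get_fields())  # expected: ['url', 'id']
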
wordlift_sdk/graphql/client/factory.py
@@ -0,0 +1,36 @@
+from gql import Client
+from gql.transport.aiohttp import AIOHTTPTransport
+
+from .client import GraphQlClient
+from .gql_client_provider import GqlClientProvider
+
+
+class GraphQlClientFactory:
+    _api_url: str
+    _key: str
+
+    def __init__(self, key: str, api_url: str = "https://api.wordlift.io/graphql"):
+        self._api_url = api_url
+        self._key = key
+
+    def create(self) -> GraphQlClient:
+        return GraphQlClient(self.create_provider())
+
+    def create_transport(self) -> AIOHTTPTransport:
+        # Select your transport with a defined url endpoint
+        return AIOHTTPTransport(
+            url=self._api_url,
+            ssl=True,
+            headers={"Authorization": f"Key {self._key}", "X-include-Private": "true"},
+        )
+
+    def create_gql_client(self) -> Client:
+        # Create a GraphQL client using the defined transport
+        return Client(
+            transport=self.create_transport(),
+            fetch_schema_from_transport=False,
+            execute_timeout=120,
+        )
+
+    def create_provider(self) -> GqlClientProvider:
+        return GqlClientProvider(key=self._key, api_url=self._api_url)
wordlift_sdk/graphql/client/gql_client_provider.py
@@ -0,0 +1,26 @@
+from gql import Client
+from gql.transport.aiohttp import AIOHTTPTransport
+
+
+class GqlClientProvider:
+    _api_url: str
+    _key: str
+
+    def __init__(self, key: str, api_url: str = "https://api.wordlift.io/graphql"):
+        self._api_url = api_url
+        self._key = key
+
+    def _create_transport(self) -> AIOHTTPTransport:
+        # Select your transport with a defined url endpoint
+        return AIOHTTPTransport(
+            url=self._api_url,
+            ssl=True,
+            headers={"Authorization": f"Key {self._key}", "X-include-Private": "true"},
+        )
+
+    def create(self) -> Client:
+        return Client(
+            transport=self._create_transport(),
+            fetch_schema_from_transport=False,
+            execute_timeout=120,
+        )
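
Taken together, GraphQlClientFactory, GqlClientProvider, and GraphQlClient can be wired up roughly as follows. This is a usage sketch, not part of the diff: the key and URL are placeholders, and the import path assumes the package exports the factory from wordlift_sdk.graphql.client. Note that GraphQlClient.run resolves its first argument against the bundled query files (entities_top_query, entities_url_id, entities_url_iri).

    import asyncio

    from wordlift_sdk.graphql.client import GraphQlClientFactory  # assumed export path

    async def main():
        client = GraphQlClientFactory(key="YOUR_WORDLIFT_KEY").create()
        # "entities_url_id" maps to data/entities_url_id.graphql; variables fill $urls.
        entities = await client.run(
            "entities_url_id",
            {"urls": ["https://example.org/blog/post-1"]},
        )
        for e in entities:
            print(e["id"], e["url"])

    asyncio.run(main())
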
wordlift_sdk/graphql/data/entities_by_type.graphql
@@ -0,0 +1,9 @@
+query entities_by_type($types: [String]!) {
+  entities(
+    query: { typeConstraint: { in: $types } }
+  ) {
+    iri
+    keywords: string(name: "schema:keywords")
+    url: string(name: "schema:url")
+  }
+}
wordlift_sdk/graphql/data/entities_embedding_value.graphql
@@ -0,0 +1,15 @@
+query entities_embedding_value($url: String!) {
+  entities(
+    page: 0
+    rows: 1
+    query: {
+      urlConstraint: { in: [$url] }
+      embeddingValueConstraint: {
+        exists: { exists: true, excludeEmpty: false }
+      }
+    }
+  ) {
+    iri
+    embedding_value: string(name: "seovoc:embeddingValue")
+  }
+}
wordlift_sdk/graphql/data/entities_top_query.graphql
@@ -0,0 +1,20 @@
+query entities_top_query($urls: [String]!) {
+  entities(query: { urlConstraint: { in: $urls } }) {
+    iri
+    url: string(name: "schema:url")
+    name: string(name: "schema:name")
+    headline: string(name: "schema:headline")
+    title: string(name: "schema:title")
+    top_query: topN(
+      name: "seovoc:hasQuery"
+      sort: { field: "seovoc:impressions3Months", direction: DESC }
+      limit: 1
+    ) {
+      iri
+      name: string(name: "seovoc:name")
+      impressions: int(name: "seovoc:impressions3Months")
+      clicks: int(name: "seovoc:clicks3Months")
+      date_created: date(name: "seovoc:dateCreated")
+    }
+  }
+}
wordlift_sdk/graphql/data/entities_url_id.graphql
@@ -0,0 +1,6 @@
+query entities_url_id($urls: [String]!) {
+  entities(query: { urlConstraint: { in: $urls } } ) {
+    url: string(name:"schema:url")
+    id: iri
+  }
+}
wordlift_sdk/graphql/data/entities_url_iri.graphql
@@ -0,0 +1,7 @@
+query entities_url_id($urls: [String]!) {
+  entities(query: { urlConstraint: { in: $urls } } ) {
+    iri
+    url: string(name: "schema:url")
+    date_imported: dateTime(name: "seovoc:dateImported")
+  }
+}
wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql
@@ -0,0 +1,12 @@
+query entities_url_iri_with_source_equal_to_web_page_import($urls: [String]!) {
+  entities(
+    query: {
+      urlConstraint: { in: $urls }
+      typeConstraint: { in: [ "web-page-import" ] }
+    }
+  ) {
+    iri
+    url: string(name: "schema:url")
+    date_imported: dateTime(name: "seovoc:dateImported")
+  }
+}
wordlift_sdk/graphql/query.py
@@ -0,0 +1,20 @@
+from typing import Dict, Optional, Any
+from ..graphql.client import GraphQlClientFactory
+
+import pandas as pd
+
+
+async def query(key: str, query_string: str, root_element: str, columns: list[str],
+                variable_values: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+    from gql import gql
+
+    # Create a GraphQL client using the defined transport
+    client = GraphQlClientFactory(key=key).create_gql_client()
+
+    # Define the GraphQL query
+    gql_query = gql(query_string)
+
+    # Asynchronous function to execute the query
+    async with client as session:
+        response = await session.execute(gql_query, variable_values=variable_values)
+        return pd.DataFrame(response[root_element], columns=columns)
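
query() above is a thin convenience wrapper: it executes an arbitrary GraphQL string and shapes the response at root_element into a pandas DataFrame. A hedged example, reusing the bundled entities_by_type query; the key and the type value are placeholders.

    import asyncio

    from wordlift_sdk.graphql.query import query

    ENTITIES_BY_TYPE = """
    query entities_by_type($types: [String]!) {
      entities(query: { typeConstraint: { in: $types } }) {
        iri
        keywords: string(name: "schema:keywords")
        url: string(name: "schema:url")
      }
    }
    """

    async def main():
        df = await query(
            key="YOUR_WORDLIFT_KEY",
            query_string=ENTITIES_BY_TYPE,
            root_element="entities",
            columns=["iri", "keywords", "url"],
            variable_values={"types": ["http://schema.org/Article"]},  # placeholder type value
        )
        print(df.head())

    asyncio.run(main())
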
wordlift_sdk/graphql/utils/__init__.py
File without changes
wordlift_sdk/graphql/utils/query/__init__.py
@@ -0,0 +1,4 @@
+from .entity_top_query import EntityTopQuery
+from .entity_with_top_query import entity_with_top_query_factory
+
+__all__ = ['EntityTopQuery', 'entity_with_top_query_factory']
wordlift_sdk/graphql/utils/query/entity_top_query.py
@@ -0,0 +1,56 @@
+from dataclasses import dataclass, field, asdict
+from typing import Optional
+
+import pandas as pd
+
+
+@dataclass
+class EntityTopQuery:
+    iri: str
+    url: str
+    name: str
+    headline: str
+    title: str
+    top_query_iri: Optional[str] = field(default=None)
+    top_query_name: Optional[str] = field(default=None)
+    top_query_impressions: Optional[int] = field(default=None)
+    top_query_clicks: Optional[int] = field(default=None)
+    top_query_date_created: Optional[str] = field(default=None)
+
+    @staticmethod
+    def from_graphql_response(entity_data: dict) -> "EntityTopQuery":
+        # Initialize top_query fields with default values
+        top_query_iri = top_query_name = top_query_impressions = top_query_clicks = top_query_date_created = None
+
+        # Check if there are any top queries
+        if entity_data.get('top_query'):
+            top_query_data = entity_data['top_query'][0]
+            top_query_iri = top_query_data.get('iri')
+            top_query_name = top_query_data.get('name')
+            top_query_impressions = top_query_data.get('impressions')
+            top_query_clicks = top_query_data.get('clicks')
+            top_query_date_created = top_query_data.get('date_created')
+
+        # Create an Entity instance
+        return EntityTopQuery(
+            iri=entity_data['iri'],
+            url=entity_data['url'],
+            name=entity_data['name'],
+            headline=entity_data['headline'],
+            title=entity_data['title'],
+            top_query_iri=top_query_iri,
+            top_query_name=top_query_name,
+            top_query_impressions=top_query_impressions,
+            top_query_clicks=top_query_clicks,
+            top_query_date_created=top_query_date_created
+        )
+
+    def to_dataframe(self) -> pd.DataFrame:
+        entities_with_top_query_df = pd.DataFrame([asdict(self)])
+        entities_with_top_query_df['calc_name'] = entities_with_top_query_df[
+            ['name', 'headline', 'title', 'url']].bfill(
+            axis=1).iloc[:, 0]
+        entities_with_top_query_df['top_query_date_created'] = pd.to_datetime(
+            entities_with_top_query_df['top_query_date_created'], errors='coerce')
+
+        return entities_with_top_query_df
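
A short sketch of the EntityTopQuery helpers above, fed with a hand-written dict shaped like the entities_top_query GraphQL response; every value is made up.

    etq = EntityTopQuery.from_graphql_response({
        "iri": "https://data.example.org/wl123/post-1",
        "url": "https://example.org/blog/post-1",
        "name": None,
        "headline": "Post 1",
        "title": None,
        "top_query": [{
            "iri": "https://data.example.org/wl123/query-1",
            "name": "example query",
            "impressions": 1200,
            "clicks": 34,
            "date_created": "2024-01-01",
        }],
    })

    df = etq.to_dataframe()
    # calc_name back-fills across name/headline/title/url, so here it resolves to "Post 1".
    print(df[["calc_name", "top_query_name", "top_query_impressions"]])
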
wordlift_sdk/graphql/utils/query/entity_with_top_query.py
@@ -0,0 +1,52 @@
+from typing import Optional, Awaitable, Callable
+
+from tenacity import stop_after_attempt, retry, wait_fixed
+
+from .entity_top_query import EntityTopQuery
+from ...client import GraphQlClientFactory
+
+
+async def entity_with_top_query_factory(
+    key: str,
+) -> Callable[[str], Awaitable[Optional[EntityTopQuery]]]:
+    @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
+    async def entity_with_top_query(url: str) -> Optional[EntityTopQuery]:
+        from gql import gql
+
+        # Create a GraphQL client using the defined transport
+        client = GraphQlClientFactory(key=key).create_gql_client()
+
+        # Define the GraphQL query
+        gql_query = gql("""
+            query($url: String!) {
+                entities(query: { urlConstraint: { in: [$url] } }) {
+                    iri
+                    url: string(name: "schema:url")
+                    name: string(name: "schema:name")
+                    headline: string(name: "schema:headline")
+                    title: string(name: "schema:title")
+                    top_query: topN(
+                        name: "seovoc:hasQuery"
+                        sort: { field: "seovoc:impressions3Months", direction: DESC }
+                        limit: 1
+                    ) {
+                        iri
+                        name: string(name: "seovoc:name")
+                        impressions: int(name: "seovoc:impressions3Months")
+                        clicks: int(name: "seovoc:clicks3Months")
+                        date_created: date(name: "seovoc:dateCreated")
+                    }
+                }
+            }
+        """)
+
+        # Asynchronous function to execute the query
+        async with client as session:
+            response = await session.execute(gql_query, variable_values={"url": url})
+
+        if len(response["entities"]) == 0:
+            return None
+
+        return EntityTopQuery.from_graphql_response(response["entities"][0])
+
+    return entity_with_top_query
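
Note that entity_with_top_query_factory is itself a coroutine: it has to be awaited once to obtain the per-URL callable, which is then awaited per lookup (retried up to five times, two seconds apart). A hedged sketch with a placeholder key and URL:

    import asyncio

    async def main():
        lookup = await entity_with_top_query_factory(key="YOUR_WORDLIFT_KEY")
        result = await lookup("https://example.org/blog/post-1")
        if result is None:
            print("No entity found for that URL")
        else:
            print(result.iri, result.top_query_name)

    asyncio.run(main())
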
wordlift_sdk/id_generator/__init__.py
@@ -0,0 +1,3 @@
+from .id_generator import IdGenerator
+
+__all__ = ["IdGenerator"]
wordlift_sdk/id_generator/id_generator.py
@@ -0,0 +1,40 @@
+import re
+import unicodedata
+from urllib.parse import urljoin
+
+from wordlift_sdk.id_generator.id_generator_interface import IdGeneratorInterface
+from wordlift_client import AccountInfo
+
+
+class IdGenerator(IdGeneratorInterface):
+    account: AccountInfo
+
+    def __init__(self, account: AccountInfo):
+        self.account = account
+
+    def slugify(self, input_string: str) -> str:
+        if not isinstance(input_string, str):
+            return ''
+
+        # Insert dash between camelCase or PascalCase transitions: e.g. "ProfilePage" -> "Profile-Page"
+        input_string = re.sub(r'(?<=[a-z])(?=[A-Z])', '-', input_string)
+
+        # Normalize diacritics
+        slug = unicodedata.normalize('NFD', input_string)
+        slug = ''.join(c for c in slug if unicodedata.category(c) != 'Mn')
+
+        # Remove punctuation, convert to lowercase, format dashes
+        slug = re.sub(r'[^\w\s-]', '', slug)  # remove punctuation
+        slug = re.sub(r'\s+', '-', slug)  # replace spaces with dashes
+        slug = re.sub(r'-+', '-', slug)  # collapse multiple dashes
+        slug = slug.strip('-')  # trim leading/trailing dashes
+        slug = slug.lower()
+
+        return slug
+
+    def create(self, *args):
+        full_url = self.account.dataset_uri
+        for arg in args:
+            full_url = full_url.rstrip('/') + '/' + self.slugify(arg)
+
+        return full_url
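
To make the slugify/create behavior above concrete, a small sketch; assume account is a wordlift_client.AccountInfo obtained elsewhere (e.g. via the account API) whose dataset_uri is "https://data.example.org/wl123/".

    generator = IdGenerator(account)

    # slugify inserts dashes at camelCase boundaries, strips diacritics and punctuation,
    # and lowercases: expected "profile-page-elan".
    print(generator.slugify("ProfilePage Élan"))

    # create() appends one slugified segment per argument to the dataset URI:
    # expected "https://data.example.org/wl123/blog/profile-page".
    print(generator.create("Blog", "ProfilePage"))
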
wordlift_sdk/id_generator/id_generator_interface.py
@@ -0,0 +1,8 @@
+from abc import ABC, abstractmethod
+
+
+class IdGeneratorInterface(ABC):
+
+    @abstractmethod
+    def create(self, *args):
+        pass
wordlift_sdk/internal_link/__init__.py
@@ -0,0 +1,3 @@
+from .utils import create_internal_link_handler
+
+__all__ = ['create_internal_link_handler']
wordlift_sdk/internal_link/utils.py
@@ -0,0 +1,231 @@
+import logging
+import re
+from typing import Callable, Awaitable
+from urllib.parse import quote
+
+import wordlift_client
+from pandas import Series
+from rdflib import Graph, URIRef, RDF, Literal, XSD
+from tenacity import retry, stop_after_attempt, wait_fixed
+from wordlift_client import InternalLinkRequest, InternalLink, InternalLinksApi, AnchorText, Item, \
+    VectorSearchQueryRequest, EntityPatchRequest, Configuration
+
+from wordlift_sdk import entity
+
+logger = logging.getLogger(__name__)
+
+
+async def create_internal_link_request_default_filter(row: Series, request: InternalLinkRequest) -> InternalLinkRequest:
+    return request
+
+
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_fixed(2)
+)
+async def create_internal_link(
+        configuration: Configuration,
+        row: Series,
+        no_links: int = 10,
+        internal_link_request_filter: Callable[
+            [Series, InternalLinkRequest], Awaitable[
+                InternalLinkRequest]] = create_internal_link_request_default_filter
+) -> InternalLink | None:
+    import wordlift_client
+    entity_url = row['url']
+    entity_id = row['iri']
+
+    async with wordlift_client.ApiClient(configuration) as api_client:
+        api = InternalLinksApi(api_client)
+        request = await internal_link_request_filter(
+            row,
+            InternalLinkRequest(
+                anchor_text=AnchorText(
+                    enabled=True
+                ),
+                items=[
+                    Item(
+                        id=entity_id,
+                        query=VectorSearchQueryRequest(
+                            query_url=entity_url,
+                            similarity_top_k=no_links
+                        )
+                    )
+                ]
+            )
+        )
+
+        try:
+            results = await api.create_internal_link_suggestion(internal_link_request=request, _request_timeout=120)
+            return results[0]
+        except Exception as e:
+            logger.error("Error creating Internal Links: %s", e)
+            raise e
+
+
+class InternalLinkData:
+    source_id: str
+    source_patch_request: EntityPatchRequest
+    link_group_graph: Graph
+
+    def __init__(self, source_id: str, source_patch_request: EntityPatchRequest, link_group_graph: Graph):
+        self.source_id = source_id
+        self.source_patch_request = source_patch_request
+        self.link_group_graph = link_group_graph
+
+
+async def create_internal_link_data(internal_link: InternalLink, group_id: str) -> InternalLinkData:
+    """
+    Create an RDFlib Graph from an InternalLink object using the SEO vocabulary.
+
+    Args:
+        internal_link: InternalLink object from wordlift_client
+
+    Returns:
+        RDFlib Graph containing the mapped data
+    :param group_id:
+    """
+
+    # This is an example structure:
+    #
+    # InternalLink(
+    #     destinations=[
+    #         InternalLinkDestination(
+    #             name='SEO Strategies',
+    #             position=1,
+    #             url='https://wordlift.io/blog/en/advanced-seo-natural-language-processing/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='SERP Analysis',
+    #             position=2,
+    #             url='https://wordlift.io/blog/en/serp-analysis/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='Semantic Search',
+    #             position=3,
+    #             url='https://wordlift.io/blog/en/semantic-search/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='Text Summarize',
+    #             position=4,
+    #             url='https://wordlift.io/blog/en/text-summarization-in-seo/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='RankBrain In SEO',
+    #             position=5,
+    #             url='https://wordlift.io/blog/en/rankbrain-will-make-blog-worthless-unless/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='SEO and AI',
+    #             position=6,
+    #             url='https://wordlift.io/blog/en/how-expert-professional-seo-evolves-with-ai/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='Content Optimize',
+    #             position=7,
+    #             url='https://wordlift.io/blog/en/seo-content-optimization/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='Google Advances',
+    #             position=8,
+    #             url='https://wordlift.io/blog/en/advances-in-image-understanding/'
+    #         ),
+    #         InternalLinkDestination(
+    #             name='Knowledge Graphs',
+    #             position=9,
+    #             url='https://wordlift.io/blog/en/finding-entities-knowledge-graphs/'
+    #         )
+    #     ],
+    #     source=InternalLinkSource(
+    #         id='https://data.wordlift.io/wl1505904/title-tag-seo-using-deep-learning-and-tensorflow-3e9202b7c7a6fde83605021a5820ab04',
+    #         name=None,
+    #         url='https://wordlift.io/blog/en/title-tag-seo-using-ai/'
+    #     )
+    # )
+
+    # Validate group_id
+    if not group_id or not isinstance(group_id, str):
+        raise ValueError("group_id must be a non-empty string")
+
+    # Check for valid characters (alphanumeric, hyphen, underscore)
+    if not re.match(r'^[a-zA-Z0-9\-_]+$', group_id):
+        raise ValueError("group_id must contain only alphanumeric characters, hyphens, or underscores")
+
+    # URL encode the group_id for extra safety
+    safe_group_id = quote(group_id)
+
+    link_group_graph = Graph()
+    source_graph = Graph()
+
+    source_graph.bind("seovoc", "https://w3id.org/seovoc/")
+
+    # Define namespaces
+    link_group_graph.bind("seovoc", "https://w3id.org/seovoc/")
+    link_group_graph.bind("xsd", "http://www.w3.org/2001/XMLSchema#")
+
+    # Create source resource
+    source = internal_link.source
+    source_resource = URIRef(source.id)
+
+    # Create a default link group for the destinations
+    link_group_id = f"{source.id}/linkgroup_{safe_group_id}"
+    link_group = URIRef(link_group_id)
+
+    has_link_group = URIRef("https://w3id.org/seovoc/hasLinkGroup")
+    source_graph.add((source_resource, has_link_group, link_group))
+
+    link_group_graph.add((link_group, RDF.type, URIRef("https://w3id.org/seovoc/LinkGroup")))
+    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/identifier"), Literal(group_id)))
+    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/name"), Literal("Related Links")))
+    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/isLinkGroupOf"), source_resource))
+
+    # Add destinations as links
+    for dest in internal_link.destinations:
+        # Create link resource
+        link_id = f"{link_group_id}/link_{dest.position}"
+        link_resource = URIRef(link_id)
+        link_group_graph.add((link_resource, RDF.type, URIRef("https://w3id.org/seovoc/Link")))
+
+        # Add link properties
+        link_group_graph.add(
+            (link_resource, URIRef("https://w3id.org/seovoc/position"), Literal(dest.position, datatype=XSD.integer)))
+        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/name"), Literal(dest.name)))
+        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorText"), Literal(dest.name)))
+        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorValue"), URIRef(dest.url)))
+        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorResource"), URIRef(dest.id)))
+        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/isLinkOf"), link_group))
+        link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/hasLink"), link_resource))
+
+    source_patch_request = EntityPatchRequest(
+        op='add',
+        path='/' + str(has_link_group),
+        value=source_graph.serialize(format='json-ld', auto_compact=True)
+    )
+
+    return InternalLinkData(source.id, source_patch_request, link_group_graph)
+
+
+def create_internal_link_handler(
+        configuration: Configuration,
+        link_group_id: str,
+        no_links: int = 10,
+        internal_link_request_filter: Callable[
+            [Series, InternalLinkRequest], Awaitable[InternalLinkRequest]] = create_internal_link_request_default_filter
+) -> Callable[[Series], Awaitable[None]]:
+    async def handle(row: Series) -> None:
+        response = await create_internal_link(configuration, row, no_links, internal_link_request_filter)
+
+        if not response:
+            return
+
+        data = await create_internal_link_data(response, link_group_id)
+
+        await entity.patch(configuration, data.source_id, [data.source_patch_request])
+
+        async with wordlift_client.ApiClient(configuration) as api_client:
+            # Create an instance of the API class
+            api_instance = wordlift_client.EntitiesApi(api_client)
+            body = data.link_group_graph.serialize(format="turtle")
+            await api_instance.create_or_update_entities(body, _content_type="text/turtle")
+
+    return handle
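
The handler returned by create_internal_link_handler expects a pandas Series carrying 'url' and 'iri' values, typically a row of an entities DataFrame built elsewhere in the SDK. A hedged usage sketch; configuration is assumed to be a wordlift_client.Configuration already set up with an API key (for instance via the SDK's client_configuration_factory).

    import pandas as pd

    async def link_all(configuration):
        handle = create_internal_link_handler(configuration, link_group_id="related-links", no_links=5)

        # Hypothetical rows; only the 'url' and 'iri' columns are read by the handler.
        rows = pd.DataFrame([
            {"url": "https://example.org/blog/post-1", "iri": "https://data.example.org/wl123/post-1"},
        ])

        for _, row in rows.iterrows():
            # Each call suggests links, patches the source entity, and uploads the link-group graph.
            await handle(row)
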
wordlift_sdk/kg/__init__.py
@@ -0,0 +1,5 @@
+from .entity import Entity
+from .entity_store import EntityStore
+from .entity_store_factory import EntityStoreFactory
+
+__all__ = ["Entity", "EntityStore", "EntityStoreFactory"]
wordlift_sdk/kg/entity.py
@@ -0,0 +1,17 @@
+class Entity:
+    _props: dict[str, str]
+
+    def __init__(self, props: dict[str, str]):
+        self._props = props
+
+    @property
+    def iri(self) -> str:
+        return self._props['iri']
+
+    @property
+    def url(self) -> str:
+        return self._props['url']
+
+    @staticmethod
+    def from_dict(values: dict[str, str]):
+        return Entity(values)
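
Entity is a thin read-only wrapper over a property dict; a minimal sketch with made-up values:

    e = Entity.from_dict({"iri": "https://data.example.org/wl123/post-1",
                          "url": "https://example.org/blog/post-1"})
    print(e.iri, e.url)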