wordlift-sdk 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +3 -0
- wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk/graphql/client/client.py +69 -0
- wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk/main.py +7 -0
- wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
- wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
- wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
- wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
wordlift_sdk/graphql/client/client.py
@@ -0,0 +1,69 @@
import os.path
from typing import Any, Optional, Dict, List

from gql import gql
from graphql import parse, OperationDefinitionNode, FieldNode, DocumentNode

from .gql_client_provider import GqlClientProvider


class GraphQlQuery:
    query: DocumentNode
    fields: List[str]

    def __init__(self, query: str):
        self.query = gql(query)
        self.fields = self.extract_field_names(query)

    def get_query(self) -> DocumentNode:
        return self.query

    def get_fields(self) -> List[str]:
        return self.fields

    def extract_field_names(self, query_str):
        parsed = parse(query_str)
        for definition in parsed.definitions:
            if isinstance(definition, OperationDefinitionNode):
                for selection in definition.selection_set.selections:
                    if selection.name.value == "entities":
                        return [
                            field.alias.value if field.alias else field.name.value
                            for field in selection.selection_set.selections
                            if isinstance(field, FieldNode)
                        ]
        return []


file_contents = {}

filenames = [
    "entities_top_query.graphql",
    "entities_url_id.graphql",
    "entities_url_iri.graphql",
]
base_dir = os.path.dirname(os.path.abspath(__file__))

for filename in filenames:
    filepath = os.path.join(base_dir, "../data", filename)
    with open(filepath, "r", encoding="utf-8") as f:
        query = f.read()
        file_contents[filename] = GraphQlQuery(query)


class GraphQlClient:
    _client_provider: GqlClientProvider

    def __init__(self, client_provider: GqlClientProvider):
        self._client_provider = client_provider

    async def run(
        self, graphql: str, variables: Optional[Dict[str, Any]] = None
    ) -> list[dict[str, Any]]:
        query = file_contents[graphql + ".graphql"]

        # Asynchronous function to execute the query
        async with self._client_provider.create() as session:
            response = await session.execute(query.query, variable_values=variables)

        return response["entities"]
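A quick usage sketch (not part of the package) of GraphQlQuery's field extraction; the query text below is a trimmed copy of entities_top_query.graphql, and the import path assumes the module layout shown above:

from wordlift_sdk.graphql.client.client import GraphQlQuery

# Trimmed copy of entities_top_query.graphql, enough to show alias handling.
q = GraphQlQuery("""
query entities_top_query($urls: [String]!) {
  entities(query: { urlConstraint: { in: $urls } }) {
    iri
    url: string(name: "schema:url")
  }
}
""")

# Aliases win over raw field names, so this prints ['iri', 'url'].
print(q.get_fields())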
wordlift_sdk/graphql/client/factory.py
@@ -0,0 +1,36 @@
from gql import Client
from gql.transport.aiohttp import AIOHTTPTransport

from .client import GraphQlClient
from .gql_client_provider import GqlClientProvider


class GraphQlClientFactory:
    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io/graphql"):
        self._api_url = api_url
        self._key = key

    def create(self) -> GraphQlClient:
        return GraphQlClient(self.create_provider())

    def create_transport(self) -> AIOHTTPTransport:
        # Select your transport with a defined url endpoint
        return AIOHTTPTransport(
            url=self._api_url,
            ssl=True,
            headers={"Authorization": f"Key {self._key}", "X-include-Private": "true"},
        )

    def create_gql_client(self) -> Client:
        # Create a GraphQL client using the defined transport
        return Client(
            transport=self.create_transport(),
            fetch_schema_from_transport=False,
            execute_timeout=120,
        )

    def create_provider(self) -> GqlClientProvider:
        return GqlClientProvider(key=self._key, api_url=self._api_url)
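A minimal end-to-end sketch, assuming a valid WordLift key ("YOUR_KEY" and the example URL below are placeholders): the factory builds a GqlClientProvider-backed GraphQlClient, and run() resolves one of the bundled .graphql files by name and returns the list under the "entities" root.

import asyncio

from wordlift_sdk.graphql.client import GraphQlClientFactory


async def main() -> None:
    client = GraphQlClientFactory(key="YOUR_KEY").create()  # placeholder key

    # "entities_top_query" resolves to the bundled entities_top_query.graphql file.
    entities = await client.run(
        "entities_top_query",
        {"urls": ["https://example.org/some-page/"]},
    )
    print(entities)


asyncio.run(main())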
wordlift_sdk/graphql/client/gql_client_provider.py
@@ -0,0 +1,26 @@
from gql import Client
from gql.transport.aiohttp import AIOHTTPTransport


class GqlClientProvider:
    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io/graphql"):
        self._api_url = api_url
        self._key = key

    def _create_transport(self) -> AIOHTTPTransport:
        # Select your transport with a defined url endpoint
        return AIOHTTPTransport(
            url=self._api_url,
            ssl=True,
            headers={"Authorization": f"Key {self._key}", "X-include-Private": "true"},
        )

    def create(self) -> Client:
        return Client(
            transport=self._create_transport(),
            fetch_schema_from_transport=False,
            execute_timeout=120,
        )
wordlift_sdk/graphql/data/entities_embedding_value.graphql
@@ -0,0 +1,15 @@
query entities_embedding_value($url: String!) {
  entities(
    page: 0
    rows: 1
    query: {
      urlConstraint: { in: [$url] }
      embeddingValueConstraint: {
        exists: { exists: true, excludeEmpty: false }
      }
    }
  ) {
    iri
    embedding_value: string(name: "seovoc:embeddingValue")
  }
}
wordlift_sdk/graphql/data/entities_top_query.graphql
@@ -0,0 +1,20 @@
query entities_top_query($urls: [String]!) {
  entities(query: { urlConstraint: { in: $urls } }) {
    iri
    url: string(name: "schema:url")
    name: string(name: "schema:name")
    headline: string(name: "schema:headline")
    title: string(name: "schema:title")
    top_query: topN(
      name: "seovoc:hasQuery"
      sort: { field: "seovoc:impressions3Months", direction: DESC }
      limit: 1
    ) {
      iri
      name: string(name: "seovoc:name")
      impressions: int(name: "seovoc:impressions3Months")
      clicks: int(name: "seovoc:clicks3Months")
      date_created: date(name: "seovoc:dateCreated")
    }
  }
}
wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql
@@ -0,0 +1,12 @@
query entities_url_iri_with_source_equal_to_web_page_import($urls: [String]!) {
  entities(
    query: {
      urlConstraint: { in: $urls }
      typeConstraint: { in: [ "web-page-import" ] }
    }
  ) {
    iri
    url: string(name: "schema:url")
    date_imported: dateTime(name: "seovoc:dateImported")
  }
}
wordlift_sdk/graphql/query.py
@@ -0,0 +1,20 @@
from typing import Dict, Optional, Any
from ..graphql.client import GraphQlClientFactory

import pandas as pd


async def query(key: str, query_string: str, root_element: str, columns: list[str],
                variable_values: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    from gql import gql

    # Create a GraphQL client using the defined transport
    client = GraphQlClientFactory(key=key).create_gql_client()

    # Define the GraphQL query
    gql_query = gql(query_string)

    # Asynchronous function to execute the query
    async with client as session:
        response = await session.execute(gql_query, variable_values=variable_values)
        return pd.DataFrame(response[root_element], columns=columns)
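A usage sketch for the query() helper above (key, URL, and query text are illustrative): it executes an arbitrary GraphQL string and tabulates the chosen root element into a pandas DataFrame with the given columns.

import asyncio

from wordlift_sdk.graphql.query import query

QUERY = """
query($urls: [String]!) {
  entities(query: { urlConstraint: { in: $urls } }) {
    iri
    url: string(name: "schema:url")
  }
}
"""


async def main() -> None:
    df = await query(
        key="YOUR_KEY",  # placeholder credential
        query_string=QUERY,
        root_element="entities",  # response key to turn into rows
        columns=["iri", "url"],
        variable_values={"urls": ["https://example.org/some-page/"]},
    )
    print(df.head())


asyncio.run(main())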
File without changes
wordlift_sdk/graphql/utils/query/entity_top_query.py
@@ -0,0 +1,56 @@
from dataclasses import dataclass, field, asdict
from typing import Optional

import pandas as pd


@dataclass
class EntityTopQuery:
    iri: str
    url: str
    name: str
    headline: str
    title: str
    top_query_iri: Optional[str] = field(default=None)
    top_query_name: Optional[str] = field(default=None)
    top_query_impressions: Optional[int] = field(default=None)
    top_query_clicks: Optional[int] = field(default=None)
    top_query_date_created: Optional[str] = field(default=None)

    @staticmethod
    def from_graphql_response(entity_data: dict) -> "EntityTopQuery":
        # Initialize top_query fields with default values
        top_query_iri = top_query_name = top_query_impressions = top_query_clicks = top_query_date_created = None

        # Check if there are any top queries
        if entity_data.get('top_query'):
            top_query_data = entity_data['top_query'][0]
            top_query_iri = top_query_data.get('iri')
            top_query_name = top_query_data.get('name')
            top_query_impressions = top_query_data.get('impressions')
            top_query_clicks = top_query_data.get('clicks')
            top_query_date_created = top_query_data.get('date_created')

        # Create an Entity instance
        return EntityTopQuery(
            iri=entity_data['iri'],
            url=entity_data['url'],
            name=entity_data['name'],
            headline=entity_data['headline'],
            title=entity_data['title'],
            top_query_iri=top_query_iri,
            top_query_name=top_query_name,
            top_query_impressions=top_query_impressions,
            top_query_clicks=top_query_clicks,
            top_query_date_created=top_query_date_created
        )

    def to_dataframe(self) -> pd.DataFrame:
        entities_with_top_query_df = pd.DataFrame([asdict(self)])
        entities_with_top_query_df['calc_name'] = entities_with_top_query_df[
            ['name', 'headline', 'title', 'url']].bfill(
            axis=1).iloc[:, 0]
        entities_with_top_query_df['top_query_date_created'] = pd.to_datetime(
            entities_with_top_query_df['top_query_date_created'], errors='coerce')

        return entities_with_top_query_df
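A small offline sketch of how EntityTopQuery consumes one element of the entities_top_query.graphql response; the payload below is illustrative, shaped after the query's aliases.

from wordlift_sdk.graphql.utils.query.entity_top_query import EntityTopQuery

# Illustrative payload, shaped like one element of the "entities" list.
entity_data = {
    "iri": "https://data.example.org/entity/123",
    "url": "https://example.org/some-page/",
    "name": "Some Page",
    "headline": None,
    "title": None,
    "top_query": [
        {
            "iri": "https://data.example.org/query/456",
            "name": "some page",
            "impressions": 120,
            "clicks": 7,
            "date_created": "2024-01-01",
        }
    ],
}

record = EntityTopQuery.from_graphql_response(entity_data)
df = record.to_dataframe()

# calc_name backfills across name/headline/title/url, so it resolves to "Some Page".
print(df[["calc_name", "top_query_name", "top_query_impressions"]])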
wordlift_sdk/graphql/utils/query/entity_with_top_query.py
@@ -0,0 +1,52 @@
from typing import Optional, Awaitable, Callable

from tenacity import stop_after_attempt, retry, wait_fixed

from .entity_top_query import EntityTopQuery
from ...client import GraphQlClientFactory


async def entity_with_top_query_factory(
        key: str,
) -> Callable[[str], Awaitable[Optional[EntityTopQuery]]]:
    @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
    async def entity_with_top_query(url: str) -> Optional[EntityTopQuery]:
        from gql import gql

        # Create a GraphQL client using the defined transport
        client = GraphQlClientFactory(key=key).create_gql_client()

        # Define the GraphQL query
        gql_query = gql("""
            query($url: String!) {
              entities(query: { urlConstraint: { in: [$url] } }) {
                iri
                url: string(name: "schema:url")
                name: string(name: "schema:name")
                headline: string(name: "schema:headline")
                title: string(name: "schema:title")
                top_query: topN(
                  name: "seovoc:hasQuery"
                  sort: { field: "seovoc:impressions3Months", direction: DESC }
                  limit: 1
                ) {
                  iri
                  name: string(name: "seovoc:name")
                  impressions: int(name: "seovoc:impressions3Months")
                  clicks: int(name: "seovoc:clicks3Months")
                  date_created: date(name: "seovoc:dateCreated")
                }
              }
            }
        """)

        # Asynchronous function to execute the query
        async with client as session:
            response = await session.execute(gql_query, variable_values={"url": url})

        if len(response["entities"]) == 0:
            return None

        return EntityTopQuery.from_graphql_response(response["entities"][0])

    return entity_with_top_query
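A sketch of using the factory above (key and URL are placeholders): the factory coroutine returns a retry-wrapped fetch function that yields an EntityTopQuery, or None when the URL has no matching entity.

import asyncio

from wordlift_sdk.graphql.utils.query.entity_with_top_query import entity_with_top_query_factory


async def main() -> None:
    fetch = await entity_with_top_query_factory(key="YOUR_KEY")  # placeholder key
    result = await fetch("https://example.org/some-page/")
    if result is None:
        print("No entity found for that URL")
    else:
        print(result.to_dataframe())


asyncio.run(main())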
wordlift_sdk/id_generator/id_generator.py
@@ -0,0 +1,40 @@
import re
import unicodedata
from urllib.parse import urljoin

from wordlift_sdk.id_generator.id_generator_interface import IdGeneratorInterface
from wordlift_client import AccountInfo


class IdGenerator(IdGeneratorInterface):
    account: AccountInfo

    def __init__(self, account: AccountInfo):
        self.account = account

    def slugify(self, input_string: str) -> str:
        if not isinstance(input_string, str):
            return ''

        # Insert dash between camelCase or PascalCase transitions: e.g. "ProfilePage" -> "Profile-Page"
        input_string = re.sub(r'(?<=[a-z])(?=[A-Z])', '-', input_string)

        # Normalize diacritics
        slug = unicodedata.normalize('NFD', input_string)
        slug = ''.join(c for c in slug if unicodedata.category(c) != 'Mn')

        # Remove punctuation, convert to lowercase, format dashes
        slug = re.sub(r'[^\w\s-]', '', slug)  # remove punctuation
        slug = re.sub(r'\s+', '-', slug)  # replace spaces with dashes
        slug = re.sub(r'-+', '-', slug)  # collapse multiple dashes
        slug = slug.strip('-')  # trim leading/trailing dashes
        slug = slug.lower()

        return slug

    def create(self, *args):
        full_url = self.account.dataset_uri
        for arg in args:
            full_url = full_url.rstrip('/') + '/' + self.slugify(arg)

        return full_url
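An illustrative sketch of IdGenerator; constructing AccountInfo by hand with only dataset_uri is an assumption made for the example (the account normally comes from the WordLift account API), and the URI is a placeholder.

from wordlift_client import AccountInfo

from wordlift_sdk.id_generator.id_generator import IdGenerator

# Hand-built AccountInfo for illustration only; additional fields may be required.
account = AccountInfo(dataset_uri="https://data.example.org/acme")
generator = IdGenerator(account)

# CamelCase is split, diacritics and punctuation are stripped, spaces become dashes:
# https://data.example.org/acme/profile-page/caffe-latte-recipe
print(generator.create("ProfilePage", "Caffè Latte, Recipe!"))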
wordlift_sdk/internal_link/utils.py
@@ -0,0 +1,231 @@
import logging
import re
from typing import Callable, Awaitable
from urllib.parse import quote

import wordlift_client
from pandas import Series
from rdflib import Graph, URIRef, RDF, Literal, XSD
from tenacity import retry, stop_after_attempt, wait_fixed
from wordlift_client import InternalLinkRequest, InternalLink, InternalLinksApi, AnchorText, Item, \
    VectorSearchQueryRequest, EntityPatchRequest, Configuration

from wordlift_sdk import entity

logger = logging.getLogger(__name__)


async def create_internal_link_request_default_filter(row: Series, request: InternalLinkRequest) -> InternalLinkRequest:
    return request


@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(2)
)
async def create_internal_link(
        configuration: Configuration,
        row: Series,
        no_links: int = 10,
        internal_link_request_filter: Callable[
            [Series, InternalLinkRequest], Awaitable[
                InternalLinkRequest]] = create_internal_link_request_default_filter
) -> InternalLink | None:
    import wordlift_client
    entity_url = row['url']
    entity_id = row['iri']

    async with wordlift_client.ApiClient(configuration) as api_client:
        api = InternalLinksApi(api_client)
        request = await internal_link_request_filter(
            row,
            InternalLinkRequest(
                anchor_text=AnchorText(
                    enabled=True
                ),
                items=[
                    Item(
                        id=entity_id,
                        query=VectorSearchQueryRequest(
                            query_url=entity_url,
                            similarity_top_k=no_links
                        )
                    )
                ]
            )
        )

        try:
            results = await api.create_internal_link_suggestion(internal_link_request=request, _request_timeout=120)
            return results[0]
        except Exception as e:
            logger.error("Error creating Internal Links: %s", e)
            raise e


class InternalLinkData:
    source_id: str
    source_patch_request: EntityPatchRequest
    link_group_graph: Graph

    def __init__(self, source_id: str, source_patch_request: EntityPatchRequest, link_group_graph: Graph):
        self.source_id = source_id
        self.source_patch_request = source_patch_request
        self.link_group_graph = link_group_graph


async def create_internal_link_data(internal_link: InternalLink, group_id: str) -> InternalLinkData:
    """
    Create an RDFlib Graph from an InternalLink object using the SEO vocabulary.

    Args:
        internal_link: InternalLink object from wordlift_client

    Returns:
        RDFlib Graph containing the mapped data
    :param group_id:
    """

    # This is an example structure:
    #
    # InternalLink(
    #     destinations=[
    #         InternalLinkDestination(
    #             name='SEO Strategies',
    #             position=1,
    #             url='https://wordlift.io/blog/en/advanced-seo-natural-language-processing/'
    #         ),
    #         InternalLinkDestination(
    #             name='SERP Analysis',
    #             position=2,
    #             url='https://wordlift.io/blog/en/serp-analysis/'
    #         ),
    #         InternalLinkDestination(
    #             name='Semantic Search',
    #             position=3,
    #             url='https://wordlift.io/blog/en/semantic-search/'
    #         ),
    #         InternalLinkDestination(
    #             name='Text Summarize',
    #             position=4,
    #             url='https://wordlift.io/blog/en/text-summarization-in-seo/'
    #         ),
    #         InternalLinkDestination(
    #             name='RankBrain In SEO',
    #             position=5,
    #             url='https://wordlift.io/blog/en/rankbrain-will-make-blog-worthless-unless/'
    #         ),
    #         InternalLinkDestination(
    #             name='SEO and AI',
    #             position=6,
    #             url='https://wordlift.io/blog/en/how-expert-professional-seo-evolves-with-ai/'
    #         ),
    #         InternalLinkDestination(
    #             name='Content Optimize',
    #             position=7,
    #             url='https://wordlift.io/blog/en/seo-content-optimization/'
    #         ),
    #         InternalLinkDestination(
    #             name='Google Advances',
    #             position=8,
    #             url='https://wordlift.io/blog/en/advances-in-image-understanding/'
    #         ),
    #         InternalLinkDestination(
    #             name='Knowledge Graphs',
    #             position=9,
    #             url='https://wordlift.io/blog/en/finding-entities-knowledge-graphs/'
    #         )
    #     ],
    #     source=InternalLinkSource(
    #         id='https://data.wordlift.io/wl1505904/title-tag-seo-using-deep-learning-and-tensorflow-3e9202b7c7a6fde83605021a5820ab04',
    #         name=None,
    #         url='https://wordlift.io/blog/en/title-tag-seo-using-ai/'
    #     )
    # )

    # Validate group_id
    if not group_id or not isinstance(group_id, str):
        raise ValueError("group_id must be a non-empty string")

    # Check for valid characters (alphanumeric, hyphen, underscore)
    if not re.match(r'^[a-zA-Z0-9\-_]+$', group_id):
        raise ValueError("group_id must contain only alphanumeric characters, hyphens, or underscores")

    # URL encode the group_id for extra safety
    safe_group_id = quote(group_id)

    link_group_graph = Graph()
    source_graph = Graph()

    source_graph.bind("seovoc", "https://w3id.org/seovoc/")

    # Define namespaces
    link_group_graph.bind("seovoc", "https://w3id.org/seovoc/")
    link_group_graph.bind("xsd", "http://www.w3.org/2001/XMLSchema#")

    # Create source resource
    source = internal_link.source
    source_resource = URIRef(source.id)

    # Create a default link group for the destinations
    link_group_id = f"{source.id}/linkgroup_{safe_group_id}"
    link_group = URIRef(link_group_id)

    has_link_group = URIRef("https://w3id.org/seovoc/hasLinkGroup")
    source_graph.add((source_resource, has_link_group, link_group))

    link_group_graph.add((link_group, RDF.type, URIRef("https://w3id.org/seovoc/LinkGroup")))
    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/identifier"), Literal(group_id)))
    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/name"), Literal("Related Links")))
    link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/isLinkGroupOf"), source_resource))

    # Add destinations as links
    for dest in internal_link.destinations:
        # Create link resource
        link_id = f"{link_group_id}/link_{dest.position}"
        link_resource = URIRef(link_id)
        link_group_graph.add((link_resource, RDF.type, URIRef("https://w3id.org/seovoc/Link")))

        # Add link properties
        link_group_graph.add(
            (link_resource, URIRef("https://w3id.org/seovoc/position"), Literal(dest.position, datatype=XSD.integer)))
        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/name"), Literal(dest.name)))
        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorText"), Literal(dest.name)))
        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorValue"), URIRef(dest.url)))
        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/anchorResource"), URIRef(dest.id)))
        link_group_graph.add((link_resource, URIRef("https://w3id.org/seovoc/isLinkOf"), link_group))
        link_group_graph.add((link_group, URIRef("https://w3id.org/seovoc/hasLink"), link_resource))

    source_patch_request = EntityPatchRequest(
        op='add',
        path='/' + str(has_link_group),
        value=source_graph.serialize(format='json-ld', auto_compact=True)
    )

    return InternalLinkData(source.id, source_patch_request, link_group_graph)


def create_internal_link_handler(
        configuration: Configuration,
        link_group_id: str,
        no_links: int = 10,
        internal_link_request_filter: Callable[
            [Series, InternalLinkRequest], Awaitable[InternalLinkRequest]] = create_internal_link_request_default_filter
) -> Callable[[Series], Awaitable[None]]:
    async def handle(row: Series) -> None:
        response = await create_internal_link(configuration, row, no_links, internal_link_request_filter)

        if not response:
            return

        data = await create_internal_link_data(response, link_group_id)

        await entity.patch(configuration, data.source_id, [data.source_patch_request])

        async with wordlift_client.ApiClient(configuration) as api_client:
            # Create an instance of the API class
            api_instance = wordlift_client.EntitiesApi(api_client)
            body = data.link_group_graph.serialize(format="turtle")
            await api_instance.create_or_update_entities(body, _content_type="text/turtle")

    return handle
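A sketch of wiring the handler above into a per-row loop; the Configuration is hand-built with placeholder values (the SDK's client configuration factory normally produces it, and authentication setup is omitted), and the DataFrame rows only need the url and iri columns.

import asyncio

import pandas as pd
from wordlift_client import Configuration

from wordlift_sdk.internal_link.utils import create_internal_link_handler


async def main() -> None:
    # Placeholder configuration; credentials and auth headers are omitted in this sketch.
    configuration = Configuration(host="https://api.wordlift.io")

    handle = create_internal_link_handler(configuration, link_group_id="related-links", no_links=5)

    df = pd.DataFrame([
        {"iri": "https://data.example.org/entity/123", "url": "https://example.org/some-page/"},
    ])

    for _, row in df.iterrows():
        # Patches the source entity with a hasLinkGroup reference and writes the
        # link group graph back to the knowledge graph.
        await handle(row)


asyncio.run(main())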
wordlift_sdk/kg/entity.py
@@ -0,0 +1,17 @@
class Entity:
    _props: dict[str, str]

    def __init__(self, props: dict[str, str]):
        self._props = props

    @property
    def iri(self) -> str:
        return self._props['iri']

    @property
    def url(self) -> str:
        return self._props['url']

    @staticmethod
    def from_dict(values: dict[str, str]):
        return Entity(values)