wordlift-sdk 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +3 -0
- wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk/graphql/client/client.py +69 -0
- wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk/main.py +7 -0
- wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
- wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
- wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
- wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, AsyncGenerator
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from gql import Client, gql
|
|
6
|
+
from pandas.core.interchange.dataframe_protocol import DataFrame
|
|
7
|
+
from tenacity import AsyncRetrying, stop_after_attempt, wait_fixed, RetryError
|
|
8
|
+
|
|
9
|
+
from wordlift_sdk.kg import Entity
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EntityStore:
    """Look up Knowledge Graph entities by URL through a GraphQL client.

    Every lookup sends the URL list to the ``entities`` GraphQL field and
    exposes the matches either as an async stream of ``Entity`` objects or as
    a pandas DataFrame.
    """

    _gql_client: Client

    # Retry policy applied to every GraphQL call issued by the store.
    _MAX_ATTEMPTS = 3
    _WAIT_SECONDS = 2

    # Returns the entity IRI under the ``id`` alias.
    _URL_ID_QUERY = """
        query entities_url_id($urls: [String]!) {
          entities(query: { urlConstraint: { in: $urls } } ) {
            url: string(name:"schema:url")
            id: iri
          }
        }
    """

    # Returns the entity IRI under its own ``iri`` name.
    _URL_IRI_QUERY = """
        query entities_url_iri($urls: [String]!) {
          entities(query: { urlConstraint: { in: $urls } } ) {
            url: string(name:"schema:url")
            iri
          }
        }
    """

    def __init__(self, gql_client: Client):
        """Store the gql client used to execute the queries."""
        self._gql_client = gql_client

    async def _fetch_entities(self, query_text: str, url_list) -> list:
        """Execute ``query_text`` for ``url_list`` and return the entities.

        The request and the conversion of the response are retried together
        (up to ``_MAX_ATTEMPTS`` attempts, ``_WAIT_SECONDS`` apart). The
        entities are materialised inside the attempt and handed back only once
        an attempt has fully succeeded, so a retry can never replay entities
        that a caller has already consumed.

        :param query_text: the GraphQL document, taking a ``$urls`` variable.
        :param url_list: the URLs passed as the ``urls`` variable.
        :return: the list of ``Entity`` objects; empty when every attempt
            failed (the failure is logged, not raised).
        """
        query = gql(query_text)
        values = {"urls": url_list}

        entities = []
        try:
            async for attempt in AsyncRetrying(
                    stop=stop_after_attempt(self._MAX_ATTEMPTS),
                    wait=wait_fixed(self._WAIT_SECONDS)):
                with attempt:
                    logger.debug(
                        'Loading data from GraphQL with attempt %d', attempt.retry_state.attempt_number)
                    response = await self._gql_client.execute_async(query, variable_values=values)
                    # Built inside the attempt so that a malformed response is
                    # retried too, exactly like a failed request.
                    entities = [Entity(item) for item in response['entities']]
        except RetryError:
            # Best effort: callers receive no entities instead of an exception.
            logger.error('Error loading data from GraphQL', exc_info=True)
            return []

        return entities

    async def url_id(self, url_list: List[str] = None) -> AsyncGenerator[Entity, None]:
        """Yield the entities matching ``url_list``, queried as ``url``/``id`` pairs.

        :param url_list: the URLs to look up.
        :return: an async generator of ``Entity``; it yields nothing when the
            GraphQL call keeps failing.
        """
        for entity in await self._fetch_entities(self._URL_ID_QUERY, url_list):
            yield entity

    async def url_iri(self, url_list: List[str] = None) -> AsyncGenerator[Entity, None]:
        """Yield the entities matching ``url_list``, queried as ``url``/``iri`` pairs.

        :param url_list: the URLs to look up.
        :return: an async generator of ``Entity``; it yields nothing when the
            GraphQL call keeps failing.
        """
        for entity in await self._fetch_entities(self._URL_IRI_QUERY, url_list):
            yield entity

    async def url_id_as_dataframe(self, url_list: List[str] = None) -> pd.DataFrame:
        """
        Get the Entity URL to ID map as a Pandas dataframe.

        :param url_list: the URLs to look up.
        :return: a dataframe with the ``url`` and ``id`` columns, one row per entity.
        """
        # NOTE(review): the query aliases the IRI as ``id`` while this reads
        # ``entity.iri`` - confirm that ``Entity`` exposes it under that name.
        rows = [(entity.url, entity.iri) async for entity in self.url_id(url_list)]
        return pd.DataFrame.from_records(data=rows, columns=("url", "id"))

    async def url_iri_as_dataframe(self, url_list: List[str] = None) -> pd.DataFrame:
        """
        Get the Entity URL to IRI map as a Pandas dataframe.

        :param url_list: the URLs to look up.
        :return: a dataframe with the ``url`` and ``iri`` columns, one row per entity.
        """
        rows = [(entity.url, entity.iri) async for entity in self.url_iri(url_list)]
        return pd.DataFrame.from_records(data=rows, columns=("url", "iri"))
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from gql import Client
|
|
2
|
+
|
|
3
|
+
from wordlift_sdk.kg import EntityStore
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EntityStoreFactory:
    """Factory that builds ``EntityStore`` instances sharing one gql client."""

    _gql_client: Client

    def __init__(self, gql_client: Client):
        """Keep the gql client that is handed to every store created here."""
        self._gql_client = gql_client

    def create(self):
        """Return a new ``EntityStore`` backed by the factory's gql client."""
        client = self._gql_client
        return EntityStore(client)
|
|
File without changes
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import wordlift_client
|
|
2
|
+
from wordlift_client import Configuration, VectorSearchQueriesApi, VectorSearchQueryRequest
|
|
3
|
+
|
|
4
|
+
from wordlift_sdk.kg import Entity
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class RelationService:
    """Find pages related to an entity through the vector search API."""

    _configuration: Configuration

    # Fields requested for every search hit.
    _FIELDS = (
        "schema:url", "schema:headline", "ex-private:category", "ex-private:subCategory",
        "ex-private:subSubCategory", "ex-private:subSubSubCategory", "ex-private:location",
        "ex-private:pagesSection",
    )

    def __init__(self, configuration: Configuration):
        """Store the API client configuration used for the search calls."""
        self._configuration = configuration

    @staticmethod
    def _safe_get_field(item, field_name):
        """Return the (first) value of ``field_name`` on a search hit, or ``None``."""
        # NOTE(review): assumes each hit exposes a ``fields`` mapping of field
        # name -> value or list of values - confirm against the
        # ``wordlift_client`` response model.
        fields = getattr(item, "fields", None) or {}
        value = fields.get(field_name)
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    async def get_relations(self, entity: Entity, *, main_category=None, sub_category=None,
                            sub_sub_category=None, sub_sub_sub_category=None, location=None):
        """Search, filter and re-rank the pages related to ``entity``.

        A vector search is run for ``entity.url``. The hit for the entity
        itself is dropped; the remaining hits are kept when at least one of
        their category levels equals the corresponding reference category.
        Kept hits get their score boosted by 0.2 when their location equals
        ``location`` and by 0.1 when their pages section is ``"Top"``.

        :param entity: the entity whose ``url`` is used as the search query.
        :param main_category: reference value for ``ex-private:category``.
        :param sub_category: reference value for ``ex-private:subCategory``.
        :param sub_sub_category: reference value for ``ex-private:subSubCategory``.
        :param sub_sub_sub_category: reference value for
            ``ex-private:subSubSubCategory``.
        :param location: reference value for ``ex-private:location``.
        :return: a list of ``{"url", "headline", "score"}`` dicts sorted by
            descending score; an empty list when the search fails (the error
            is logged, not raised).

        When no reference category is given at all, category filtering is
        skipped. ``None`` references never match, so hits that merely lack a
        field are not treated as matching.
        """
        # Function-scope import: the module does not define a logger.
        import logging
        logger = logging.getLogger(__name__)

        entity_url = entity.url
        related_request = VectorSearchQueryRequest(
            query_url=entity_url,
            similarity_top_k=100,
            fields=list(self._FIELDS),
        )

        async with wordlift_client.ApiClient(self._configuration) as api_client:
            # Search for related pages
            search_api = VectorSearchQueriesApi(api_client)
            try:
                related_page = await search_api.create_query(vector_search_query_request=related_request)
            except Exception as e:
                # Deliberate best effort: report the failure and return no relations.
                logger.error("Error during vector search: %s", e, exc_info=True)
                return []

        items = related_page.items or []
        logger.debug("Number of related items found: %d", len(items))

        # Only the category levels the caller supplied take part in the match.
        references = {
            "ex-private:category": main_category,
            "ex-private:subCategory": sub_category,
            "ex-private:subSubCategory": sub_sub_category,
            "ex-private:subSubSubCategory": sub_sub_sub_category,
        }
        constraints = {name: value for name, value in references.items() if value is not None}

        # Filter and re-rank results
        get_field = self._safe_get_field
        filtered_results = []
        for item in items:
            item_url = get_field(item, "schema:url")
            if item_url == entity_url:
                logger.debug("Skipping original entity: %s", item_url)
                continue

            if constraints and not any(
                    get_field(item, name) == value for name, value in constraints.items()):
                logger.debug("Item %s matches none of the category levels, skipped", item_url)
                continue

            score = item.score or 0.0
            if location is not None and get_field(item, "ex-private:location") == location:
                score += 0.2
            if get_field(item, "ex-private:pagesSection") == "Top":
                score += 0.1

            filtered_results.append({
                "url": item_url,
                "headline": get_field(item, "schema:headline"),
                "score": score,
            })
            logger.debug("Item %s added to filtered results. Score: %s", item_url, score)

        # The boosts only matter once the hits are ordered by their final score.
        filtered_results.sort(key=lambda result: result["score"], reverse=True)
        return filtered_results
|
wordlift_sdk/main.py
ADDED