wordlift-sdk 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. wordlift_sdk/__init__.py +3 -0
  2. wordlift_sdk/client/__init__.py +3 -0
  3. wordlift_sdk/client/client_configuration_factory.py +26 -0
  4. wordlift_sdk/configuration/__init__.py +4 -0
  5. wordlift_sdk/configuration/configuration_provider.py +44 -0
  6. wordlift_sdk/configuration/get_config_value.py +39 -0
  7. wordlift_sdk/container/__init__.py +3 -0
  8. wordlift_sdk/container/application_container.py +234 -0
  9. wordlift_sdk/deprecated/__init__.py +5 -0
  10. wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  11. wordlift_sdk/entity/__init__.py +4 -0
  12. wordlift_sdk/entity/enrich.py +54 -0
  13. wordlift_sdk/entity/patch.py +14 -0
  14. wordlift_sdk/google_search_console/__init__.py +5 -0
  15. wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  16. wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  17. wordlift_sdk/graph/graph_bag.py +7 -0
  18. wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  19. wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  20. wordlift_sdk/graphql/__init__.py +3 -0
  21. wordlift_sdk/graphql/client/__init__.py +5 -0
  22. wordlift_sdk/graphql/client/client.py +69 -0
  23. wordlift_sdk/graphql/client/factory.py +36 -0
  24. wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  25. wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  26. wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  27. wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  28. wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  29. wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  30. wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  31. wordlift_sdk/graphql/query.py +20 -0
  32. wordlift_sdk/graphql/utils/__init__.py +0 -0
  33. wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  34. wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  35. wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  36. wordlift_sdk/id_generator/__init__.py +3 -0
  37. wordlift_sdk/id_generator/id_generator.py +40 -0
  38. wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  39. wordlift_sdk/internal_link/__init__.py +3 -0
  40. wordlift_sdk/internal_link/utils.py +231 -0
  41. wordlift_sdk/kg/__init__.py +5 -0
  42. wordlift_sdk/kg/entity.py +17 -0
  43. wordlift_sdk/kg/entity_store.py +94 -0
  44. wordlift_sdk/kg/entity_store_factory.py +13 -0
  45. wordlift_sdk/kg/relation/__init__.py +0 -0
  46. wordlift_sdk/kg/relation/relation_service.py +78 -0
  47. wordlift_sdk/main.py +7 -0
  48. wordlift_sdk/namespace/SDO.py +3281 -0
  49. wordlift_sdk/namespace/__init__.py +3 -0
  50. wordlift_sdk/notebook/__init__.py +3 -0
  51. wordlift_sdk/notebook/install_if_missing.py +12 -0
  52. wordlift_sdk/protocol/__init__.py +5 -0
  53. wordlift_sdk/protocol/context.py +21 -0
  54. wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  55. wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  56. wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  57. wordlift_sdk/protocol/graph/__init__.py +3 -0
  58. wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  59. wordlift_sdk/protocol/load_override_class.py +30 -0
  60. wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  61. wordlift_sdk/url_source/__init__.py +6 -0
  62. wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  63. wordlift_sdk/url_source/list_url_source.py +28 -0
  64. wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  65. wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  66. wordlift_sdk/url_source/url_source.py +18 -0
  67. wordlift_sdk/url_source/url_source_input.py +6 -0
  68. wordlift_sdk/utils/__init__.py +17 -0
  69. wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  70. wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  71. wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  72. wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  73. wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  74. wordlift_sdk/utils/delayed.py +12 -0
  75. wordlift_sdk/utils/get_me.py +8 -0
  76. wordlift_sdk/utils/import_url.py +35 -0
  77. wordlift_sdk/wordlift/__init__.py +0 -0
  78. wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  79. wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  80. wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  81. wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  82. wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  83. wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  84. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  85. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  86. wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  87. wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  88. wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  89. wordlift_sdk/workflow/__init__.py +3 -0
  90. wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  91. wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  92. wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  93. wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  94. wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  95. wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  96. wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  97. wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
  98. wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
  99. wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
  100. wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,94 @@
1
+ import logging
2
+ from typing import List, AsyncGenerator
3
+
4
+ import pandas as pd
5
+ from gql import Client, gql
6
+ from pandas.core.interchange.dataframe_protocol import DataFrame
7
+ from tenacity import AsyncRetrying, stop_after_attempt, wait_fixed, RetryError
8
+
9
+ from wordlift_sdk.kg import Entity
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class EntityStore:
    """Look up knowledge-graph entities by URL through the GraphQL API.

    Every lookup is attempted up to three times, two seconds apart. When all
    attempts fail the error is logged and the lookup produces no entities.
    """

    _gql_client: Client

    def __init__(self, gql_client: Client):
        """Keep the GraphQL client used to execute every query.

        :param gql_client: the ``gql`` client the queries are executed with.
        """
        self._gql_client = gql_client

    async def _fetch_entities(self, query, values: dict) -> List[Entity]:
        """Run ``query`` with retries and return the resulting entities.

        The whole result of an attempt is built before anything is handed to
        the caller. Yielding from inside the retry block would deliver
        duplicates when a later item fails and the attempt is repeated, and
        would let the retry machinery swallow the ``GeneratorExit`` raised
        when a consumer closes the generator early.

        :param query: the parsed GraphQL document to execute.
        :param values: the GraphQL variable values.
        :return: the entities of the first successful attempt, or an empty
            list when every attempt failed.
        """
        entities: List[Entity] = []
        try:
            async for attempt in AsyncRetrying(stop=stop_after_attempt(3), wait=wait_fixed(2)):
                with attempt:
                    logger.debug(
                        'Loading data from GraphQL with attempt %d', attempt.retry_state.attempt_number)
                    response = await self._gql_client.execute_async(query, variable_values=values)
                    # Built inside the attempt so that a malformed response is
                    # retried exactly like a transport error.
                    entities = [Entity(item) for item in response['entities']]
        except RetryError:
            logger.error('Error loading data from GraphQL', exc_info=True)
            return []

        return entities

    async def url_id(self, url_list: List[str] = None) -> AsyncGenerator[Entity, None]:
        """Yield the entities matching ``url_list``, selecting ``url`` and ``id``.

        :param url_list: the URLs to look up.
        :return: an async generator of entities; empty when the lookup failed.
        """
        # the query.
        query = gql(
            """
            query entities_url_id($urls: [String]!) {
              entities(query: { urlConstraint: { in: $urls } } ) {
                url: string(name:"schema:url")
                id: iri
              }
            }
            """
        )

        # the variables.
        values = {"urls": url_list}

        for entity in await self._fetch_entities(query, values):
            yield entity

    async def url_iri(self, url_list: List[str] = None) -> AsyncGenerator[Entity, None]:
        """Yield the entities matching ``url_list``, selecting ``url`` and ``iri``.

        :param url_list: the URLs to look up.
        :return: an async generator of entities; empty when the lookup failed.
        """
        # the query.
        query = gql(
            """
            query entities_url_iri($urls: [String]!) {
              entities(query: { urlConstraint: { in: $urls } } ) {
                url: string(name:"schema:url")
                iri
              }
            }
            """
        )

        # the variables.
        values = {"urls": url_list}

        for entity in await self._fetch_entities(query, values):
            yield entity

    async def url_id_as_dataframe(self, url_list: List[str] = None) -> pd.DataFrame:
        """
        Get the Entity URL ID maps as a Pandas dataframe.

        :param url_list: the URLs to look up.
        :return: a dataframe with the columns ``url`` and ``id``.
        """
        # NOTE(review): the query aliases the IRI as ``id`` while the value is
        # read from ``entity.iri`` - confirm that ``Entity`` exposes it there.
        return pd.DataFrame.from_records(
            data=[(entity.url, entity.iri) async for entity in self.url_id(url_list)],
            columns=("url", "id"))

    async def url_iri_as_dataframe(self, url_list: List[str] = None) -> pd.DataFrame:
        """
        Get the Entity URL IRI maps as a Pandas dataframe.

        :param url_list: the URLs to look up.
        :return: a dataframe with the columns ``url`` and ``iri``.
        """
        return pd.DataFrame.from_records(
            data=[(entity.url, entity.iri) async for entity in self.url_iri(url_list)],
            columns=("url", "iri"))
@@ -0,0 +1,13 @@
1
+ from gql import Client
2
+
3
+ from wordlift_sdk.kg import EntityStore
4
+
5
+
6
class EntityStoreFactory:
    """Factory producing :class:`EntityStore` instances bound to one GraphQL client."""

    # GraphQL client handed to every store this factory creates.
    _gql_client: Client

    def __init__(self, gql_client: Client):
        """Remember the client to inject into the created stores.

        :param gql_client: the ``gql`` client shared by the created stores.
        """
        self._gql_client = gql_client

    def create(self) -> EntityStore:
        """Build a new entity store.

        :return: a fresh :class:`EntityStore` using the factory's client.
        """
        store = EntityStore(self._gql_client)
        return store
File without changes
@@ -0,0 +1,78 @@
1
+ import wordlift_client
2
+ from wordlift_client import Configuration, VectorSearchQueriesApi, VectorSearchQueryRequest
3
+
4
+ from wordlift_sdk.kg import Entity
5
+
6
+
7
class RelationService:
    """Find pages related to an entity through the vector search API."""

    _configuration: Configuration

    def __init__(self, configuration: Configuration):
        """Keep the API client configuration.

        :param configuration: the ``wordlift_client`` configuration used to
            open the API client.
        """
        self._configuration = configuration

    @staticmethod
    def _safe_get_field(item, name):
        """Return the value of field ``name`` of a search result, or ``None``.

        NOTE(review): assumes result items expose a ``fields`` mapping whose
        values are either scalars or lists (the first element is used) -
        confirm against the ``wordlift_client`` response model.
        """
        fields = getattr(item, "fields", None) or {}
        value = fields.get(name)
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    async def get_relations(self, entity: Entity, main_category=None, sub_category=None,
                            sub_sub_category=None, sub_sub_sub_category=None, location=None):
        """Return the pages related to ``entity``, filtered and re-ranked.

        A vector search is run for the entity URL. The entity itself is
        dropped from the results. When at least one reference category is
        given, only items sharing one of the category levels are kept; when
        none is given, no category filter is applied. The similarity score is
        boosted by 0.2 for items in the same ``location`` and by 0.1 for items
        whose pages section is ``"Top"``.

        NOTE(review): the original body referenced the URL, the categories and
        the location as undefined names. They are taken from ``entity.url``
        and from the optional parameters here - confirm the intended source.

        :param entity: the entity to find related pages for; its ``url`` is
            used as the query URL.
        :param main_category: reference first-level category, if any.
        :param sub_category: reference second-level category, if any.
        :param sub_sub_category: reference third-level category, if any.
        :param sub_sub_sub_category: reference fourth-level category, if any.
        :param location: reference location used for the score boost, if any.
        :return: a list of ``{"url", "headline", "score"}`` dicts sorted by
            descending score; empty when the search fails.
        """
        # Local import: this module has no module-level logger.
        import logging
        logger = logging.getLogger(__name__)

        get_field = self._safe_get_field
        entity_url = entity.url
        references = (
            ("ex-private:category", main_category),
            ("ex-private:subCategory", sub_category),
            ("ex-private:subSubCategory", sub_sub_category),
            ("ex-private:subSubSubCategory", sub_sub_sub_category),
        )
        # Without any reference category there is nothing to filter on.
        filter_by_category = any(reference is not None for _, reference in references)

        async with wordlift_client.ApiClient(self._configuration) as api_client:
            # Search for related pages
            search_api = VectorSearchQueriesApi(api_client)

            related_request = VectorSearchQueryRequest(
                query_url=entity_url,
                similarity_top_k=100,
                fields=["schema:url", "schema:headline", "ex-private:category", "ex-private:subCategory",
                        "ex-private:subSubCategory", "ex-private:subSubSubCategory", "ex-private:location",
                        "ex-private:pagesSection"]
            )

            try:
                related_page = await search_api.create_query(vector_search_query_request=related_request)
            except Exception as e:
                # Best effort: a failed search yields no relations.
                logger.error("Error during vector search: %s", e)
                return []

        logger.debug("Number of related items found: %d", len(related_page.items))

        # Filter and re-rank results
        filtered_results = []
        for item in related_page.items:
            item_url = get_field(item, "schema:url")
            if item_url == entity_url:
                logger.debug("Skipping original entity: %s", item_url)
                continue

            if filter_by_category and not any(
                    reference is not None and get_field(item, field) == reference
                    for field, reference in references):
                logger.debug("Item categories don't match any of our category levels, skipped: %s", item_url)
                continue

            score = item.score
            if location is not None and get_field(item, "ex-private:location") == location:
                score += 0.2
            if get_field(item, "ex-private:pagesSection") == "Top":
                score += 0.1

            filtered_results.append({
                "url": item_url,
                "headline": get_field(item, "schema:headline"),
                "score": score
            })
            logger.debug("Item added to filtered results: %s (score: %s)", item_url, score)

        # Highest boosted score first.
        filtered_results.sort(key=lambda result: result["score"], reverse=True)
        return filtered_results
wordlift_sdk/main.py ADDED
@@ -0,0 +1,7 @@
1
+ from .container.application_container import ApplicationContainer
2
+
3
+
4
async def run_kg_import_workflow():
    """Create the knowledge-graph import workflow from a new container and run it."""
    container = ApplicationContainer()
    kg_import_workflow = await container.create_kg_import_workflow()
    await kg_import_workflow.run()