wordlift-sdk 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. wordlift_sdk/__init__.py +3 -0
  2. wordlift_sdk/client/__init__.py +3 -0
  3. wordlift_sdk/client/client_configuration_factory.py +26 -0
  4. wordlift_sdk/configuration/__init__.py +4 -0
  5. wordlift_sdk/configuration/configuration_provider.py +44 -0
  6. wordlift_sdk/configuration/get_config_value.py +39 -0
  7. wordlift_sdk/container/__init__.py +3 -0
  8. wordlift_sdk/container/application_container.py +234 -0
  9. wordlift_sdk/deprecated/__init__.py +5 -0
  10. wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  11. wordlift_sdk/entity/__init__.py +4 -0
  12. wordlift_sdk/entity/enrich.py +54 -0
  13. wordlift_sdk/entity/patch.py +14 -0
  14. wordlift_sdk/google_search_console/__init__.py +5 -0
  15. wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  16. wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  17. wordlift_sdk/graph/graph_bag.py +7 -0
  18. wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  19. wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  20. wordlift_sdk/graphql/__init__.py +3 -0
  21. wordlift_sdk/graphql/client/__init__.py +5 -0
  22. wordlift_sdk/graphql/client/client.py +69 -0
  23. wordlift_sdk/graphql/client/factory.py +36 -0
  24. wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  25. wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  26. wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  27. wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  28. wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  29. wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  30. wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  31. wordlift_sdk/graphql/query.py +20 -0
  32. wordlift_sdk/graphql/utils/__init__.py +0 -0
  33. wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  34. wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  35. wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  36. wordlift_sdk/id_generator/__init__.py +3 -0
  37. wordlift_sdk/id_generator/id_generator.py +40 -0
  38. wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  39. wordlift_sdk/internal_link/__init__.py +3 -0
  40. wordlift_sdk/internal_link/utils.py +231 -0
  41. wordlift_sdk/kg/__init__.py +5 -0
  42. wordlift_sdk/kg/entity.py +17 -0
  43. wordlift_sdk/kg/entity_store.py +94 -0
  44. wordlift_sdk/kg/entity_store_factory.py +13 -0
  45. wordlift_sdk/kg/relation/__init__.py +0 -0
  46. wordlift_sdk/kg/relation/relation_service.py +78 -0
  47. wordlift_sdk/main.py +7 -0
  48. wordlift_sdk/namespace/SDO.py +3281 -0
  49. wordlift_sdk/namespace/__init__.py +3 -0
  50. wordlift_sdk/notebook/__init__.py +3 -0
  51. wordlift_sdk/notebook/install_if_missing.py +12 -0
  52. wordlift_sdk/protocol/__init__.py +5 -0
  53. wordlift_sdk/protocol/context.py +21 -0
  54. wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  55. wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  56. wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  57. wordlift_sdk/protocol/graph/__init__.py +3 -0
  58. wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  59. wordlift_sdk/protocol/load_override_class.py +30 -0
  60. wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  61. wordlift_sdk/url_source/__init__.py +6 -0
  62. wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  63. wordlift_sdk/url_source/list_url_source.py +28 -0
  64. wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  65. wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  66. wordlift_sdk/url_source/url_source.py +18 -0
  67. wordlift_sdk/url_source/url_source_input.py +6 -0
  68. wordlift_sdk/utils/__init__.py +17 -0
  69. wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  70. wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  71. wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  72. wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  73. wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  74. wordlift_sdk/utils/delayed.py +12 -0
  75. wordlift_sdk/utils/get_me.py +8 -0
  76. wordlift_sdk/utils/import_url.py +35 -0
  77. wordlift_sdk/wordlift/__init__.py +0 -0
  78. wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  79. wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  80. wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  81. wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  82. wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  83. wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  84. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  85. wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  86. wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  87. wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  88. wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  89. wordlift_sdk/workflow/__init__.py +3 -0
  90. wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  91. wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  92. wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  93. wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  94. wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  95. wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  96. wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  97. wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
  98. wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
  99. wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
  100. wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,51 @@
1
from typing import Callable, Awaitable, Optional

from pandas import Series, DataFrame
from pycountry import countries
from tqdm.asyncio import tqdm
# Fixed: Configuration was erroneously imported from
# twisted.mail.scripts.mailmail (a bad IDE auto-import); the API client
# configuration lives in wordlift_client, as in the sibling modules.
from wordlift_client import AccountInfo, AnalysesResponse, Configuration

from .entity_gaps_callback import entity_gaps_callback_factory
from ...deprecated import create_entities_with_top_query_dataframe
from ...utils import create_delayed


async def create_entity_gaps_factory(
    key: str, configuration: Configuration, account: AccountInfo
):
    """Create an async callback that computes entity gaps for a URL list.

    Args:
        key: WordLift key used to query the entities' top-query data.
        configuration: WordLift API client configuration.
        account: Account whose country code localizes the entity-gap query.

    Returns:
        An async callable taking a list of URLs and returning a DataFrame of
        the entities with an ``entity_gaps`` column appended where available.
    """

    async def callback(url_list: list[str]) -> DataFrame:
        # Get the entity data with the top query.
        entities_with_top_query_df = await create_entities_with_top_query_dataframe(
            key=key, url_list=url_list
        )

        # Resolve the ISO alpha-2 country code to a full location name.
        # NOTE(review): countries.get returns None for unknown codes, which
        # would make `country.name` raise — confirm account.country_code is
        # always a valid ISO code.
        country = countries.get(alpha_2=account.country_code.upper())
        # Throttle to 2 concurrent entity-gap requests.
        delayed = create_delayed(
            await append_entity_gaps_response_to_row_factory(
                entity_gaps_callback=await entity_gaps_callback_factory(
                    configuration=configuration, query_location_name=country.name
                )
            ),
            2,
        )
        series = await tqdm.gather(
            *[delayed(row) for _, row in entities_with_top_query_df.iterrows()],
            total=len(entities_with_top_query_df),
        )

        return DataFrame(series)

    return callback


async def append_entity_gaps_response_to_row_factory(
    entity_gaps_callback: Callable[[Series], Awaitable[Optional[AnalysesResponse]]],
) -> Callable[[Series], Awaitable[Series]]:
    """Wrap ``entity_gaps_callback`` so its result is stored on the row.

    The returned coroutine invokes the callback with the row and, when a
    response is produced, writes its items into the row's ``entity_gaps``
    field before returning the (mutated) row.
    """

    async def append_entity_gaps_response_to_row(row: Series) -> Series:
        response = await entity_gaps_callback(row)
        if response:
            row["entity_gaps"] = response.items
        return row

    return append_entity_gaps_response_to_row
@@ -0,0 +1,32 @@
1
from typing import Callable, Awaitable, Optional

import wordlift_client
from pandas import Series
from tenacity import retry, stop_after_attempt, wait_fixed
# Fixed: Configuration was erroneously imported from
# twisted.mail.scripts.mailmail (a bad IDE auto-import); the API client
# configuration lives in wordlift_client, as in the sibling modules.
from wordlift_client import AnalysesResponse, Configuration, EntityGapsApi, EntityGapRequest


async def entity_gaps_callback_factory(configuration: Configuration, query_location_name: str) -> Callable[
    [Series], Awaitable[Optional[AnalysesResponse]]]:
    """Create a retrying callback that requests entity gaps for one row.

    Args:
        configuration: WordLift API client configuration.
        query_location_name: Location name used to localize the query.

    Returns:
        An async callable taking a row with ``url`` and ``top_query_name``
        fields and returning the entity-gap analysis, or ``None`` when the
        row has no top query.
    """

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_fixed(2)
    )
    async def entity_gaps_callback(row: Series) -> Optional[AnalysesResponse]:
        url = row['url']
        query = row['top_query_name']
        # Rows without a top query cannot be analyzed.
        # NOTE(review): a missing value read from a DataFrame may be NaN
        # rather than None — confirm upstream normalizes missing queries.
        if query is None:
            return None

        async with wordlift_client.ApiClient(configuration) as api_client:
            api = EntityGapsApi(api_client)
            return await api.create_entity_gap(
                EntityGapRequest(
                    url=url,
                    query=query,
                    query_location_name=query_location_name
                )
            )

    return entity_gaps_callback
File without changes
@@ -0,0 +1,14 @@
1
+ from .default import DefaultImportUrlProtocol, DefaultParseHtmlProtocol
2
+ from .import_url_protocol_interface import ImportUrlProtocolInterface, ImportUrlInput
3
+ from .parse_html_protocol_interface import ParseHtmlProtocolInterface, ParseHtmlInput
4
+ from .protocol_context import ProtocolContext
5
+
6
+ __all__ = [
7
+ 'ImportUrlProtocolInterface',
8
+ 'ImportUrlInput',
9
+ 'ParseHtmlProtocolInterface',
10
+ 'ParseHtmlInput',
11
+ 'ProtocolContext',
12
+ 'DefaultImportUrlProtocol',
13
+ 'DefaultParseHtmlProtocol',
14
+ ]
@@ -0,0 +1,4 @@
1
+ from .default_import_url_protocol import DefaultImportUrlProtocol
2
+ from .default_parse_html_protocol import DefaultParseHtmlProtocol
3
+
4
+ __all__ = ['DefaultParseHtmlProtocol', 'DefaultImportUrlProtocol']
@@ -0,0 +1,39 @@
1
+ import logging
2
+
3
+ import wordlift_client
4
+ from tenacity import retry, stop_after_attempt, wait_fixed
5
+ from wordlift_client import SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
6
+
7
+ from ..import_url_protocol_interface import ImportUrlProtocolInterface, ImportUrlInput
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class DefaultImportUrlProtocol(ImportUrlProtocolInterface):
    """Default implementation of the URL import protocol.

    Submits the input URL list to the WordLift Sitemap Imports API in a
    single request, asking the server to compute embeddings for the
    headline, abstract and text properties of each imported page.
    """

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_fixed(2)
    )
    async def import_url(self, import_url_input: ImportUrlInput) -> None:
        """Import the URLs in ``import_url_input`` via the Sitemap Imports API.

        Args:
            import_url_input: Wrapper holding the list of URLs to import.
        """
        # Configuration and output types come from the shared protocol context.
        configuration = self.context.configuration
        types = self.context.types
        url_list = import_url_input.url_list

        async with wordlift_client.ApiClient(configuration) as api_client:
            imports_api = SitemapImportsApi(api_client)
            request = SitemapImportRequest(
                embedding=EmbeddingRequest(
                    properties=["http://schema.org/headline", "http://schema.org/abstract",
                                "http://schema.org/text"]
                ),
                output_types=list(types),
                urls=list(url_list),
                overwrite=True,
                id_generator="headline-with-url-hash"
            )

            try:
                await imports_api.create_sitemap_import(sitemap_import_request=request)
            except Exception as e:
                # NOTE(review): the API error is logged and swallowed here, so
                # the @retry decorator above never fires for failures of this
                # call (it only retries errors raised outside this try, e.g.
                # while entering the ApiClient context). Confirm intentional.
                logger.error("Error importing URLs: %s", e)
@@ -0,0 +1,41 @@
1
+ """
2
+ This module provides a default implementation of the ParseHtmlProtocolInterface.
3
+
4
+ The DefaultParseHtmlProtocol class implements a basic HTML parsing protocol
5
+ that can be used as a fallback or starting point for more complex implementations.
6
+ """
7
+
8
+ from wordlift_client import EntityPatchRequest
9
+ from ..parse_html_protocol_interface import ParseHtmlProtocolInterface, ParseHtmlInput
10
+
11
+
12
class DefaultParseHtmlProtocol(ParseHtmlProtocolInterface):
    """No-op HTML parsing protocol.

    Fallback implementation of the HTML parsing protocol: it extracts
    nothing, so ``parse_html`` always yields an empty list of patch
    requests. Subclass it (or supply an override class) when a site needs
    real parsing logic.

    Attributes:
        context (ProtocolContext): The protocol context containing
            configuration and entity types information.
    """

    async def parse_html(self, parse_html_input: ParseHtmlInput) -> list[EntityPatchRequest]:
        """Return no entity patches for the given HTML.

        Args:
            parse_html_input (ParseHtmlInput): The HTML content to parse,
                along with entity ID, URL, and additional row data.

        Returns:
            list[EntityPatchRequest]: Always an empty list; override to
                produce actual knowledge-graph patches.
        """
        return []
@@ -0,0 +1,21 @@
1
+ from abc import abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Protocol
4
+
5
+ from .protocol_context import ProtocolContext
6
+
7
+
8
@dataclass
class ImportUrlInput:
    """Input for ``ImportUrlProtocolInterface.import_url``."""

    # URLs to submit for import in a single batch.
    url_list: list[str]
11
+
12
+
13
class ImportUrlProtocolInterface(Protocol):
    """Protocol describing how a batch of URLs is imported.

    Implementations receive a :class:`ProtocolContext` at construction time
    and perform the actual import in :meth:`import_url`.
    """

    # Shared configuration/state for the import run.
    context: ProtocolContext

    def __init__(self, context: ProtocolContext):
        self.context = context

    @abstractmethod
    async def import_url(self, import_url_input: ImportUrlInput) -> None:
        """Import the URLs described by ``import_url_input``."""
        ...
@@ -0,0 +1,27 @@
1
+ from abc import abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Protocol
4
+
5
+ from pandas import Series
6
+ from wordlift_client import EntityPatchRequest
7
+
8
+ from .protocol_context import ProtocolContext
9
+
10
+
11
@dataclass
class ParseHtmlInput:
    """Input for ``ParseHtmlProtocolInterface.parse_html``."""

    # Identifier (IRI) of the entity the HTML belongs to.
    entity_id: str
    # URL of the imported web page.
    entity_url: str
    # Raw HTML content of the page.
    html: str
    # Source data row associated with this page.
    row: Series
17
+
18
+
19
class ParseHtmlProtocolInterface(Protocol):
    """Protocol describing how imported HTML is turned into entity patches.

    Implementations receive a :class:`ProtocolContext` at construction time
    and extract knowledge-graph patches in :meth:`parse_html`.
    """

    # Shared configuration/state for the import run.
    context: ProtocolContext

    def __init__(self, context: ProtocolContext):
        self.context = context

    @abstractmethod
    async def parse_html(self, parse_html_input: ParseHtmlInput) -> list[EntityPatchRequest]:
        """Parse the HTML and return the entity patch requests to apply."""
        ...
@@ -0,0 +1,16 @@
1
+ from dataclasses import dataclass
2
+
3
+ from wordlift_client import AccountInfo
4
+ from wordlift_client import Configuration
5
+
6
+ from ....id_generator.id_generator_interface import IdGeneratorInterface
7
+ from ....protocol.graph import GraphQueue
8
+
9
+
10
@dataclass
class ProtocolContext:
    """Shared state handed to sitemap-import protocol implementations."""

    # WordLift account the import runs against.
    account: AccountInfo
    # API client configuration used to create wordlift_client sessions.
    configuration: Configuration
    # Strategy used to mint entity identifiers.
    id_generator: IdGeneratorInterface
    # Schema.org types assigned to imported entities.
    types: list[str]
    # Queue collecting graphs to be written to the knowledge graph.
    graph_queue: GraphQueue
@@ -0,0 +1,3 @@
1
+ from .kg_import_workflow import KgImportWorkflow
2
+
3
+ __all__ = ["KgImportWorkflow"]
@@ -0,0 +1,16 @@
1
+ from wordlift_client import Configuration
2
+ from rdflib import Graph
3
+ import wordlift_client
4
+
5
+
6
async def create_or_update_entities_factory(configuration: Configuration):
    """Build an async callback that pushes an RDF graph to the Entities API.

    The returned coroutine serializes the graph as Turtle and submits it
    with create-or-update semantics using the given client configuration.
    """

    async def callback(graph: Graph):
        # Serialize up-front so the payload is ready before opening the client.
        payload = graph.serialize(format="turtle")
        async with wordlift_client.ApiClient(configuration=configuration) as api_client:
            entities_api = wordlift_client.EntitiesApi(api_client)
            await entities_api.create_or_update_entities(
                payload,
                _content_type="text/turtle",
            )

    return callback
@@ -0,0 +1,49 @@
1
+ import logging
2
+ from os import cpu_count
3
+ from pathlib import Path
4
+
5
+ from tqdm.asyncio import tqdm
6
+
7
+ from .url_handler.url_handler import UrlHandler
8
+ from ..graph.ttl_liquid import TtlLiquidGraphFactory
9
+ from ..protocol import (
10
+ Context,
11
+ )
12
+ from ..url_source import UrlSource
13
+ from ..utils import create_delayed
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class KgImportWorkflow:
    """Workflow that imports URLs from a source into the knowledge graph.

    The workflow first materializes the Liquid TTL template graphs, then
    collects all URLs from the configured source and dispatches each one to
    the configured handler with bounded concurrency.
    """

    _concurrency: int
    _context: Context
    _url_handler: UrlHandler
    _url_source: UrlSource

    def __init__(
        self,
        context: Context,
        url_source: UrlSource,
        url_handler: UrlHandler,
        # `os.cpu_count()` may return None, which would make min() raise a
        # TypeError at class-definition time; fall back to 1 in that case.
        # Note the default is evaluated once, when the class is created.
        concurrency: int = min(cpu_count() or 1, 4),
    ) -> None:
        """Store the context, URL source/handler and concurrency limit."""
        self._context = context
        self._url_source = url_source
        self._url_handler = url_handler
        self._concurrency = concurrency

    async def run(self):
        """Run the import: template graphs first, then every source URL."""
        await TtlLiquidGraphFactory(
            context=self._context, path=Path("data/templates")
        ).graphs()

        # Drain the (async) URL source into a list so we know the total.
        url_list = [url async for url in self._url_source.urls()]

        logger.info("Applying %d URL import request(s)", len(url_list))

        # Wrap the handler so at most `_concurrency` calls run at once.
        delayed = create_delayed(self._url_handler, self._concurrency)
        await tqdm.gather(
            *[delayed(url) for url in url_list],
            total=len(url_list),
        )
@@ -0,0 +1,16 @@
1
+ import wordlift_client
2
+ from wordlift_client import Configuration
3
+
4
+ from ..protocol.entity_patch import EntityPatch
5
+
6
+
7
async def patch_entities_factory(configuration: Configuration):
    """Build an async callback that applies an EntityPatch via the Entities API.

    The returned coroutine patches the entity identified by the patch's IRI
    with the patch's queued requests.
    """

    async def callback(entity_patch: EntityPatch):
        async with wordlift_client.ApiClient(configuration=configuration) as api_client:
            entities_api = wordlift_client.EntitiesApi(api_client)
            await entities_api.patch_entities(
                id=entity_patch.iri, entity=entity_patch.requests
            )

    return callback
@@ -0,0 +1,3 @@
1
+ from .web_page_import_url_handler import WebPageImportUrlHandler
2
+
3
+ __all__ = ["WebPageImportUrlHandler"]
@@ -0,0 +1,23 @@
1
+ import logging
2
+
3
+ from ...url_source import Url
4
+ from ...workflow.url_handler.url_handler import UrlHandler
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class DefaultUrlHandler(UrlHandler):
    """Composite handler that fans a URL out to a list of handlers.

    The wrapped handlers are invoked in order; a failure in one handler is
    logged and does not prevent the remaining handlers from running.
    """

    _url_handler_list: list[UrlHandler]

    def __init__(self, url_handler_list: list[UrlHandler]):
        """Store the ordered list of handlers to delegate to."""
        super().__init__()
        self._url_handler_list = url_handler_list

    async def __call__(self, url: Url) -> None:
        """Run every wrapped handler on ``url``, logging (not raising) errors."""
        for url_handler in self._url_handler_list:
            try:
                # Handlers are callables; invoke directly.
                await url_handler(url)
            except Exception as e:
                logger.error(
                    f"Handler {type(url_handler).__name__} errored while handling url {url}: {e}"
                )
@@ -0,0 +1,79 @@
1
+ import asyncio
2
+ import logging
3
+ from datetime import datetime, timedelta
4
+
5
+ import gql.transport.exceptions
6
+ import aiohttp
7
+ import pydantic_core
8
+ import wordlift_client
9
+ from tenacity import (
10
+ retry,
11
+ retry_if_exception_type,
12
+ wait_fixed,
13
+ after_log,
14
+ stop_after_attempt,
15
+ )
16
+ from wordlift_client import AnalyticsImportRequest
17
+
18
+ from .url_handler import UrlHandler
19
+ from ...graphql.client import GraphQlClient
20
+ from ...graphql.utils.query import EntityTopQuery
21
+ from ...protocol import Context
22
+ from ...url_source import Url
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class SearchConsoleUrlHandler(UrlHandler):
    """URL handler that refreshes Google Search Console analytics for a URL.

    Skips accounts with no configured Search Console site URL. For each URL
    it fetches the matching entity's top-query data via GraphQL and, when
    that data is missing or older than seven days, requests a new analytics
    import from the WordLift Analytics Imports API.
    """

    # Shared workflow context (account, client configuration).
    _context: Context
    # Client used to run the entities_top_query GraphQL query.
    _graphql_client: GraphQlClient

    def __init__(self, context: Context, graphql_client: GraphQlClient) -> None:
        self._context = context
        self._graphql_client = graphql_client

    @retry(
        # Retry only on transient transport/validation/service errors.
        retry=retry_if_exception_type(
            asyncio.TimeoutError
            | aiohttp.client_exceptions.ServerDisconnectedError
            | aiohttp.client_exceptions.ClientConnectorError
            | aiohttp.client_exceptions.ClientPayloadError
            | aiohttp.client_exceptions.ClientConnectorDNSError
            | pydantic_core._pydantic_core.ValidationError
            | wordlift_client.exceptions.ServiceException
            | wordlift_client.exceptions.BadRequestException
            | aiohttp.client_exceptions.ClientOSError
            | gql.transport.exceptions.TransportServerError
        ),
        wait=wait_fixed(2),  # Wait 2 seconds between retries
        stop=stop_after_attempt(3),  # Max 3 retries
        after=after_log(logger, logging.WARNING),
        reraise=True,
    )
    async def __call__(self, url: Url) -> None:
        """Trigger an analytics import for ``url`` when its data is stale."""
        # Nothing to do when Search Console is not configured for the account.
        if not self._context.account.google_search_console_site_url:
            return

        entities = await self._graphql_client.run(
            graphql="entities_top_query", variables={"urls": [url.value]}
        )

        # No entity known for this URL.
        if not entities:
            return

        # Calculate the date 7 days ago from today
        # NOTE(review): datetime.now() is naive — this assumes
        # top_query_date_created is also a naive timestamp in the same
        # zone; confirm against the API's date format.
        seven_days_ago = datetime.now() - timedelta(days=7)
        entity_top_query = EntityTopQuery.from_graphql_response(entities[0])
        if (
            entity_top_query.top_query_date_created
            and datetime.fromisoformat(entity_top_query.top_query_date_created)
            > seven_days_ago
        ):
            # Data is fresh enough; skip the import.
            return

        async with wordlift_client.ApiClient(
            self._context.client_configuration
        ) as api_client:
            api_instance = wordlift_client.AnalyticsImportsApi(api_client)
            request = AnalyticsImportRequest(urls=[entity_top_query.url])
            await api_instance.create_analytics_import(request)
@@ -0,0 +1,8 @@
1
+ from abc import ABC
2
+
3
+ from ...url_source import Url
4
+
5
+
6
class UrlHandler(ABC):
    """Base class for async callables that process a single URL.

    Subclasses override ``__call__``; the base implementation is a no-op.
    NOTE(review): ``__call__`` is not marked ``@abstractmethod``, so the
    base class is instantiable and silently does nothing — confirm this
    is intentional.
    """

    async def __call__(self, url: Url):
        pass
@@ -0,0 +1,104 @@
1
+ import asyncio
2
+ import logging
3
+
4
+ import aiohttp
5
+ import pydantic_core
6
+ from tenacity import (
7
+ retry,
8
+ retry_if_exception_type,
9
+ wait_fixed,
10
+ after_log,
11
+ stop_after_attempt,
12
+ )
13
+ from wordlift_client import (
14
+ ApiClient,
15
+ WebPagesImportsApi,
16
+ WebPageImportRequest,
17
+ EmbeddingRequest,
18
+ )
19
+ import wordlift_client
20
+ import gql.transport.exceptions
21
+
22
+ from .url_handler import UrlHandler
23
+ from ...protocol import (
24
+ Context,
25
+ WebPageImportProtocolInterface,
26
+ load_override_class,
27
+ DefaultWebPageImportProtocol,
28
+ )
29
+ from ...url_source import Url
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class WebPageImportUrlHandler(UrlHandler):
    """URL handler that imports a single web page through the WordLift API.

    Builds a WebPageImportRequest for the URL (with embeddings computed for
    the configured properties), submits it to the Web Pages Imports API, and
    hands the response to a pluggable import protocol callback. Transient
    transport/service errors are retried up to 5 times by the decorator.
    """

    # Shared workflow context (client configuration, etc.).
    _context: Context
    # Embedding request reused for every imported page.
    _embedding_request: EmbeddingRequest
    # Protocol invoked with the API response after each import.
    _web_page_import_callback: WebPageImportProtocolInterface
    # Types assigned to imported web pages.
    _web_page_types: list[str]
    # Server-side write strategy for the import.
    _write_strategy: str

    def __init__(
        self,
        context: Context,
        embedding_properties: list[str],
        web_page_types: list[str],
        web_page_import_callback: WebPageImportProtocolInterface | None = None,
        write_strategy: str = "createOrUpdateModel",
    ):
        """Initialize the handler.

        Args:
            context: Shared workflow context.
            embedding_properties: Properties to compute embeddings for.
            web_page_types: Output types assigned to imported pages.
            web_page_import_callback: Optional protocol invoked with the API
                response; when None, an override class named
                "WebPageImportProtocol" is loaded, falling back to
                DefaultWebPageImportProtocol.
            write_strategy: Server-side write strategy for the import.
        """
        self._context = context
        self._embedding_request = EmbeddingRequest(
            properties=embedding_properties,
        )
        self._web_page_types = web_page_types
        if web_page_import_callback is None:
            self._web_page_import_callback = load_override_class(
                name="web_page_import_protocol",
                class_name="WebPageImportProtocol",
                # Default class to use in case of missing override.
                default_class=DefaultWebPageImportProtocol,
                context=context,
            )
        else:
            self._web_page_import_callback = web_page_import_callback

        self._write_strategy = write_strategy

    @retry(
        # Retry only on transient transport/validation/service errors.
        retry=retry_if_exception_type(
            asyncio.TimeoutError
            | aiohttp.client_exceptions.ServerDisconnectedError
            | aiohttp.client_exceptions.ClientConnectorError
            | aiohttp.client_exceptions.ClientPayloadError
            | aiohttp.client_exceptions.ClientConnectorDNSError
            | pydantic_core._pydantic_core.ValidationError
            | wordlift_client.exceptions.ServiceException
            | gql.transport.exceptions.TransportServerError
            | wordlift_client.exceptions.BadRequestException
            | aiohttp.client_exceptions.ClientOSError
        ),
        wait=wait_fixed(2),  # Wait 2 seconds between retries
        after=after_log(logger, logging.WARNING),
        stop=stop_after_attempt(5),
    )
    async def __call__(self, url: Url) -> None:
        """Import ``url`` as a web page and run the import callback.

        Errors are logged and re-raised so the @retry decorator above can
        retry the transient ones.
        """
        async with ApiClient(self._context.client_configuration) as client:
            api_instance = WebPagesImportsApi(client)

            request = WebPageImportRequest(
                url=url.value,
                # NOTE(review): this conditional is equivalent to `id=url.iri`.
                id=None if url.iri is None else url.iri,
                embedding=self._embedding_request,
                output_types=self._web_page_types,
                id_generator="headline-with-url-hash",
                write_strategy=self._write_strategy,
            )

            try:
                response = await api_instance.create_web_page_imports(
                    web_page_import_request=request, _request_timeout=120.0
                )
                await self._web_page_import_callback.callback(response)
            except Exception as e:
                logger.error("Error importing Web Page %s" % url.value, exc_info=e)
                raise e