wordlift-sdk 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk/__init__.py +3 -0
- wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk/graphql/client/client.py +69 -0
- wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk/main.py +7 -0
- wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
- wordlift_sdk-2.7.1.dist-info/METADATA +125 -0
- wordlift_sdk-2.7.1.dist-info/RECORD +100 -0
- wordlift_sdk-2.7.1.dist-info/WHEEL +4 -0

wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py
@@ -0,0 +1,51 @@
+from typing import Callable, Awaitable, Optional
+
+from pandas import Series, DataFrame
+from pycountry import countries
+from tqdm.asyncio import tqdm
+from wordlift_client import Configuration
+from wordlift_client import AccountInfo, AnalysesResponse
+
+from .entity_gaps_callback import entity_gaps_callback_factory
+from ...deprecated import create_entities_with_top_query_dataframe
+from ...utils import create_delayed
+
+
+async def create_entity_gaps_factory(
+    key: str, configuration: Configuration, account: AccountInfo
+):
+    async def callback(url_list: list[str]) -> DataFrame:
+        # Get the entity data with the top query.
+        entities_with_top_query_df = await create_entities_with_top_query_dataframe(
+            key=key, url_list=url_list
+        )
+
+        country = countries.get(alpha_2=account.country_code.upper())
+        delayed = create_delayed(
+            await append_entity_gaps_response_to_row_factory(
+                entity_gaps_callback=await entity_gaps_callback_factory(
+                    configuration=configuration, query_location_name=country.name
+                )
+            ),
+            2,
+        )
+        series = await tqdm.gather(
+            *[delayed(row) for index, row in entities_with_top_query_df.iterrows()],
+            total=len(entities_with_top_query_df),
+        )
+
+        return DataFrame(series)
+
+    return callback
+
+
+async def append_entity_gaps_response_to_row_factory(
+    entity_gaps_callback: Callable[[Series], Awaitable[Optional[AnalysesResponse]]],
+) -> Callable[[Series], Awaitable[Series]]:
+    async def append_entity_gaps_response_to_row(row: Series) -> Series:
+        response = await entity_gaps_callback(row)
+        if response:
+            row["entity_gaps"] = response.items
+        return row
+
+    return append_entity_gaps_response_to_row
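
The two factories above compose into a small enrichment pipeline: the returned callback fetches top-query data for a list of URLs, resolves the account's country via pycountry, and runs the entity-gap lookup over each row with a concurrency of 2. A minimal usage sketch follows, with placeholder key/host values, auth setup elided (see wordlift_sdk/client/client_configuration_factory.py in the file list above), and the assumption that get_me returns the AccountInfo for the configured client:

import asyncio

from wordlift_client import Configuration

from wordlift_sdk.utils import get_me
from wordlift_sdk.wordlift.entity_gaps import create_entity_gaps_factory


async def main() -> None:
    # Placeholders; real credentials are wired up by the SDK's
    # client_configuration_factory, which this sketch elides.
    configuration = Configuration(host="https://api.wordlift.io")
    key = "YOUR-WORDLIFT-KEY"

    account = await get_me(configuration)  # assumed to return AccountInfo
    callback = await create_entity_gaps_factory(
        key=key, configuration=configuration, account=account
    )
    # Rows gain an "entity_gaps" column when the lookup returns items.
    df = await callback(["https://example.org/a-page/"])
    print(df.head())


asyncio.run(main())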

wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py
@@ -0,0 +1,32 @@
+from typing import Callable, Awaitable, Optional
+
+import wordlift_client
+from pandas import Series
+from tenacity import retry, stop_after_attempt, wait_fixed
+from wordlift_client import Configuration
+from wordlift_client import AnalysesResponse, EntityGapsApi, EntityGapRequest
+
+
+async def entity_gaps_callback_factory(configuration: Configuration, query_location_name: str) -> Callable[
+    [Series], Awaitable[Optional[AnalysesResponse]]]:
+    @retry(
+        stop=stop_after_attempt(10),
+        wait=wait_fixed(2)
+    )
+    async def entity_gaps_callback(row: Series) -> Optional[AnalysesResponse]:
+        url = row['url']
+        query = row['top_query_name']
+        if query is None:
+            return None
+
+        async with wordlift_client.ApiClient(configuration) as api_client:
+            api = EntityGapsApi(api_client)
+            return await api.create_entity_gap(
+                EntityGapRequest(
+                    url=url,
+                    query=query,
+                    query_location_name=query_location_name
+                )
+            )
+
+    return entity_gaps_callback

wordlift_sdk/wordlift/sitemap_import/__init__.py
File without changes

wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py
@@ -0,0 +1,14 @@
+from .default import DefaultImportUrlProtocol, DefaultParseHtmlProtocol
+from .import_url_protocol_interface import ImportUrlProtocolInterface, ImportUrlInput
+from .parse_html_protocol_interface import ParseHtmlProtocolInterface, ParseHtmlInput
+from .protocol_context import ProtocolContext
+
+__all__ = [
+    'ImportUrlProtocolInterface',
+    'ImportUrlInput',
+    'ParseHtmlProtocolInterface',
+    'ParseHtmlInput',
+    'ProtocolContext',
+    'DefaultImportUrlProtocol',
+    'DefaultParseHtmlProtocol',
+]

wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py
@@ -0,0 +1,39 @@
+import logging
+
+import wordlift_client
+from tenacity import retry, stop_after_attempt, wait_fixed
+from wordlift_client import SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
+
+from ..import_url_protocol_interface import ImportUrlProtocolInterface, ImportUrlInput
+
+logger = logging.getLogger(__name__)
+
+
+class DefaultImportUrlProtocol(ImportUrlProtocolInterface):
+
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_fixed(2)
+    )
+    async def import_url(self, import_url_input: ImportUrlInput) -> None:
+        configuration = self.context.configuration
+        types = self.context.types
+        url_list = import_url_input.url_list
+
+        async with wordlift_client.ApiClient(configuration) as api_client:
+            imports_api = SitemapImportsApi(api_client)
+            request = SitemapImportRequest(
+                embedding=EmbeddingRequest(
+                    properties=["http://schema.org/headline", "http://schema.org/abstract",
+                                "http://schema.org/text"]
+                ),
+                output_types=list(types),
+                urls=list(url_list),
+                overwrite=True,
+                id_generator="headline-with-url-hash"
+            )
+
+            try:
+                await imports_api.create_sitemap_import(sitemap_import_request=request)
+            except Exception as e:
+                logger.error("Error importing URLs: %s", e)

wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py
@@ -0,0 +1,41 @@
+"""
+This module provides a default implementation of the ParseHtmlProtocolInterface.
+
+The DefaultParseHtmlProtocol class implements a basic HTML parsing protocol
+that can be used as a fallback or starting point for more complex implementations.
+"""
+
+from wordlift_client import EntityPatchRequest
+from ..parse_html_protocol_interface import ParseHtmlProtocolInterface, ParseHtmlInput
+
+
+class DefaultParseHtmlProtocol(ParseHtmlProtocolInterface):
+    """
+    Default implementation of the ParseHtmlProtocolInterface.
+
+    This class provides a minimal implementation of the HTML parsing protocol
+    that returns an empty list of entity patch requests. It can be used as a
+    base class for more complex implementations or as a fallback when no
+    specific parsing logic is required.
+
+    Attributes:
+        context (ProtocolContext): The protocol context containing configuration
+            and entity types information.
+    """
+
+    async def parse_html(self, parse_html_input: ParseHtmlInput) -> list[EntityPatchRequest]:
+        """
+        Parse HTML content and extract entity information.
+
+        This default implementation returns an empty list. Override this method
+        to implement custom HTML parsing logic.
+
+        Args:
+            parse_html_input (ParseHtmlInput): An object containing the HTML content to parse,
+                along with entity ID, URL, and additional data.
+
+        Returns:
+            list[EntityPatchRequest]: A list of entity patch requests to update
+                the knowledge graph. This implementation returns an empty list.
+        """
+        return list()
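
As the docstrings above indicate, custom parsing plugs in by subclassing and overriding parse_html. Below is a hedged sketch of such an override (a hypothetical TitleParseHtmlProtocol, not part of the package) that pulls the <title> out of the fetched HTML; since this diff does not show EntityPatchRequest's schema, the mapping from title to patch is left as a labeled placeholder:

import re

from wordlift_client import EntityPatchRequest

from wordlift_sdk.wordlift.sitemap_import.protocol import (
    ParseHtmlInput,
    ParseHtmlProtocolInterface,
)


class TitleParseHtmlProtocol(ParseHtmlProtocolInterface):
    """Hypothetical override: extracts the page <title>."""

    async def parse_html(self, parse_html_input: ParseHtmlInput) -> list[EntityPatchRequest]:
        match = re.search(
            r"<title[^>]*>(.*?)</title>",
            parse_html_input.html,
            re.IGNORECASE | re.DOTALL,
        )
        if match is None:
            return []
        title = match.group(1).strip()
        # Placeholder: building the actual patch depends on EntityPatchRequest's
        # schema, which this diff does not show. Map `title` onto the entity
        # identified by parse_html_input.entity_id here.
        return []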

wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py
@@ -0,0 +1,21 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Protocol
+
+from .protocol_context import ProtocolContext
+
+
+@dataclass
+class ImportUrlInput:
+    url_list: list[str]
+
+
+class ImportUrlProtocolInterface(Protocol):
+    context: ProtocolContext
+
+    def __init__(self, context: ProtocolContext):
+        self.context = context
+
+    @abstractmethod
+    async def import_url(self, import_url_input: ImportUrlInput) -> None:
+        ...

wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py
@@ -0,0 +1,27 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Protocol
+
+from pandas import Series
+from wordlift_client import EntityPatchRequest
+
+from .protocol_context import ProtocolContext
+
+
+@dataclass
+class ParseHtmlInput:
+    entity_id: str
+    entity_url: str
+    html: str
+    row: Series
+
+
+class ParseHtmlProtocolInterface(Protocol):
+    context: ProtocolContext
+
+    def __init__(self, context: ProtocolContext):
+        self.context = context
+
+    @abstractmethod
+    async def parse_html(self, parse_html_input: ParseHtmlInput) -> list[EntityPatchRequest]:
+        ...

wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+
+from wordlift_client import AccountInfo
+from wordlift_client import Configuration
+
+from ....id_generator.id_generator_interface import IdGeneratorInterface
+from ....protocol.graph import GraphQueue
+
+
+@dataclass
+class ProtocolContext:
+    account: AccountInfo
+    configuration: Configuration
+    id_generator: IdGeneratorInterface
+    types: list[str]
+    graph_queue: GraphQueue

wordlift_sdk/workflow/create_or_update_entities_factory.py
@@ -0,0 +1,16 @@
+from wordlift_client import Configuration
+from rdflib import Graph
+import wordlift_client
+
+
+async def create_or_update_entities_factory(configuration: Configuration):
+    async def callback(graph: Graph):
+        # Run all the queued graphs.
+        async with wordlift_client.ApiClient(configuration=configuration) as api_client:
+            api_instance = wordlift_client.EntitiesApi(api_client)
+            await api_instance.create_or_update_entities(
+                graph.serialize(format="turtle"),
+                _content_type="text/turtle",
+            )
+
+    return callback
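
A usage sketch for the factory above, assuming it is re-exported from wordlift_sdk.workflow (otherwise import it from its module path): build an rdflib Graph, then hand it to the returned callback, which serializes the graph to Turtle and submits it through EntitiesApi.create_or_update_entities. Host and auth values are placeholders:

import asyncio

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF, SDO  # rdflib 6+ ships a schema.org namespace

from wordlift_client import Configuration

from wordlift_sdk.workflow import create_or_update_entities_factory  # assumed export


async def main() -> None:
    configuration = Configuration(host="https://api.wordlift.io")  # auth elided

    graph = Graph()
    entity = URIRef("https://data.example.org/entity/1")
    graph.add((entity, RDF.type, SDO.Article))
    graph.add((entity, SDO.headline, Literal("Hello, Knowledge Graph")))

    callback = await create_or_update_entities_factory(configuration)
    await callback(graph)  # serialized to Turtle and POSTed


asyncio.run(main())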

wordlift_sdk/workflow/kg_import_workflow.py
@@ -0,0 +1,49 @@
+import logging
+from os import cpu_count
+from pathlib import Path
+
+from tqdm.asyncio import tqdm
+
+from .url_handler.url_handler import UrlHandler
+from ..graph.ttl_liquid import TtlLiquidGraphFactory
+from ..protocol import (
+    Context,
+)
+from ..url_source import UrlSource
+from ..utils import create_delayed
+
+logger = logging.getLogger(__name__)
+
+
+class KgImportWorkflow:
+    _concurrency: int
+    _context: Context
+    _url_handler: UrlHandler
+    _url_source: UrlSource
+
+    def __init__(
+        self,
+        context: Context,
+        url_source: UrlSource,
+        url_handler: UrlHandler,
+        concurrency: int = min(cpu_count(), 4),
+    ) -> None:
+        self._context = context
+        self._url_source = url_source
+        self._url_handler = url_handler
+        self._concurrency = concurrency
+
+    async def run(self):
+        await TtlLiquidGraphFactory(
+            context=self._context, path=Path("data/templates")
+        ).graphs()
+
+        url_list = [url async for url in self._url_source.urls()]
+
+        logger.info("Applying %d URL import request(s)" % len(url_list))
+
+        delayed = create_delayed(self._url_handler, self._concurrency)
+        await tqdm.gather(
+            *[delayed(url) for url in list(url_list)],
+            total=len(url_list),
+        )
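
create_delayed itself (wordlift_sdk/utils/delayed.py, 12 lines in this release) is not part of this excerpt. Its call sites here and in create_entity_gaps_factory suggest it wraps an async callable so that at most `concurrency` invocations run at once. A plausible reconstruction of that contract, offered only as a sketch:

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")
R = TypeVar("R")


def create_delayed(
    callback: Callable[[T], Awaitable[R]], concurrency: int
) -> Callable[[T], Awaitable[R]]:
    # A shared semaphore bounds how many wrapped calls run concurrently.
    semaphore = asyncio.Semaphore(concurrency)

    async def delayed(arg: T) -> R:
        async with semaphore:
            return await callback(arg)

    return delayed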

wordlift_sdk/workflow/patch_entities_factory.py
@@ -0,0 +1,16 @@
+import wordlift_client
+from wordlift_client import Configuration
+
+from ..protocol.entity_patch import EntityPatch
+
+
+async def patch_entities_factory(configuration: Configuration):
+    async def callback(entity_patch: EntityPatch):
+        # Apply the queued entity patches.
+        async with wordlift_client.ApiClient(configuration=configuration) as api_client:
+            api_instance = wordlift_client.EntitiesApi(api_client)
+            await api_instance.patch_entities(
+                id=entity_patch.iri, entity=entity_patch.requests
+            )
+
+    return callback

wordlift_sdk/workflow/url_handler/default_url_handler.py
@@ -0,0 +1,23 @@
+import logging
+
+from ...url_source import Url
+from ...workflow.url_handler.url_handler import UrlHandler
+
+logger = logging.getLogger(__name__)
+
+
+class DefaultUrlHandler(UrlHandler):
+    _url_handler_list: list[UrlHandler]
+
+    def __init__(self, url_handler_list: list[UrlHandler]):
+        super().__init__()
+        self._url_handler_list = url_handler_list
+
+    async def __call__(self, url: Url) -> None:
+        for url_handler in self._url_handler_list:
+            try:
+                await url_handler.__call__(url)
+            except Exception as e:
+                logger.error(
+                    f"Handler {type(url_handler).__name__} errored while handling url {url}: {e}"
+                )
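
DefaultUrlHandler is a composite: it fans each URL out to every handler in order, and logs rather than propagates individual failures, so one broken handler does not stop the others. A self-contained sketch with stub handlers (the import paths follow the module layout above; Url("...") is an assumption, since this diff only shows that Url exposes .value and .iri):

import asyncio
import logging

from wordlift_sdk.url_source import Url  # assumption: Url wraps a URL string
from wordlift_sdk.workflow.url_handler.default_url_handler import DefaultUrlHandler
from wordlift_sdk.workflow.url_handler.url_handler import UrlHandler


class PrintUrlHandler(UrlHandler):
    async def __call__(self, url: Url) -> None:
        print(f"handled {url}")


class FailingUrlHandler(UrlHandler):
    async def __call__(self, url: Url) -> None:
        raise RuntimeError("boom")


async def main() -> None:
    logging.basicConfig(level=logging.INFO)
    handler = DefaultUrlHandler(
        url_handler_list=[FailingUrlHandler(), PrintUrlHandler()]
    )
    # The failure is logged by DefaultUrlHandler; PrintUrlHandler still runs.
    await handler(Url("https://example.org/"))


asyncio.run(main())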

wordlift_sdk/workflow/url_handler/search_console_url_handler.py
@@ -0,0 +1,79 @@
+import asyncio
+import logging
+from datetime import datetime, timedelta
+
+import gql.transport.exceptions
+import aiohttp
+import pydantic_core
+import wordlift_client
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    wait_fixed,
+    after_log,
+    stop_after_attempt,
+)
+from wordlift_client import AnalyticsImportRequest
+
+from .url_handler import UrlHandler
+from ...graphql.client import GraphQlClient
+from ...graphql.utils.query import EntityTopQuery
+from ...protocol import Context
+from ...url_source import Url
+
+logger = logging.getLogger(__name__)
+
+
+class SearchConsoleUrlHandler(UrlHandler):
+    _context: Context
+    _graphql_client: GraphQlClient
+
+    def __init__(self, context: Context, graphql_client: GraphQlClient) -> None:
+        self._context = context
+        self._graphql_client = graphql_client
+
+    @retry(
+        retry=retry_if_exception_type(
+            asyncio.TimeoutError
+            | aiohttp.client_exceptions.ServerDisconnectedError
+            | aiohttp.client_exceptions.ClientConnectorError
+            | aiohttp.client_exceptions.ClientPayloadError
+            | aiohttp.client_exceptions.ClientConnectorDNSError
+            | pydantic_core._pydantic_core.ValidationError
+            | wordlift_client.exceptions.ServiceException
+            | wordlift_client.exceptions.BadRequestException
+            | aiohttp.client_exceptions.ClientOSError
+            | gql.transport.exceptions.TransportServerError
+        ),
+        wait=wait_fixed(2),  # Wait 2 seconds between retries
+        stop=stop_after_attempt(3),  # Max 3 retries
+        after=after_log(logger, logging.WARNING),
+        reraise=True,
+    )
+    async def __call__(self, url: Url) -> None:
+        if not self._context.account.google_search_console_site_url:
+            return
+
+        entities = await self._graphql_client.run(
+            graphql="entities_top_query", variables={"urls": [url.value]}
+        )
+
+        if not entities:
+            return
+
+        # Calculate the date 7 days ago from today
+        seven_days_ago = datetime.now() - timedelta(days=7)
+        entity_top_query = EntityTopQuery.from_graphql_response(entities[0])
+        if (
+            entity_top_query.top_query_date_created
+            and datetime.fromisoformat(entity_top_query.top_query_date_created)
+            > seven_days_ago
+        ):
+            return
+
+        async with wordlift_client.ApiClient(
+            self._context.client_configuration
+        ) as api_client:
+            api_instance = wordlift_client.AnalyticsImportsApi(api_client)
+            request = AnalyticsImportRequest(urls=[entity_top_query.url])
+            await api_instance.create_analytics_import(request)
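
One detail worth calling out in the retry configuration above: retry_if_exception_type receives a PEP 604 union (A | B | ...) rather than the more common tuple. This works because tenacity checks the raised exception with isinstance, and isinstance accepts union objects on Python 3.10+. A minimal self-contained demonstration of the same pattern:

import asyncio

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed

attempts = 0


@retry(
    retry=retry_if_exception_type(ConnectionError | TimeoutError),  # PEP 604 union
    wait=wait_fixed(0.1),
    stop=stop_after_attempt(3),
    reraise=True,
)
async def flaky() -> str:
    global attempts
    attempts += 1
    if attempts < 3:
        raise ConnectionError("transient")
    return "ok"


print(asyncio.run(flaky()))  # fails twice, succeeds on the third attempt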

wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py
@@ -0,0 +1,104 @@
+import asyncio
+import logging
+
+import aiohttp
+import pydantic_core
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    wait_fixed,
+    after_log,
+    stop_after_attempt,
+)
+from wordlift_client import (
+    ApiClient,
+    WebPagesImportsApi,
+    WebPageImportRequest,
+    EmbeddingRequest,
+)
+import wordlift_client
+import gql.transport.exceptions
+
+from .url_handler import UrlHandler
+from ...protocol import (
+    Context,
+    WebPageImportProtocolInterface,
+    load_override_class,
+    DefaultWebPageImportProtocol,
+)
+from ...url_source import Url
+
+logger = logging.getLogger(__name__)
+
+
+class WebPageImportUrlHandler(UrlHandler):
+    _context: Context
+    _embedding_request: EmbeddingRequest
+    _web_page_import_callback: WebPageImportProtocolInterface
+    _web_page_types: list[str]
+    _write_strategy: str
+
+    def __init__(
+        self,
+        context: Context,
+        embedding_properties: list[str],
+        web_page_types: list[str],
+        web_page_import_callback: WebPageImportProtocolInterface | None = None,
+        write_strategy: str = "createOrUpdateModel",
+    ):
+        self._context = context
+        self._embedding_request = EmbeddingRequest(
+            properties=embedding_properties,
+        )
+        self._web_page_types = web_page_types
+        if web_page_import_callback is None:
+            self._web_page_import_callback = load_override_class(
+                name="web_page_import_protocol",
+                class_name="WebPageImportProtocol",
+                # Default class to use in case of missing override.
+                default_class=DefaultWebPageImportProtocol,
+                context=context,
+            )
+        else:
+            self._web_page_import_callback = web_page_import_callback
+
+        self._write_strategy = write_strategy
+
+    @retry(
+        retry=retry_if_exception_type(
+            asyncio.TimeoutError
+            | aiohttp.client_exceptions.ServerDisconnectedError
+            | aiohttp.client_exceptions.ClientConnectorError
+            | aiohttp.client_exceptions.ClientPayloadError
+            | aiohttp.client_exceptions.ClientConnectorDNSError
+            | pydantic_core._pydantic_core.ValidationError
+            | wordlift_client.exceptions.ServiceException
+            | gql.transport.exceptions.TransportServerError
+            | wordlift_client.exceptions.BadRequestException
+            | aiohttp.client_exceptions.ClientOSError
+        ),
+        wait=wait_fixed(2),  # Wait 2 seconds between retries
+        after=after_log(logger, logging.WARNING),
+        stop=stop_after_attempt(5),
+    )
+    async def __call__(self, url: Url) -> None:
+        async with ApiClient(self._context.client_configuration) as client:
+            api_instance = WebPagesImportsApi(client)
+
+            request = WebPageImportRequest(
+                url=url.value,
+                id=None if url.iri is None else url.iri,
+                embedding=self._embedding_request,
+                output_types=self._web_page_types,
+                id_generator="headline-with-url-hash",
+                write_strategy=self._write_strategy,
+            )
+
+            try:
+                response = await api_instance.create_web_page_imports(
+                    web_page_import_request=request, _request_timeout=120.0
+                )
+                await self._web_page_import_callback.callback(response)
+            except Exception as e:
+                logger.error("Error importing Web Page %s" % url.value, exc_info=e)
+                raise e