wordlift-sdk 2.7.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wordlift_sdk-2.7.5/PKG-INFO +125 -0
- wordlift_sdk-2.7.5/README.md +98 -0
- wordlift_sdk-2.7.5/pyproject.toml +40 -0
- wordlift_sdk-2.7.5/wordlift_sdk/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/client/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/client/client_configuration_factory.py +26 -0
- wordlift_sdk-2.7.5/wordlift_sdk/configuration/__init__.py +4 -0
- wordlift_sdk-2.7.5/wordlift_sdk/configuration/configuration_provider.py +44 -0
- wordlift_sdk-2.7.5/wordlift_sdk/configuration/get_config_value.py +39 -0
- wordlift_sdk-2.7.5/wordlift_sdk/container/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/container/application_container.py +234 -0
- wordlift_sdk-2.7.5/wordlift_sdk/deprecated/__init__.py +5 -0
- wordlift_sdk-2.7.5/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
- wordlift_sdk-2.7.5/wordlift_sdk/entity/__init__.py +4 -0
- wordlift_sdk-2.7.5/wordlift_sdk/entity/enrich.py +54 -0
- wordlift_sdk-2.7.5/wordlift_sdk/entity/patch.py +14 -0
- wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/__init__.py +5 -0
- wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
- wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graph/graph_bag.py +7 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/__init__.py +5 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/client.py +70 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/factory.py +36 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/query.py +20 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/__init__.py +0 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/__init__.py +4 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
- wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
- wordlift_sdk-2.7.5/wordlift_sdk/id_generator/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/id_generator/id_generator.py +40 -0
- wordlift_sdk-2.7.5/wordlift_sdk/id_generator/id_generator_interface.py +8 -0
- wordlift_sdk-2.7.5/wordlift_sdk/internal_link/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/internal_link/utils.py +231 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/__init__.py +5 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/entity.py +17 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/entity_store.py +94 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/entity_store_factory.py +13 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/relation/__init__.py +0 -0
- wordlift_sdk-2.7.5/wordlift_sdk/kg/relation/relation_service.py +78 -0
- wordlift_sdk-2.7.5/wordlift_sdk/main.py +7 -0
- wordlift_sdk-2.7.5/wordlift_sdk/namespace/SDO.py +3281 -0
- wordlift_sdk-2.7.5/wordlift_sdk/namespace/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/notebook/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/notebook/install_if_missing.py +12 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/__init__.py +5 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/context.py +21 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/graph/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/graph/graph_queue.py +64 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/load_override_class.py +30 -0
- wordlift_sdk-2.7.5/wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/__init__.py +6 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/list_url_source.py +28 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/sitemap_url_source.py +36 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/url_source.py +18 -0
- wordlift_sdk-2.7.5/wordlift_sdk/url_source/url_source_input.py +6 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/__init__.py +17 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/create_entity_patch_request.py +14 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/delayed.py +12 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/get_me.py +8 -0
- wordlift_sdk-2.7.5/wordlift_sdk/utils/import_url.py +35 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/__init__.py +0 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
- wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/kg_import_workflow.py +49 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/patch_entities_factory.py +16 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/__init__.py +3 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
- wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wordlift-sdk
|
|
3
|
+
Version: 2.7.5
|
|
4
|
+
Summary:
|
|
5
|
+
Author: David Riccitelli
|
|
6
|
+
Author-email: david@wordlift.io
|
|
7
|
+
Requires-Python: >=3.10,<3.14
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: advertools (>0.16.6,<1.0.0)
|
|
14
|
+
Requires-Dist: aiohttp (>=3.10.5,<4.0.0)
|
|
15
|
+
Requires-Dist: google-auth (>=2.35.0,<3.0.0)
|
|
16
|
+
Requires-Dist: gql[aiohttp] (>=3.5.2,<4.0.0)
|
|
17
|
+
Requires-Dist: gspread (>=6.1.2,<7.0.0)
|
|
18
|
+
Requires-Dist: pandas (>=2.1.4,<2.3.0)
|
|
19
|
+
Requires-Dist: pycountry (>=24.6.1,<25.0.0)
|
|
20
|
+
Requires-Dist: python-liquid (>=2.0.1,<3.0.0)
|
|
21
|
+
Requires-Dist: rdflib (>=7.0.0,<8.0.0)
|
|
22
|
+
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
23
|
+
Requires-Dist: tqdm (>=4.67.1,<5.0.0)
|
|
24
|
+
Requires-Dist: wordlift-client (>=1.117.0,<2.0.0)
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# WordLift Python SDK
|
|
28
|
+
|
|
29
|
+
A Python toolkit for orchestrating WordLift imports: fetch URLs from sitemaps, Google Sheets, or explicit lists, filter out already imported pages, enqueue search console jobs, push RDF graphs, and call the WordLift APIs to import web pages.
|
|
30
|
+
|
|
31
|
+
## Features
|
|
32
|
+
- URL sources: XML sitemaps (with optional regex filtering), Google Sheets (`url` column), or Python lists.
|
|
33
|
+
- Change detection: skips URLs that are already imported unless `OVERWRITE` is enabled; re-imports when `lastmod` is newer.
|
|
34
|
+
- Web page imports: sends URLs to WordLift with embedding requests, output types, retry logic, and pluggable callbacks.
|
|
35
|
+
- Search Console refresh: triggers analytics imports when top queries are stale.
|
|
36
|
+
- Graph templates: renders `.ttl.liquid` templates under `data/templates` with account data and uploads the resulting RDF graphs.
|
|
37
|
+
- Extensible: override protocols via `WORDLIFT_OVERRIDE_DIR` without changing the library code.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install wordlift-sdk
|
|
43
|
+
# or
|
|
44
|
+
poetry add wordlift-sdk
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python 3.10–3.13.
|
|
48
|
+
|
|
49
|
+
## Configuration
|
|
50
|
+
|
|
51
|
+
Settings are read in order: `config/default.py` (or a custom path you pass to `ConfigurationProvider.create`), environment variables, then (when available) Google Colab `userdata`.
|
|
52
|
+
|
|
53
|
+
Common options:
|
|
54
|
+
- `WORDLIFT_KEY` (required): WordLift API key.
|
|
55
|
+
- `API_URL`: WordLift API base URL, defaults to `https://api.wordlift.io`.
|
|
56
|
+
- `SITEMAP_URL`: XML sitemap to crawl; `SITEMAP_URL_PATTERN` optional regex to filter URLs.
|
|
57
|
+
- `SHEETS_URL`, `SHEETS_NAME`, `SHEETS_SERVICE_ACCOUNT`: use a Google Sheet as source; service account points to credentials file.
|
|
58
|
+
- `URLS`: list of URLs (e.g., `["https://example.com/a", "https://example.com/b"]`).
|
|
59
|
+
- `OVERWRITE`: re-import URLs even if already present (default `False`).
|
|
60
|
+
- `WEB_PAGE_IMPORT_WRITE_STRATEGY`: WordLift write strategy (default `createOrUpdateModel`).
|
|
61
|
+
- `EMBEDDING_PROPERTIES`: list of schema properties to embed.
|
|
62
|
+
- `WEB_PAGE_TYPES`: output schema types, defaults to `["http://schema.org/Article"]`.
|
|
63
|
+
- `GOOGLE_SEARCH_CONSOLE`: enable/disable Search Console handler (default `True`).
|
|
64
|
+
- `CONCURRENCY`: max concurrent handlers, defaults to `min(cpu_count(), 4)`.
|
|
65
|
+
- `WORDLIFT_OVERRIDE_DIR`: folder containing protocol overrides (default `app/overrides`).
|
|
66
|
+
|
|
67
|
+
Example `config/default.py`:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
WORDLIFT_KEY = "your-api-key"
|
|
71
|
+
SITEMAP_URL = "https://example.com/sitemap.xml"
|
|
72
|
+
SITEMAP_URL_PATTERN = r"^https://example.com/article/.*$"
|
|
73
|
+
GOOGLE_SEARCH_CONSOLE = True
|
|
74
|
+
WEB_PAGE_TYPES = ["http://schema.org/Article"]
|
|
75
|
+
EMBEDDING_PROPERTIES = [
|
|
76
|
+
"http://schema.org/headline",
|
|
77
|
+
"http://schema.org/abstract",
|
|
78
|
+
"http://schema.org/text",
|
|
79
|
+
]
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Running the import workflow
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import asyncio
|
|
86
|
+
from wordlift_sdk import run_kg_import_workflow
|
|
87
|
+
|
|
88
|
+
if __name__ == "__main__":
|
|
89
|
+
asyncio.run(run_kg_import_workflow())
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The workflow:
|
|
93
|
+
1. Renders and uploads RDF graphs from `data/templates/*.ttl.liquid` using account info.
|
|
94
|
+
2. Builds the configured URL source and filters out unchanged URLs (unless `OVERWRITE`).
|
|
95
|
+
3. Sends each URL to WordLift for import with retries and optional Search Console refresh.
|
|
96
|
+
|
|
97
|
+
You can build components yourself when you need more control:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import asyncio
|
|
101
|
+
from wordlift_sdk.container.application_container import ApplicationContainer
|
|
102
|
+
|
|
103
|
+
async def main():
|
|
104
|
+
container = ApplicationContainer()
|
|
105
|
+
workflow = await container.create_kg_import_workflow()
|
|
106
|
+
await workflow.run()
|
|
107
|
+
|
|
108
|
+
asyncio.run(main())
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Custom callbacks and overrides
|
|
112
|
+
|
|
113
|
+
Override the web page import callback by placing `web_page_import_protocol.py` with a `WebPageImportProtocol` class under `WORDLIFT_OVERRIDE_DIR` (default `app/overrides`). The callback receives a `WebPageImportResponse` and can push to `graph_queue` or `entity_patch_queue`.
|
|
114
|
+
|
|
115
|
+
## Templates
|
|
116
|
+
|
|
117
|
+
Add `.ttl.liquid` files under `data/templates`. Templates render with `account` fields available (e.g., `{{ account.dataset_uri }}`) and are uploaded before URL handling begins.
|
|
118
|
+
|
|
119
|
+
## Testing
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
poetry install --with dev
|
|
123
|
+
poetry run pytest
|
|
124
|
+
```
|
|
125
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# WordLift Python SDK
|
|
2
|
+
|
|
3
|
+
A Python toolkit for orchestrating WordLift imports: fetch URLs from sitemaps, Google Sheets, or explicit lists, filter out already imported pages, enqueue search console jobs, push RDF graphs, and call the WordLift APIs to import web pages.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
- URL sources: XML sitemaps (with optional regex filtering), Google Sheets (`url` column), or Python lists.
|
|
7
|
+
- Change detection: skips URLs that are already imported unless `OVERWRITE` is enabled; re-imports when `lastmod` is newer.
|
|
8
|
+
- Web page imports: sends URLs to WordLift with embedding requests, output types, retry logic, and pluggable callbacks.
|
|
9
|
+
- Search Console refresh: triggers analytics imports when top queries are stale.
|
|
10
|
+
- Graph templates: renders `.ttl.liquid` templates under `data/templates` with account data and uploads the resulting RDF graphs.
|
|
11
|
+
- Extensible: override protocols via `WORDLIFT_OVERRIDE_DIR` without changing the library code.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install wordlift-sdk
|
|
17
|
+
# or
|
|
18
|
+
poetry add wordlift-sdk
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Requires Python 3.10–3.13.
|
|
22
|
+
|
|
23
|
+
## Configuration
|
|
24
|
+
|
|
25
|
+
Settings are read in order: `config/default.py` (or a custom path you pass to `ConfigurationProvider.create`), environment variables, then (when available) Google Colab `userdata`.
|
|
26
|
+
|
|
27
|
+
Common options:
|
|
28
|
+
- `WORDLIFT_KEY` (required): WordLift API key.
|
|
29
|
+
- `API_URL`: WordLift API base URL, defaults to `https://api.wordlift.io`.
|
|
30
|
+
- `SITEMAP_URL`: XML sitemap to crawl; `SITEMAP_URL_PATTERN` optional regex to filter URLs.
|
|
31
|
+
- `SHEETS_URL`, `SHEETS_NAME`, `SHEETS_SERVICE_ACCOUNT`: use a Google Sheet as source; service account points to credentials file.
|
|
32
|
+
- `URLS`: list of URLs (e.g., `["https://example.com/a", "https://example.com/b"]`).
|
|
33
|
+
- `OVERWRITE`: re-import URLs even if already present (default `False`).
|
|
34
|
+
- `WEB_PAGE_IMPORT_WRITE_STRATEGY`: WordLift write strategy (default `createOrUpdateModel`).
|
|
35
|
+
- `EMBEDDING_PROPERTIES`: list of schema properties to embed.
|
|
36
|
+
- `WEB_PAGE_TYPES`: output schema types, defaults to `["http://schema.org/Article"]`.
|
|
37
|
+
- `GOOGLE_SEARCH_CONSOLE`: enable/disable Search Console handler (default `True`).
|
|
38
|
+
- `CONCURRENCY`: max concurrent handlers, defaults to `min(cpu_count(), 4)`.
|
|
39
|
+
- `WORDLIFT_OVERRIDE_DIR`: folder containing protocol overrides (default `app/overrides`).
|
|
40
|
+
|
|
41
|
+
Example `config/default.py`:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
WORDLIFT_KEY = "your-api-key"
|
|
45
|
+
SITEMAP_URL = "https://example.com/sitemap.xml"
|
|
46
|
+
SITEMAP_URL_PATTERN = r"^https://example.com/article/.*$"
|
|
47
|
+
GOOGLE_SEARCH_CONSOLE = True
|
|
48
|
+
WEB_PAGE_TYPES = ["http://schema.org/Article"]
|
|
49
|
+
EMBEDDING_PROPERTIES = [
|
|
50
|
+
"http://schema.org/headline",
|
|
51
|
+
"http://schema.org/abstract",
|
|
52
|
+
"http://schema.org/text",
|
|
53
|
+
]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Running the import workflow
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import asyncio
|
|
60
|
+
from wordlift_sdk import run_kg_import_workflow
|
|
61
|
+
|
|
62
|
+
if __name__ == "__main__":
|
|
63
|
+
asyncio.run(run_kg_import_workflow())
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
The workflow:
|
|
67
|
+
1. Renders and uploads RDF graphs from `data/templates/*.ttl.liquid` using account info.
|
|
68
|
+
2. Builds the configured URL source and filters out unchanged URLs (unless `OVERWRITE`).
|
|
69
|
+
3. Sends each URL to WordLift for import with retries and optional Search Console refresh.
|
|
70
|
+
|
|
71
|
+
You can build components yourself when you need more control:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import asyncio
|
|
75
|
+
from wordlift_sdk.container.application_container import ApplicationContainer
|
|
76
|
+
|
|
77
|
+
async def main():
|
|
78
|
+
container = ApplicationContainer()
|
|
79
|
+
workflow = await container.create_kg_import_workflow()
|
|
80
|
+
await workflow.run()
|
|
81
|
+
|
|
82
|
+
asyncio.run(main())
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Custom callbacks and overrides
|
|
86
|
+
|
|
87
|
+
Override the web page import callback by placing `web_page_import_protocol.py` with a `WebPageImportProtocol` class under `WORDLIFT_OVERRIDE_DIR` (default `app/overrides`). The callback receives a `WebPageImportResponse` and can push to `graph_queue` or `entity_patch_queue`.
|
|
88
|
+
|
|
89
|
+
## Templates
|
|
90
|
+
|
|
91
|
+
Add `.ttl.liquid` files under `data/templates`. Templates render with `account` fields available (e.g., `{{ account.dataset_uri }}`) and are uploaded before URL handling begins.
|
|
92
|
+
|
|
93
|
+
## Testing
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
poetry install --with dev
|
|
97
|
+
poetry run pytest
|
|
98
|
+
```
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "wordlift-sdk"
|
|
3
|
+
version = "2.7.5"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = ["David Riccitelli <david@wordlift.io>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
packages = [{ include = "wordlift_sdk" }]
|
|
8
|
+
include = ["wordlift_sdk/graphql/data/**"]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = ">=3.10, <3.14"
|
|
12
|
+
gql = { extras = ["aiohttp"], version = "^3.5.2" }
|
|
13
|
+
tenacity = "^9.0.0"
|
|
14
|
+
aiohttp = "^3.10.5"
|
|
15
|
+
# Google Colab requires pandas 2.1.4
|
|
16
|
+
pandas = ">=2.1.4, <2.3.0"
|
|
17
|
+
rdflib = "^7.0.0"
|
|
18
|
+
wordlift-client = "^1.117.0"
|
|
19
|
+
gspread = "^6.1.2"
|
|
20
|
+
google-auth = "^2.35.0"
|
|
21
|
+
tqdm = "^4.67.1"
|
|
22
|
+
advertools = ">0.16.6,<1.0.0"
|
|
23
|
+
pycountry = "^24.6.1"
|
|
24
|
+
python-liquid = "^2.0.1"
|
|
25
|
+
|
|
26
|
+
[tool.poetry.group.dev.dependencies]
|
|
27
|
+
pytest = "^8.3.3"
|
|
28
|
+
pytest-asyncio = "^0.24.0"
|
|
29
|
+
docker = "^7.1.0"
|
|
30
|
+
ruff = "^0.11.13"
|
|
31
|
+
pre-commit = "^4.2.0"
|
|
32
|
+
pandas-stubs = "^2.3.3.251201"
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["poetry-core"]
|
|
36
|
+
build-backend = "poetry.core.masonry.api"
|
|
37
|
+
|
|
38
|
+
# see https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915
|
|
39
|
+
[tool.pytest.ini_options]
|
|
40
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import wordlift_client
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ClientConfigurationFactory:
    """Factory for ``wordlift_client.Configuration`` objects.

    The factory remembers an API key and an API base URL and uses them every
    time :meth:`create` is called.
    """

    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io"):
        """Remember the credentials used to build client configurations.

        Args:
            key: The API key sent with the ``ApiKey`` authorization scheme.
            api_url: The API base URL used as the configuration host.
        """
        self._key = key
        self._api_url = api_url

    def create(self):
        """Return a new client configuration bound to the stored host and key."""
        client_configuration = wordlift_client.Configuration(host=self._api_url)

        # API key authorization: the key is registered under the `ApiKey`
        # scheme and sent with the `Key` prefix.
        client_configuration.api_key["ApiKey"] = self._key
        client_configuration.api_key_prefix["ApiKey"] = "Key"

        return client_configuration
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ConfigurationProvider:
    """Resolve configuration values from several sources.

    Values defined in an optional Python configuration file are loaded once at
    construction time. :meth:`get_value` then looks a key up, in order, in this
    module's globals, the loaded file, the process environment and (when the
    ``google.colab`` package can be imported) Google Colab ``userdata``.
    """

    _config: dict

    @staticmethod
    def create(filepath: str = "config/default.py") -> "ConfigurationProvider":
        """Return a provider that reads its file-based values from ``filepath``."""
        return ConfigurationProvider(filepath=filepath)

    def __init__(self, filepath: str):
        """Load the public names defined by the Python file at ``filepath``.

        A missing file is not an error: the provider then simply has no
        file-based values and relies on the other sources.

        Args:
            filepath: Path to a Python file whose top-level names (those not
                starting with an underscore) become configuration values.

        Raises:
            ImportError: If the file exists but no import spec/loader can be
                created for it (for example, an unrecognized file suffix).
        """
        if not os.path.exists(filepath):
            self._config = {}
            # Nothing to load: stop here instead of trying to execute a file
            # that does not exist.
            return

        spec = importlib.util.spec_from_file_location("local_config", filepath)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load configuration from {filepath!r}")

        module = importlib.util.module_from_spec(spec)
        # NOTE: this executes the configuration file as Python code, so the
        # file must come from a trusted location.
        spec.loader.exec_module(module)
        self._config = {
            name: getattr(module, name)
            for name in dir(module)
            if not name.startswith("_")
        }

    def get_value(self, key: str, default=None):
        """Return the value configured for ``key``, or ``default`` if none is found.

        Args:
            key: The configuration name to look up.
            default: The value returned when no source defines ``key``.
        """
        # 1. Check this module's globals.
        module_globals = globals()
        if key in module_globals:
            return module_globals[key]

        # 2. Check the values loaded from the configuration file.
        if key in self._config:
            return self._config[key]

        # 3. Check environment variables (values are always strings).
        if key in os.environ:
            return os.environ[key]

        # 4. Check Google Colab userdata.
        try:
            from google.colab import userdata

            secret = userdata.get(key)
            if secret is not None:
                return secret
        except ImportError:
            pass  # Not running in Google Colab

        # 5. Fall back to the provided default.
        return default
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import importlib.util
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def load_config_py(filepath="config.py"):
    """Load the public top-level names of a Python configuration file.

    Args:
        filepath: Path to the Python file to load. ``None`` is treated like the
            default, ``"config.py"``, so callers that forward an unset path do
            not fail.

    Returns:
        A dict mapping each name defined by the file that does not start with
        an underscore to its value, or an empty dict when the file does not
        exist.

    Raises:
        ImportError: If the file exists but no import spec/loader can be
            created for it (for example, an unrecognized file suffix).
    """
    if filepath is None:
        # `os.path.exists(None)` raises TypeError; fall back to the default.
        filepath = "config.py"

    if not os.path.exists(filepath):
        return {}

    spec = importlib.util.spec_from_file_location("local_config", filepath)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load configuration from {filepath!r}")

    module = importlib.util.module_from_spec(spec)
    # NOTE: this executes the configuration file as Python code, so the file
    # must come from a trusted location.
    spec.loader.exec_module(module)
    return {
        name: getattr(module, name)
        for name in dir(module)
        if not name.startswith("_")
    }
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_config_value(key, config_py_path=None, default=None):
    """Look ``key`` up in the available configuration sources.

    The sources are consulted in this order and the first hit wins: this
    module's globals, the file loaded by ``load_config_py``, the process
    environment, Google Colab ``userdata``.

    Args:
        key: The configuration name to look up.
        config_py_path: Forwarded unchanged to ``load_config_py``.
        default: The value returned when no source defines ``key``.
    """
    # Source 1: names defined at module level.
    module_globals = globals()
    if key in module_globals:
        return module_globals[key]

    # Source 2: the Python configuration file.
    file_values = load_config_py(config_py_path)
    if key in file_values:
        return file_values[key]

    # Source 3: environment variables (values are always strings).
    import os

    env_value = os.environ.get(key)
    if env_value is not None:
        return env_value

    # Source 4: Google Colab userdata, only when that package is importable.
    try:
        from google.colab import userdata

        colab_value = userdata.get(key)
    except ImportError:
        colab_value = None  # Not running in Google Colab

    # Source 5: the caller-provided default.
    return default if colab_value is None else colab_value
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from os import cpu_count
|
|
4
|
+
from typing import Optional, Union
|
|
5
|
+
|
|
6
|
+
import gspread
|
|
7
|
+
from google.auth.credentials import Credentials
|
|
8
|
+
from gspread import Client
|
|
9
|
+
from wordlift_client import Configuration, AccountInfo
|
|
10
|
+
|
|
11
|
+
from ..client.client_configuration_factory import ClientConfigurationFactory
|
|
12
|
+
from ..configuration import ConfigurationProvider
|
|
13
|
+
from ..graphql.client import GraphQlClientFactory, GraphQlClient, GqlClientProvider
|
|
14
|
+
from ..id_generator import IdGenerator
|
|
15
|
+
from ..protocol import Context
|
|
16
|
+
from ..protocol.entity_patch import EntityPatchQueue
|
|
17
|
+
from ..protocol.graph import GraphQueue
|
|
18
|
+
from ..url_source import (
|
|
19
|
+
SitemapUrlSource,
|
|
20
|
+
GoogleSheetsUrlSource,
|
|
21
|
+
ListUrlSource,
|
|
22
|
+
UrlSource,
|
|
23
|
+
)
|
|
24
|
+
from ..url_source.new_or_changed_url_source import NewOrChangedUrlSource
|
|
25
|
+
from ..utils import get_me
|
|
26
|
+
from ..workflow.kg_import_workflow import KgImportWorkflow
|
|
27
|
+
from ..workflow.url_handler import WebPageImportUrlHandler
|
|
28
|
+
from ..workflow.url_handler.default_url_handler import DefaultUrlHandler
|
|
29
|
+
from ..workflow.url_handler.search_console_url_handler import SearchConsoleUrlHandler
|
|
30
|
+
from ..workflow.url_handler.url_handler import UrlHandler
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class UrlSourceInput:
    """
    Bundle of every parameter that may be needed to build a URL source.

    All fields are optional and default to ``None``; the code that consumes
    this structure picks the URL source to create depending on which fields
    are set.
    """

    # Sitemap-based source.
    sitemap_url: Optional[str] = None
    sitemap_url_pattern: Optional[str] = None

    # Google Sheets-based source.
    sheets_url: Optional[str] = None
    sheets_name: Optional[str] = None
    sheets_creds_or_client: Optional[Union[Credentials, Client]] = None

    # Explicit list of URLs.
    urls: Optional[list[str]] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ApplicationContainer:
    """Create the SDK components from the values of a ``ConfigurationProvider``.

    ``API_URL`` and ``WORDLIFT_KEY`` are read when the container is built. The
    shared ``Context`` and the GraphQL client are created on first use and then
    reused; the other ``create_*`` methods return a new object on every call.
    """

    _api_url: str
    _client_configuration: Configuration
    _configuration_provider: ConfigurationProvider
    _key: str

    # Lazily created singletons, see `get_context` and `get_graphql_client`.
    _context: Context | None = None
    _graphql_client: GraphQlClient | None = None

    def __init__(self, configuration_provider: ConfigurationProvider | None = None):
        """Read the API settings and build the client configuration.

        Args:
            configuration_provider: Source of configuration values. When
                omitted, ``ConfigurationProvider.create()`` is used.
        """
        self._configuration_provider = (
            configuration_provider or ConfigurationProvider.create()
        )
        self._api_url = self._configuration_provider.get_value(
            "API_URL", "https://api.wordlift.io"
        )
        self._key = self._configuration_provider.get_value("WORDLIFT_KEY")
        self._client_configuration = ClientConfigurationFactory(
            key=self._key,
            api_url=self._api_url,
        ).create()

    async def get_account(self) -> AccountInfo:
        """Return the account info obtained via ``get_me`` for the configured client."""
        return await get_me(configuration=self._client_configuration)

    async def get_context(self) -> Context:
        """Return the shared ``Context``, creating it on the first call."""
        if not self._context:
            account = await self.get_account()
            self._context = Context(
                account=account,
                client_configuration=self._client_configuration,
                configuration_provider=self._configuration_provider,
                id_generator=IdGenerator(account=account),
                graph_queue=GraphQueue(client_configuration=self._client_configuration),
                entity_patch_queue=EntityPatchQueue(
                    client_configuration=self._client_configuration
                ),
            )

        return self._context

    async def create_web_page_import_url_handler(self) -> WebPageImportUrlHandler:
        """Create the web page import handler.

        Uses ``WEB_PAGE_IMPORT_WRITE_STRATEGY``, ``EMBEDDING_PROPERTIES`` and
        ``WEB_PAGE_TYPES``, each with the default shown below.
        """
        write_strategy = self._configuration_provider.get_value(
            "WEB_PAGE_IMPORT_WRITE_STRATEGY", "createOrUpdateModel"
        )
        return WebPageImportUrlHandler(
            context=await self.get_context(),
            embedding_properties=self._configuration_provider.get_value(
                "EMBEDDING_PROPERTIES",
                [
                    "http://schema.org/headline",
                    "http://schema.org/abstract",
                    "http://schema.org/text",
                ],
            ),
            web_page_types=self._configuration_provider.get_value(
                "WEB_PAGE_TYPES", ["http://schema.org/Article"]
            ),
            write_strategy=write_strategy,
        )

    async def create_search_console_url_handler(self):
        """Create the Search Console handler with the shared context and GraphQL client."""
        return SearchConsoleUrlHandler(
            context=await self.get_context(),
            graphql_client=await self.get_graphql_client(),
        )

    async def create_multi_url_handler(self):
        """Create a ``DefaultUrlHandler`` wrapping the enabled URL handlers.

        The web page import handler is always included. The Search Console
        handler is added only when ``GOOGLE_SEARCH_CONSOLE`` resolves to the
        boolean ``True`` (which is also the default).
        """
        handlers: list[UrlHandler] = [
            await self.create_web_page_import_url_handler(),
        ]
        # NOTE: the identity check means other truthy values, such as the
        # string "True" coming from an environment variable, do not enable
        # the handler.
        if (
            self._configuration_provider.get_value("GOOGLE_SEARCH_CONSOLE", True)
            is True
        ):
            handlers.append(await self.create_search_console_url_handler())

        return DefaultUrlHandler(url_handler_list=handlers)

    async def create_kg_import_workflow(self) -> KgImportWorkflow:
        """Create the import workflow from the configured source and handlers.

        ``CONCURRENCY`` defaults to the CPU count capped at 4.
        """
        # `cpu_count()` returns None when the count cannot be determined;
        # fall back to 1 so that `min` never compares None with an int.
        concurrency = self._configuration_provider.get_value(
            "CONCURRENCY", min(cpu_count() or 1, 4)
        )
        return KgImportWorkflow(
            context=await self.get_context(),
            url_source=await self.create_new_or_changed_source(),
            url_handler=await self.create_multi_url_handler(),
            concurrency=concurrency,
        )

    async def create_graphql_client_factory(self) -> GraphQlClientFactory:
        """Create a GraphQL client factory targeting ``<API_URL>/graphql``."""
        return GraphQlClientFactory(key=self._key, api_url=self._api_url + "/graphql")

    async def create_gql_client_provider(self) -> GqlClientProvider:
        """Create a ``GqlClientProvider`` from a new GraphQL client factory."""
        graphql_client_factory = await self.create_graphql_client_factory()
        return graphql_client_factory.create_provider()

    async def get_graphql_client(self) -> GraphQlClient:
        """Return the shared GraphQL client, creating it on the first call."""
        if self._graphql_client is None:
            graphql_client_factory = await self.create_graphql_client_factory()
            self._graphql_client = graphql_client_factory.create()

        return self._graphql_client

    async def create_url_source(self) -> UrlSource:
        """Create the URL source selected by the configuration.

        The first usable option wins, in this order: ``SITEMAP_URL`` (with the
        optional ``SITEMAP_URL_PATTERN`` regex), the Google Sheets settings
        (``SHEETS_URL``, ``SHEETS_NAME``, ``SHEETS_SERVICE_ACCOUNT``), then
        ``URLS``.

        Raises:
            ValueError: If no option is configured, or if the configured
                values are all empty.
        """
        # Read the configuration through the provider.
        sitemap_url = self._configuration_provider.get_value("SITEMAP_URL")
        sitemap_url_pattern = self._configuration_provider.get_value(
            "SITEMAP_URL_PATTERN", None
        )
        sheets_url = self._configuration_provider.get_value("SHEETS_URL")
        sheets_name = self._configuration_provider.get_value("SHEETS_NAME")
        sheets_service_account = self._configuration_provider.get_value(
            "SHEETS_SERVICE_ACCOUNT"
        )
        urls = self._configuration_provider.get_value("URLS")

        # Fail early when nothing at all has been configured.
        if (
            sitemap_url is None
            and urls is None
            and (
                sheets_url is None
                or sheets_name is None
                or sheets_service_account is None
            )
        ):
            raise ValueError(
                "One of `sitemap_url`, `urls` or `sheets_url`/`sheets_name`/`sheets_service_account` is required."
            )

        input_params = UrlSourceInput(
            sitemap_url=sitemap_url,
            sitemap_url_pattern=sitemap_url_pattern,
            sheets_url=sheets_url,
            sheets_name=sheets_name,
            # The service account setting is passed to gspread as a filename.
            sheets_creds_or_client=(
                gspread.service_account(filename=sheets_service_account)
                if sheets_service_account
                else None
            ),
            urls=urls,
        )

        # Try to create a sitemap source if sitemap_url is provided.
        if input_params.sitemap_url:
            return SitemapUrlSource(
                input_params.sitemap_url,
                re.compile(input_params.sitemap_url_pattern)
                if input_params.sitemap_url_pattern
                else None,
            )

        # Try to create a Google Sheets source if all the required sheets
        # parameters are provided.
        if (
            input_params.sheets_url
            and input_params.sheets_name
            and input_params.sheets_creds_or_client
        ):
            return GoogleSheetsUrlSource(
                input_params.sheets_creds_or_client,
                input_params.sheets_url,
                input_params.sheets_name,
            )

        # Try to create a list source if urls is provided.
        if input_params.urls:
            return ListUrlSource(input_params.urls)

        # Reached when the values are set but empty (e.g. `URLS = []` or an
        # empty sitemap URL), which passes the `is None` checks above.
        raise ValueError(
            "No valid parameters provided to create a URL provider. "
            "Please provide either sitemap_url, all sheets parameters "
            "(sheets_url, sheets_name, sheets_creds_or_client), or urls."
        )

    async def create_new_or_changed_source(self) -> UrlSource:
        """Wrap the configured URL source in a ``NewOrChangedUrlSource``.

        ``OVERWRITE`` (default ``False``) is forwarded to the wrapper.
        """
        overwrite = self._configuration_provider.get_value("OVERWRITE", False)
        return NewOrChangedUrlSource(
            url_provider=await self.create_url_source(),
            graphql_client=await self.get_graphql_client(),
            overwrite=overwrite,
        )

    async def create_url_source_with_overwrite(self) -> UrlSource:
        """Alias of :meth:`create_new_or_changed_source`."""
        return await self.create_new_or_changed_source()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas import DataFrame
|
|
5
|
+
from tqdm.asyncio import tqdm
|
|
6
|
+
|
|
7
|
+
from ..graphql.utils.query import entity_with_top_query_factory
|
|
8
|
+
from ..utils import create_delayed
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
async def create_entities_with_top_query_dataframe(
    key: str, url_list: list[str]
) -> DataFrame:
    """Build one dataframe with the top-query data of the entities at the given URLs.

    Args:
        key: Passed to ``entity_with_top_query_factory`` to build the query
            function (presumably the WordLift API key; confirm against the
            factory).
        url_list: The URLs to query, one request per URL.

    Returns:
        The per-URL dataframes concatenated under a new index. URLs whose
        result is ``None`` are skipped. An empty dataframe is returned when
        there is nothing to concatenate.
    """
    # Get the entities data with the top query.
    logger.info("Loading entities with top query...")
    entity_with_top_query = await entity_with_top_query_factory(key)
    # The second argument is presumably a concurrency limit; confirm against
    # `create_delayed`.
    delayed = create_delayed(entity_with_top_query, 4)
    entities_with_top_query = await tqdm.gather(
        *[delayed(url) for url in url_list], total=len(url_list)
    )

    # Get a list of dataframes.
    dataframes = [
        obj.to_dataframe() for obj in entities_with_top_query if obj is not None
    ]

    # `pd.concat` raises ValueError on an empty list, which happens when
    # `url_list` is empty or every lookup returned None.
    if not dataframes:
        return DataFrame()

    # Concat them together, with a new index.
    return pd.concat(dataframes, ignore_index=True)
|