wordlift-sdk 2.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. wordlift_sdk-2.7.5/PKG-INFO +125 -0
  2. wordlift_sdk-2.7.5/README.md +98 -0
  3. wordlift_sdk-2.7.5/pyproject.toml +40 -0
  4. wordlift_sdk-2.7.5/wordlift_sdk/__init__.py +3 -0
  5. wordlift_sdk-2.7.5/wordlift_sdk/client/__init__.py +3 -0
  6. wordlift_sdk-2.7.5/wordlift_sdk/client/client_configuration_factory.py +26 -0
  7. wordlift_sdk-2.7.5/wordlift_sdk/configuration/__init__.py +4 -0
  8. wordlift_sdk-2.7.5/wordlift_sdk/configuration/configuration_provider.py +44 -0
  9. wordlift_sdk-2.7.5/wordlift_sdk/configuration/get_config_value.py +39 -0
  10. wordlift_sdk-2.7.5/wordlift_sdk/container/__init__.py +3 -0
  11. wordlift_sdk-2.7.5/wordlift_sdk/container/application_container.py +234 -0
  12. wordlift_sdk-2.7.5/wordlift_sdk/deprecated/__init__.py +5 -0
  13. wordlift_sdk-2.7.5/wordlift_sdk/deprecated/create_entities_with_top_query_dataframe.py +30 -0
  14. wordlift_sdk-2.7.5/wordlift_sdk/entity/__init__.py +4 -0
  15. wordlift_sdk-2.7.5/wordlift_sdk/entity/enrich.py +54 -0
  16. wordlift_sdk-2.7.5/wordlift_sdk/entity/patch.py +14 -0
  17. wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/__init__.py +5 -0
  18. wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/create_google_search_console_data_import.py +59 -0
  19. wordlift_sdk-2.7.5/wordlift_sdk/google_search_console/raise_error_if_account_analytics_not_configured.py +20 -0
  20. wordlift_sdk-2.7.5/wordlift_sdk/graph/graph_bag.py +7 -0
  21. wordlift_sdk-2.7.5/wordlift_sdk/graph/ttl_liquid/__init__.py +3 -0
  22. wordlift_sdk-2.7.5/wordlift_sdk/graph/ttl_liquid/ttl_liquid_graph_factory.py +43 -0
  23. wordlift_sdk-2.7.5/wordlift_sdk/graphql/__init__.py +3 -0
  24. wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/__init__.py +5 -0
  25. wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/client.py +70 -0
  26. wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/factory.py +36 -0
  27. wordlift_sdk-2.7.5/wordlift_sdk/graphql/client/gql_client_provider.py +26 -0
  28. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_by_type.graphql +9 -0
  29. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_embedding_value.graphql +15 -0
  30. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_top_query.graphql +20 -0
  31. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_id.graphql +6 -0
  32. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_iri.graphql +7 -0
  33. wordlift_sdk-2.7.5/wordlift_sdk/graphql/data/entities_url_iri_with_source_equal_to_web_page_import.graphql +12 -0
  34. wordlift_sdk-2.7.5/wordlift_sdk/graphql/query.py +20 -0
  35. wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/__init__.py +0 -0
  36. wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/__init__.py +4 -0
  37. wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/entity_top_query.py +56 -0
  38. wordlift_sdk-2.7.5/wordlift_sdk/graphql/utils/query/entity_with_top_query.py +52 -0
  39. wordlift_sdk-2.7.5/wordlift_sdk/id_generator/__init__.py +3 -0
  40. wordlift_sdk-2.7.5/wordlift_sdk/id_generator/id_generator.py +40 -0
  41. wordlift_sdk-2.7.5/wordlift_sdk/id_generator/id_generator_interface.py +8 -0
  42. wordlift_sdk-2.7.5/wordlift_sdk/internal_link/__init__.py +3 -0
  43. wordlift_sdk-2.7.5/wordlift_sdk/internal_link/utils.py +231 -0
  44. wordlift_sdk-2.7.5/wordlift_sdk/kg/__init__.py +5 -0
  45. wordlift_sdk-2.7.5/wordlift_sdk/kg/entity.py +17 -0
  46. wordlift_sdk-2.7.5/wordlift_sdk/kg/entity_store.py +94 -0
  47. wordlift_sdk-2.7.5/wordlift_sdk/kg/entity_store_factory.py +13 -0
  48. wordlift_sdk-2.7.5/wordlift_sdk/kg/relation/__init__.py +0 -0
  49. wordlift_sdk-2.7.5/wordlift_sdk/kg/relation/relation_service.py +78 -0
  50. wordlift_sdk-2.7.5/wordlift_sdk/main.py +7 -0
  51. wordlift_sdk-2.7.5/wordlift_sdk/namespace/SDO.py +3281 -0
  52. wordlift_sdk-2.7.5/wordlift_sdk/namespace/__init__.py +3 -0
  53. wordlift_sdk-2.7.5/wordlift_sdk/notebook/__init__.py +3 -0
  54. wordlift_sdk-2.7.5/wordlift_sdk/notebook/install_if_missing.py +12 -0
  55. wordlift_sdk-2.7.5/wordlift_sdk/protocol/__init__.py +5 -0
  56. wordlift_sdk-2.7.5/wordlift_sdk/protocol/context.py +21 -0
  57. wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/__init__.py +4 -0
  58. wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/entity_patch.py +8 -0
  59. wordlift_sdk-2.7.5/wordlift_sdk/protocol/entity_patch/entity_patch_queue.py +49 -0
  60. wordlift_sdk-2.7.5/wordlift_sdk/protocol/graph/__init__.py +3 -0
  61. wordlift_sdk-2.7.5/wordlift_sdk/protocol/graph/graph_queue.py +64 -0
  62. wordlift_sdk-2.7.5/wordlift_sdk/protocol/load_override_class.py +30 -0
  63. wordlift_sdk-2.7.5/wordlift_sdk/protocol/web_page_import_protocol.py +23 -0
  64. wordlift_sdk-2.7.5/wordlift_sdk/url_source/__init__.py +6 -0
  65. wordlift_sdk-2.7.5/wordlift_sdk/url_source/google_sheets_url_source.py +53 -0
  66. wordlift_sdk-2.7.5/wordlift_sdk/url_source/list_url_source.py +28 -0
  67. wordlift_sdk-2.7.5/wordlift_sdk/url_source/new_or_changed_url_source.py +57 -0
  68. wordlift_sdk-2.7.5/wordlift_sdk/url_source/sitemap_url_source.py +36 -0
  69. wordlift_sdk-2.7.5/wordlift_sdk/url_source/url_source.py +18 -0
  70. wordlift_sdk-2.7.5/wordlift_sdk/url_source/url_source_input.py +6 -0
  71. wordlift_sdk-2.7.5/wordlift_sdk/utils/__init__.py +17 -0
  72. wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_from_google_sheets.py +26 -0
  73. wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_entities_by_types.py +26 -0
  74. wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_entities_with_embedding_vectors.py +28 -0
  75. wordlift_sdk-2.7.5/wordlift_sdk/utils/create_dataframe_of_url_iri.py +14 -0
  76. wordlift_sdk-2.7.5/wordlift_sdk/utils/create_entity_patch_request.py +14 -0
  77. wordlift_sdk-2.7.5/wordlift_sdk/utils/delayed.py +12 -0
  78. wordlift_sdk-2.7.5/wordlift_sdk/utils/get_me.py +8 -0
  79. wordlift_sdk-2.7.5/wordlift_sdk/utils/import_url.py +35 -0
  80. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/__init__.py +0 -0
  81. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/__init__.py +3 -0
  82. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/create_entity_gaps_factory.py +51 -0
  83. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/entity_gaps/entity_gaps_callback.py +32 -0
  84. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/__init__.py +0 -0
  85. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/__init__.py +14 -0
  86. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/__init__.py +4 -0
  87. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_import_url_protocol.py +39 -0
  88. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/default/default_parse_html_protocol.py +41 -0
  89. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/import_url_protocol_interface.py +21 -0
  90. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/parse_html_protocol_interface.py +27 -0
  91. wordlift_sdk-2.7.5/wordlift_sdk/wordlift/sitemap_import/protocol/protocol_context.py +16 -0
  92. wordlift_sdk-2.7.5/wordlift_sdk/workflow/__init__.py +3 -0
  93. wordlift_sdk-2.7.5/wordlift_sdk/workflow/create_or_update_entities_factory.py +16 -0
  94. wordlift_sdk-2.7.5/wordlift_sdk/workflow/kg_import_workflow.py +49 -0
  95. wordlift_sdk-2.7.5/wordlift_sdk/workflow/patch_entities_factory.py +16 -0
  96. wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/__init__.py +3 -0
  97. wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/default_url_handler.py +23 -0
  98. wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/search_console_url_handler.py +79 -0
  99. wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/url_handler.py +8 -0
  100. wordlift_sdk-2.7.5/wordlift_sdk/workflow/url_handler/web_page_import_url_handler.py +104 -0
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: wordlift-sdk
3
+ Version: 2.7.5
4
+ Summary:
5
+ Author: David Riccitelli
6
+ Author-email: david@wordlift.io
7
+ Requires-Python: >=3.10,<3.14
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: advertools (>0.16.6,<1.0.0)
14
+ Requires-Dist: aiohttp (>=3.10.5,<4.0.0)
15
+ Requires-Dist: google-auth (>=2.35.0,<3.0.0)
16
+ Requires-Dist: gql[aiohttp] (>=3.5.2,<4.0.0)
17
+ Requires-Dist: gspread (>=6.1.2,<7.0.0)
18
+ Requires-Dist: pandas (>=2.1.4,<2.3.0)
19
+ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
20
+ Requires-Dist: python-liquid (>=2.0.1,<3.0.0)
21
+ Requires-Dist: rdflib (>=7.0.0,<8.0.0)
22
+ Requires-Dist: tenacity (>=9.0.0,<10.0.0)
23
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
24
+ Requires-Dist: wordlift-client (>=1.117.0,<2.0.0)
25
+ Description-Content-Type: text/markdown
26
+
27
+ # WordLift Python SDK
28
+
29
+ A Python toolkit for orchestrating WordLift imports: fetch URLs from sitemaps, Google Sheets, or explicit lists, filter out already imported pages, enqueue search console jobs, push RDF graphs, and call the WordLift APIs to import web pages.
30
+
31
+ ## Features
32
+ - URL sources: XML sitemaps (with optional regex filtering), Google Sheets (`url` column), or Python lists.
33
+ - Change detection: skips URLs that are already imported unless `OVERWRITE` is enabled; re-imports when `lastmod` is newer.
34
+ - Web page imports: sends URLs to WordLift with embedding requests, output types, retry logic, and pluggable callbacks.
35
+ - Search Console refresh: triggers analytics imports when top queries are stale.
36
+ - Graph templates: renders `.ttl.liquid` templates under `data/templates` with account data and uploads the resulting RDF graphs.
37
+ - Extensible: override protocols via `WORDLIFT_OVERRIDE_DIR` without changing the library code.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install wordlift-sdk
43
+ # or
44
+ poetry add wordlift-sdk
45
+ ```
46
+
47
+ Requires Python 3.10–3.13.
48
+
49
+ ## Configuration
50
+
51
+ Settings are read in order: `config/default.py` (or a custom path you pass to `ConfigurationProvider.create`), environment variables, then (when available) Google Colab `userdata`.
52
+
53
+ Common options:
54
+ - `WORDLIFT_KEY` (required): WordLift API key.
55
+ - `API_URL`: WordLift API base URL, defaults to `https://api.wordlift.io`.
56
+ - `SITEMAP_URL`: XML sitemap to crawl; `SITEMAP_URL_PATTERN` optional regex to filter URLs.
57
+ - `SHEETS_URL`, `SHEETS_NAME`, `SHEETS_SERVICE_ACCOUNT`: use a Google Sheet as source; service account points to credentials file.
58
+ - `URLS`: list of URLs (e.g., `["https://example.com/a", "https://example.com/b"]`).
59
+ - `OVERWRITE`: re-import URLs even if already present (default `False`).
60
+ - `WEB_PAGE_IMPORT_WRITE_STRATEGY`: WordLift write strategy (default `createOrUpdateModel`).
61
+ - `EMBEDDING_PROPERTIES`: list of schema properties to embed.
62
+ - `WEB_PAGE_TYPES`: output schema types, defaults to `["http://schema.org/Article"]`.
63
+ - `GOOGLE_SEARCH_CONSOLE`: enable/disable Search Console handler (default `True`).
64
+ - `CONCURRENCY`: max concurrent handlers, defaults to `min(cpu_count(), 4)`.
65
+ - `WORDLIFT_OVERRIDE_DIR`: folder containing protocol overrides (default `app/overrides`).
66
+
67
+ Example `config/default.py`:
68
+
69
+ ```python
70
+ WORDLIFT_KEY = "your-api-key"
71
+ SITEMAP_URL = "https://example.com/sitemap.xml"
72
+ SITEMAP_URL_PATTERN = r"^https://example.com/article/.*$"
73
+ GOOGLE_SEARCH_CONSOLE = True
74
+ WEB_PAGE_TYPES = ["http://schema.org/Article"]
75
+ EMBEDDING_PROPERTIES = [
76
+ "http://schema.org/headline",
77
+ "http://schema.org/abstract",
78
+ "http://schema.org/text",
79
+ ]
80
+ ```
81
+
82
+ ## Running the import workflow
83
+
84
+ ```python
85
+ import asyncio
86
+ from wordlift_sdk import run_kg_import_workflow
87
+
88
+ if __name__ == "__main__":
89
+ asyncio.run(run_kg_import_workflow())
90
+ ```
91
+
92
+ The workflow:
93
+ 1. Renders and uploads RDF graphs from `data/templates/*.ttl.liquid` using account info.
94
+ 2. Builds the configured URL source and filters out unchanged URLs (unless `OVERWRITE`).
95
+ 3. Sends each URL to WordLift for import with retries and optional Search Console refresh.
96
+
97
+ You can build components yourself when you need more control:
98
+
99
+ ```python
100
+ import asyncio
101
+ from wordlift_sdk.container.application_container import ApplicationContainer
102
+
103
+ async def main():
104
+ container = ApplicationContainer()
105
+ workflow = await container.create_kg_import_workflow()
106
+ await workflow.run()
107
+
108
+ asyncio.run(main())
109
+ ```
110
+
111
+ ## Custom callbacks and overrides
112
+
113
+ Override the web page import callback by placing `web_page_import_protocol.py` with a `WebPageImportProtocol` class under `WORDLIFT_OVERRIDE_DIR` (default `app/overrides`). The callback receives a `WebPageImportResponse` and can push to `graph_queue` or `entity_patch_queue`.
114
+
115
+ ## Templates
116
+
117
+ Add `.ttl.liquid` files under `data/templates`. Templates render with `account` fields available (e.g., `{{ account.dataset_uri }}`) and are uploaded before URL handling begins.
118
+
119
+ ## Testing
120
+
121
+ ```bash
122
+ poetry install --with dev
123
+ poetry run pytest
124
+ ```
125
+
@@ -0,0 +1,98 @@
1
+ # WordLift Python SDK
2
+
3
+ A Python toolkit for orchestrating WordLift imports: fetch URLs from sitemaps, Google Sheets, or explicit lists, filter out already imported pages, enqueue search console jobs, push RDF graphs, and call the WordLift APIs to import web pages.
4
+
5
+ ## Features
6
+ - URL sources: XML sitemaps (with optional regex filtering), Google Sheets (`url` column), or Python lists.
7
+ - Change detection: skips URLs that are already imported unless `OVERWRITE` is enabled; re-imports when `lastmod` is newer.
8
+ - Web page imports: sends URLs to WordLift with embedding requests, output types, retry logic, and pluggable callbacks.
9
+ - Search Console refresh: triggers analytics imports when top queries are stale.
10
+ - Graph templates: renders `.ttl.liquid` templates under `data/templates` with account data and uploads the resulting RDF graphs.
11
+ - Extensible: override protocols via `WORDLIFT_OVERRIDE_DIR` without changing the library code.
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install wordlift-sdk
17
+ # or
18
+ poetry add wordlift-sdk
19
+ ```
20
+
21
+ Requires Python 3.10–3.13.
22
+
23
+ ## Configuration
24
+
25
+ Settings are read in order: `config/default.py` (or a custom path you pass to `ConfigurationProvider.create`), environment variables, then (when available) Google Colab `userdata`.
26
+
27
+ Common options:
28
+ - `WORDLIFT_KEY` (required): WordLift API key.
29
+ - `API_URL`: WordLift API base URL, defaults to `https://api.wordlift.io`.
30
+ - `SITEMAP_URL`: XML sitemap to crawl; `SITEMAP_URL_PATTERN` optional regex to filter URLs.
31
+ - `SHEETS_URL`, `SHEETS_NAME`, `SHEETS_SERVICE_ACCOUNT`: use a Google Sheet as source; service account points to credentials file.
32
+ - `URLS`: list of URLs (e.g., `["https://example.com/a", "https://example.com/b"]`).
33
+ - `OVERWRITE`: re-import URLs even if already present (default `False`).
34
+ - `WEB_PAGE_IMPORT_WRITE_STRATEGY`: WordLift write strategy (default `createOrUpdateModel`).
35
+ - `EMBEDDING_PROPERTIES`: list of schema properties to embed.
36
+ - `WEB_PAGE_TYPES`: output schema types, defaults to `["http://schema.org/Article"]`.
37
+ - `GOOGLE_SEARCH_CONSOLE`: enable/disable Search Console handler (default `True`).
38
+ - `CONCURRENCY`: max concurrent handlers, defaults to `min(cpu_count(), 4)`.
39
+ - `WORDLIFT_OVERRIDE_DIR`: folder containing protocol overrides (default `app/overrides`).
40
+
41
+ Example `config/default.py`:
42
+
43
+ ```python
44
+ WORDLIFT_KEY = "your-api-key"
45
+ SITEMAP_URL = "https://example.com/sitemap.xml"
46
+ SITEMAP_URL_PATTERN = r"^https://example.com/article/.*$"
47
+ GOOGLE_SEARCH_CONSOLE = True
48
+ WEB_PAGE_TYPES = ["http://schema.org/Article"]
49
+ EMBEDDING_PROPERTIES = [
50
+ "http://schema.org/headline",
51
+ "http://schema.org/abstract",
52
+ "http://schema.org/text",
53
+ ]
54
+ ```
55
+
56
+ ## Running the import workflow
57
+
58
+ ```python
59
+ import asyncio
60
+ from wordlift_sdk import run_kg_import_workflow
61
+
62
+ if __name__ == "__main__":
63
+ asyncio.run(run_kg_import_workflow())
64
+ ```
65
+
66
+ The workflow:
67
+ 1. Renders and uploads RDF graphs from `data/templates/*.ttl.liquid` using account info.
68
+ 2. Builds the configured URL source and filters out unchanged URLs (unless `OVERWRITE`).
69
+ 3. Sends each URL to WordLift for import with retries and optional Search Console refresh.
70
+
71
+ You can build components yourself when you need more control:
72
+
73
+ ```python
74
+ import asyncio
75
+ from wordlift_sdk.container.application_container import ApplicationContainer
76
+
77
+ async def main():
78
+ container = ApplicationContainer()
79
+ workflow = await container.create_kg_import_workflow()
80
+ await workflow.run()
81
+
82
+ asyncio.run(main())
83
+ ```
84
+
85
+ ## Custom callbacks and overrides
86
+
87
+ Override the web page import callback by placing `web_page_import_protocol.py` with a `WebPageImportProtocol` class under `WORDLIFT_OVERRIDE_DIR` (default `app/overrides`). The callback receives a `WebPageImportResponse` and can push to `graph_queue` or `entity_patch_queue`.
88
+
89
+ ## Templates
90
+
91
+ Add `.ttl.liquid` files under `data/templates`. Templates render with `account` fields available (e.g., `{{ account.dataset_uri }}`) and are uploaded before URL handling begins.
92
+
93
+ ## Testing
94
+
95
+ ```bash
96
+ poetry install --with dev
97
+ poetry run pytest
98
+ ```
@@ -0,0 +1,40 @@
1
+ [tool.poetry]
2
+ name = "wordlift-sdk"
3
+ version = "2.7.5"
4
+ description = ""
5
+ authors = ["David Riccitelli <david@wordlift.io>"]
6
+ readme = "README.md"
7
+ packages = [{ include = "wordlift_sdk" }]
8
+ include = ["wordlift_sdk/graphql/data/**"]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = ">=3.10, <3.14"
12
+ gql = { extras = ["aiohttp"], version = "^3.5.2" }
13
+ tenacity = "^9.0.0"
14
+ aiohttp = "^3.10.5"
15
+ # Google Colab requires pandas 2.1.4
16
+ pandas = ">=2.1.4, <2.3.0"
17
+ rdflib = "^7.0.0"
18
+ wordlift-client = "^1.117.0"
19
+ gspread = "^6.1.2"
20
+ google-auth = "^2.35.0"
21
+ tqdm = "^4.67.1"
22
+ advertools = ">0.16.6,<1.0.0"
23
+ pycountry = "^24.6.1"
24
+ python-liquid = "^2.0.1"
25
+
26
+ [tool.poetry.group.dev.dependencies]
27
+ pytest = "^8.3.3"
28
+ pytest-asyncio = "^0.24.0"
29
+ docker = "^7.1.0"
30
+ ruff = "^0.11.13"
31
+ pre-commit = "^4.2.0"
32
+ pandas-stubs = "^2.3.3.251201"
33
+
34
+ [build-system]
35
+ requires = ["poetry-core"]
36
+ build-backend = "poetry.core.masonry.api"
37
+
38
+ # see https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915
39
+ [tool.pytest.ini_options]
40
+ asyncio_default_fixture_loop_scope = "function"
@@ -0,0 +1,3 @@
1
+ from .main import run_kg_import_workflow
2
+
3
+ __all__ = ['run_kg_import_workflow']
@@ -0,0 +1,3 @@
1
+ from .client_configuration_factory import ClientConfigurationFactory
2
+
3
+ __all__ = ['ClientConfigurationFactory']
@@ -0,0 +1,26 @@
1
+ import wordlift_client
2
+
3
+
4
class ClientConfigurationFactory:
    """Builds `wordlift_client.Configuration` objects pre-wired with API-key auth."""

    _api_url: str
    _key: str

    def __init__(self, key: str, api_url: str = "https://api.wordlift.io"):
        # Remember the credentials and target host for later `create()` calls.
        self._key = key
        self._api_url = api_url

    def create(self):
        """Return a client configuration authenticated via the `ApiKey` scheme.

        The generated WordLift client derives an `Authorization: Key <key>`
        style header from the `api_key` / `api_key_prefix` settings below, in
        accordance with the API server security policy.
        """
        configuration = wordlift_client.Configuration(host=self._api_url)

        # Configure API key authorization: ApiKey
        configuration.api_key['ApiKey'] = self._key
        configuration.api_key_prefix['ApiKey'] = 'Key'

        return configuration
@@ -0,0 +1,4 @@
1
+ from .configuration_provider import ConfigurationProvider
2
+ from .get_config_value import get_config_value
3
+
4
+ __all__ = ['ConfigurationProvider', 'get_config_value']
@@ -0,0 +1,44 @@
1
+ import importlib.util
2
+ import os
3
+
4
+
5
class ConfigurationProvider:
    """Resolves configuration values from several layered sources.

    Lookup order in :meth:`get_value`: this module's globals, the loaded
    configuration file, environment variables, Google Colab ``userdata``
    (when available), and finally the caller-supplied default.
    """

    # Key/value pairs loaded from the configuration file (empty if missing).
    _config: dict

    @staticmethod
    def create(filepath: str = "config/default.py") -> "ConfigurationProvider":
        """Factory helper mirroring the constructor, with the default path."""
        return ConfigurationProvider(filepath=filepath)

    def __init__(self, filepath: str):
        """Load ``filepath`` as a Python module and keep its public attributes.

        A missing file is not an error: the provider starts with an empty
        configuration and falls back to the other sources in ``get_value``.
        """
        if not os.path.exists(filepath):
            self._config = {}
            # Bug fix: previously execution fell through and attempted to
            # exec the missing file anyway, raising FileNotFoundError.
            return
        spec = importlib.util.spec_from_file_location("local_config", filepath)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        self._config = {
            k: getattr(module, k) for k in dir(module) if not k.startswith("_")
        }

    def get_value(self, key: str, default=None):
        """Return the value for ``key``, checking each source in order."""
        # 1. Check this module's globals (allows monkey-patched overrides).
        if key in globals():
            return globals()[key]

        # 2. Check the values loaded from the configuration file.
        if key in self._config:
            return self._config[key]

        # 3. Check environment variables.
        if key in os.environ:
            return os.environ[key]

        # 4. Check Google Colab userdata (import only succeeds inside Colab).
        try:
            from google.colab import userdata

            secret = userdata.get(key)
            if secret is not None:
                return secret
        except ImportError:
            pass  # Not running in Google Colab

        # 5. Return default if provided
        return default
@@ -0,0 +1,39 @@
1
+ import os
2
+ import importlib.util
3
+
4
+
5
def load_config_py(filepath="config.py"):
    """Load a Python config file and return its public attributes as a dict.

    Returns an empty dict when ``filepath`` is falsy (e.g. ``None``) or the
    file does not exist, so callers may treat the config file as optional.
    """
    # Guard against a None/empty path: `os.path.exists(None)` raises
    # TypeError (genericpath only swallows OSError/ValueError), and
    # `get_config_value` forwards its path argument unchanged.
    if not filepath or not os.path.exists(filepath):
        return {}
    spec = importlib.util.spec_from_file_location("local_config", filepath)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return {k: getattr(module, k) for k in dir(module) if not k.startswith("_")}


def get_config_value(key, config_py_path=None, default=None):
    """Resolve ``key`` from globals, a config file, the environment, or Colab.

    Sources are checked in order; the first hit wins. ``default`` is returned
    when no source provides the key.
    """
    # 1. Check globals
    if key in globals():
        return globals()[key]

    # 2. Check config.py — honor load_config_py's own default path when the
    # caller did not supply one (passing None through would have overridden
    # the default and crashed on os.path.exists(None)).
    config = load_config_py(config_py_path) if config_py_path else load_config_py()
    if key in config:
        return config[key]

    # 3. Check environment variables
    if key in os.environ:
        return os.environ[key]

    # 4. Check Google Colab userdata (import only succeeds inside Colab).
    try:
        from google.colab import userdata

        secret = userdata.get(key)
        if secret is not None:
            return secret
    except ImportError:
        pass  # Not running in Google Colab

    # 5. Return default if provided
    return default
@@ -0,0 +1,3 @@
1
+ from .application_container import ApplicationContainer
2
+
3
+ __all__ = ['ApplicationContainer']
@@ -0,0 +1,234 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from os import cpu_count
4
+ from typing import Optional, Union
5
+
6
+ import gspread
7
+ from google.auth.credentials import Credentials
8
+ from gspread import Client
9
+ from wordlift_client import Configuration, AccountInfo
10
+
11
+ from ..client.client_configuration_factory import ClientConfigurationFactory
12
+ from ..configuration import ConfigurationProvider
13
+ from ..graphql.client import GraphQlClientFactory, GraphQlClient, GqlClientProvider
14
+ from ..id_generator import IdGenerator
15
+ from ..protocol import Context
16
+ from ..protocol.entity_patch import EntityPatchQueue
17
+ from ..protocol.graph import GraphQueue
18
+ from ..url_source import (
19
+ SitemapUrlSource,
20
+ GoogleSheetsUrlSource,
21
+ ListUrlSource,
22
+ UrlSource,
23
+ )
24
+ from ..url_source.new_or_changed_url_source import NewOrChangedUrlSource
25
+ from ..utils import get_me
26
+ from ..workflow.kg_import_workflow import KgImportWorkflow
27
+ from ..workflow.url_handler import WebPageImportUrlHandler
28
+ from ..workflow.url_handler.default_url_handler import DefaultUrlHandler
29
+ from ..workflow.url_handler.search_console_url_handler import SearchConsoleUrlHandler
30
+ from ..workflow.url_handler.url_handler import UrlHandler
31
+
32
+
33
+ @dataclass
34
+ class UrlSourceInput:
35
+ """
36
+ Input structure for the UrlProviderFactory.
37
+
38
+ This class holds all possible parameters needed to create any of the supported URL providers.
39
+ The factory will use these parameters to determine which provider to create based on availability.
40
+ """
41
+
42
+ sitemap_url: Optional[str] = None
43
+ sitemap_url_pattern: Optional[str] = None
44
+ sheets_url: Optional[str] = None
45
+ sheets_name: Optional[str] = None
46
+ sheets_creds_or_client: Optional[Union[Credentials, Client]] = None
47
+ urls: Optional[list[str]] = None
48
+
49
+
50
class ApplicationContainer:
    """Central factory wiring together the SDK's clients, queues, and workflows.

    Reads settings through a `ConfigurationProvider` and lazily builds shared
    components (the `Context` and the GraphQL client) on first use. All
    `create_*` coroutines return fresh objects; `get_*` coroutines cache.
    """

    _api_url: str
    _client_configuration: Configuration
    _configuration_provider: ConfigurationProvider
    _key: str

    # Lazily-created singletons, populated on first access.
    _context: Context | None = None
    _graphql_client: GraphQlClient | None = None

    def __init__(self, configuration_provider: ConfigurationProvider | None = None):
        """Resolve the API URL and key, then build the client configuration.

        :param configuration_provider: optional provider; when omitted the
            default (`config/default.py` + env + Colab) provider is created.
        """
        self._configuration_provider = (
            configuration_provider or ConfigurationProvider.create()
        )
        self._api_url = self._configuration_provider.get_value(
            "API_URL", "https://api.wordlift.io"
        )
        self._key = self._configuration_provider.get_value("WORDLIFT_KEY")
        self._client_configuration = ClientConfigurationFactory(
            key=self._key,
            api_url=self._api_url,
        ).create()

    async def get_account(self) -> AccountInfo:
        """Fetch the account bound to the configured WordLift key."""
        return await get_me(configuration=self._client_configuration)

    async def get_context(self) -> Context:
        """Return the shared `Context`, creating and caching it on first call."""
        if not self._context:
            account = await self.get_account()
            self._context = Context(
                account=account,
                client_configuration=self._client_configuration,
                configuration_provider=self._configuration_provider,
                id_generator=IdGenerator(account=account),
                graph_queue=GraphQueue(client_configuration=self._client_configuration),
                entity_patch_queue=EntityPatchQueue(
                    client_configuration=self._client_configuration
                ),
            )

        return self._context

    async def create_web_page_import_url_handler(self) -> WebPageImportUrlHandler:
        """Build the handler that imports web pages into WordLift.

        Embedding properties, output types, and the write strategy are all
        configurable with sensible defaults.
        """
        write_strategy = self._configuration_provider.get_value(
            "WEB_PAGE_IMPORT_WRITE_STRATEGY", "createOrUpdateModel"
        )
        return WebPageImportUrlHandler(
            context=await self.get_context(),
            embedding_properties=self._configuration_provider.get_value(
                "EMBEDDING_PROPERTIES",
                [
                    "http://schema.org/headline",
                    "http://schema.org/abstract",
                    "http://schema.org/text",
                ],
            ),
            web_page_types=self._configuration_provider.get_value(
                "WEB_PAGE_TYPES", ["http://schema.org/Article"]
            ),
            write_strategy=write_strategy,
        )

    async def create_search_console_url_handler(self):
        """Build the handler that refreshes Search Console analytics data."""
        return SearchConsoleUrlHandler(
            context=await self.get_context(),
            graphql_client=await self.get_graphql_client(),
        )

    async def create_multi_url_handler(self):
        """Compose the configured handlers into one `DefaultUrlHandler`.

        The Search Console handler is appended only when the
        `GOOGLE_SEARCH_CONSOLE` setting is exactly `True` (the default).
        """
        handlers: list[UrlHandler] = [
            await self.create_web_page_import_url_handler(),
        ]
        if (
            self._configuration_provider.get_value("GOOGLE_SEARCH_CONSOLE", True)
            is True
        ):
            handlers.append(await self.create_search_console_url_handler())

        return DefaultUrlHandler(url_handler_list=handlers)

    async def create_kg_import_workflow(self) -> KgImportWorkflow:
        """Assemble the full KG import workflow from the configured parts."""
        # Bug fix: `os.cpu_count()` may return None when the CPU count cannot
        # be determined, which would make `min()` raise TypeError; fall back
        # to 1 in that case.
        concurrency = self._configuration_provider.get_value(
            "CONCURRENCY", min(cpu_count() or 1, 4)
        )
        return KgImportWorkflow(
            context=await self.get_context(),
            url_source=await self.create_new_or_changed_source(),
            url_handler=await self.create_multi_url_handler(),
            concurrency=concurrency,
        )

    async def create_graphql_client_factory(self) -> GraphQlClientFactory:
        """Build a GraphQL client factory targeting `<API_URL>/graphql`."""
        return GraphQlClientFactory(key=self._key, api_url=self._api_url + "/graphql")

    async def create_gql_client_provider(self) -> GqlClientProvider:
        """Build a provider of low-level gql clients."""
        graphql_client_factory = await self.create_graphql_client_factory()
        return graphql_client_factory.create_provider()

    async def get_graphql_client(self) -> GraphQlClient:
        """Return the shared GraphQL client, creating and caching it lazily."""
        if self._graphql_client is None:
            graphql_client_factory = await self.create_graphql_client_factory()
            self._graphql_client = graphql_client_factory.create()

        return self._graphql_client

    async def create_url_source(self) -> UrlSource:
        """Build the URL source selected by configuration.

        Priority: sitemap, then Google Sheets, then an explicit URL list.

        :raises ValueError: when no source is sufficiently configured.
        """
        # Try to read the configuration from the `config/default.py` file.
        sitemap_url = self._configuration_provider.get_value("SITEMAP_URL")
        sitemap_url_pattern = self._configuration_provider.get_value(
            "SITEMAP_URL_PATTERN", None
        )
        sheets_url = self._configuration_provider.get_value("SHEETS_URL")
        sheets_name = self._configuration_provider.get_value("SHEETS_NAME")
        sheets_service_account = self._configuration_provider.get_value(
            "SHEETS_SERVICE_ACCOUNT"
        )
        urls = self._configuration_provider.get_value("URLS")

        # Fail fast when none of the three sources is fully configured.
        if (
            sitemap_url is None
            and urls is None
            and (
                sheets_url is None
                or sheets_name is None
                or sheets_service_account is None
            )
        ):
            raise ValueError(
                "One of `sitemap_url` or `sheets_url`/`sheets_name`/`sheets_service_account` is required."
            )

        input_params = UrlSourceInput(
            sitemap_url=sitemap_url,
            sitemap_url_pattern=sitemap_url_pattern,
            sheets_url=sheets_url,
            sheets_name=sheets_name,
            sheets_creds_or_client=(
                gspread.service_account(filename=sheets_service_account)
                if sheets_service_account
                else None
            ),
            urls=urls,
        )

        # Try to create a SitemapUrlProvider if sitemap_url is provided
        if input_params.sitemap_url:
            return SitemapUrlSource(
                input_params.sitemap_url,
                re.compile(input_params.sitemap_url_pattern)
                if input_params.sitemap_url_pattern
                else None,
            )

        # Try to create a GoogleSheetsUrlProvider if all required sheets parameters are provided
        if (
            input_params.sheets_url
            and input_params.sheets_name
            and input_params.sheets_creds_or_client
        ):
            return GoogleSheetsUrlSource(
                input_params.sheets_creds_or_client,
                input_params.sheets_url,
                input_params.sheets_name,
            )

        # Try to create a ListUrlProvider if urls is provided
        if input_params.urls:
            return ListUrlSource(input_params.urls)

        # If we get here, none of the required parameters were provided
        raise ValueError(
            "No valid parameters provided to create a URL provider. "
            "Please provide either sitemap_url, all sheets parameters "
            "(sheets_url, sheets_name, sheets_creds_or_client), or urls."
        )

    async def create_new_or_changed_source(self) -> UrlSource:
        """Wrap the configured source so already-imported URLs are skipped.

        When `OVERWRITE` is truthy the wrapper re-imports everything.
        """
        overwrite = self._configuration_provider.get_value("OVERWRITE", False)
        return NewOrChangedUrlSource(
            url_provider=await self.create_url_source(),
            graphql_client=await self.get_graphql_client(),
            overwrite=overwrite,
        )

    async def create_url_source_with_overwrite(self) -> UrlSource:
        """Backward-compatible alias for `create_new_or_changed_source`."""
        return await self.create_new_or_changed_source()
@@ -0,0 +1,5 @@
1
+ from .create_entities_with_top_query_dataframe import (
2
+ create_entities_with_top_query_dataframe,
3
+ )
4
+
5
+ __all__ = ["create_entities_with_top_query_dataframe"]
@@ -0,0 +1,30 @@
1
+ import logging
2
+
3
+ import pandas as pd
4
+ from pandas import DataFrame
5
+ from tqdm.asyncio import tqdm
6
+
7
+ from ..graphql.utils.query import entity_with_top_query_factory
8
+ from ..utils import create_delayed
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
async def create_entities_with_top_query_dataframe(
    key: str, url_list: list[str]
) -> DataFrame:
    """Build one DataFrame of entities with their top query for `url_list`.

    Lookups run concurrently (throttled to 4 at a time via `create_delayed`)
    through the GraphQL entity-with-top-query helper; URLs that yield no
    entity are skipped.

    :param key: WordLift API key used to create the GraphQL query function.
    :param url_list: URLs to look up.
    :return: a single concatenated DataFrame with a fresh index; empty when
        no URL produced data.
    """
    # Get the entities data with the top query.
    logger.info("Loading entities with top query...")
    entity_with_top_query = await entity_with_top_query_factory(key)
    # Throttle to at most 4 concurrent GraphQL requests.
    delayed = create_delayed(entity_with_top_query, 4)
    entities_with_top_query = await tqdm.gather(
        *[delayed(url) for url in url_list], total=len(url_list)
    )

    # Get a list of dataframes, skipping URLs that produced no result.
    dataframes = [
        obj.to_dataframe() for obj in entities_with_top_query if obj is not None
    ]

    # Bug fix: `pd.concat` raises "ValueError: No objects to concatenate"
    # on an empty sequence — return an empty DataFrame instead when the
    # URL list is empty or every lookup came back None.
    if not dataframes:
        return DataFrame()

    # Concat them together, with a new index.
    return pd.concat(dataframes, ignore_index=True)
@@ -0,0 +1,4 @@
1
+ from .enrich import enrich, ParseHtmlCallback
2
+ from .patch import patch
3
+
4
+ __all__ = ['enrich', 'ParseHtmlCallback', 'patch']