datajunction-reflection 0.0.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datajunction-reflection might be problematic; review the analysis details below before depending on it.

@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.1
2
+ Name: datajunction-reflection
3
+ Version: 0.0.1a1
4
+ Summary: OSS Implementation of a DataJunction Reflection Service
5
+ Project-URL: repository, https://github.com/DataJunction/dj
6
+ Author-email: DataJunction Authors <roberto@dealmeida.net>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Python: <4.0,>=3.8
17
+ Requires-Dist: celery[redis]>=5.2.3
18
+ Requires-Dist: djopenapi
19
+ Requires-Dist: importlib-metadata
20
+ Requires-Dist: pydantic
21
+ Requires-Dist: python-dotenv==0.19.2
22
+ Requires-Dist: requests>=2.26.0
23
+ Provides-Extra: uvicorn
24
+ Requires-Dist: uvicorn[standard]>=0.21.1; extra == 'uvicorn'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # DJ Reflection Service
28
+
29
+ The reflection service polls the DJ core service for all nodes with associated tables, whether source
30
+ tables or materialized tables. For each node, it refreshes the node's schema based on the associated
31
+ table's schema that it retrieves from the query service. It also retrieves the available partitions and
32
+ the valid through timestamp of these tables and reflects them accordingly to DJ core.
33
+
34
+ This service uses a celery beat scheduler, with a configurable polling interval that defaults to once per
35
+ hour and async tasks for each node's reflection.
@@ -0,0 +1,11 @@
1
+ djrs/__about__.py,sha256=mlybO8bzNO9GojXJH7D8hqtzoQxcupTM5lXnSjq6_f8,50
2
+ djrs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ djrs/config.py,sha256=ImLCyAULcoNFmKSmNXuGlP6tw4J6AHKI4ultb6qGiho,613
4
+ djrs/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ djrs/worker/app.py,sha256=XdaZshykwg2CypN10XefPhBPaYSRkzdXeB1cRzQpjDk,174
6
+ djrs/worker/tasks.py,sha256=MyKita0jeJpL1h6lNJYOQzUZzht1azw4n72ZGSKmgsg,3871
7
+ djrs/worker/utils.py,sha256=VEfVEPmrGc86j2GY7Bal5NTQQSAfiuszf90xf-vDLZ8,1276
8
+ datajunction_reflection-0.0.1a1.dist-info/METADATA,sha256=nC-K4XdFR29AWRK9ywsUCI7LKdo6h6nq4ZUsPD7-CgU,1555
9
+ datajunction_reflection-0.0.1a1.dist-info/WHEEL,sha256=KGYbc1zXlYddvwxnNty23BeaKzh7YuoSIvIMO4jEhvw,87
10
+ datajunction_reflection-0.0.1a1.dist-info/licenses/LICENSE,sha256=Q5EUOoPvfa4k1Az3B6y-bddOppd_h7sa30WgFDZOxsU,1059
11
+ datajunction_reflection-0.0.1a1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.17.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 DJ
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
djrs/__about__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ Version for Hatch
3
+ """
4
+ __version__ = "0.0.1a1"
djrs/__init__.py ADDED
File without changes
djrs/config.py ADDED
@@ -0,0 +1,26 @@
1
+ """Reflection service settings."""
2
+ from functools import lru_cache
3
+
4
+ from pydantic import BaseSettings
5
+
6
+
7
class Settings(BaseSettings):
    """
    Default settings for the reflection service.

    Built on pydantic ``BaseSettings``, so each field can be overridden by an
    environment variable of the same (upper-cased) name.
    """

    # Base URL of the DJ core service that nodes are read from and
    # reflected back to.
    core_service: str = "http://dj:8000"
    # Base URL of the query service used to fetch table schemas.
    query_service: str = "http://djqs:8001"
    # Redis URLs for the Celery message broker and result backend.
    celery_broker: str = "redis://djrs-redis:6379/1"
    celery_results_backend: str = "redis://djrs-redis:6379/2"

    # Set the number of seconds to wait in between polling
    polling_interval: int = 3600
19
+
20
+
21
@lru_cache(maxsize=None)
def get_settings() -> Settings:
    """Build the ``Settings`` object on first use and memoize it for later callers."""
    return Settings()
File without changes
djrs/worker/app.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Celery app that does the polling of nodes in DJ and then subsequent
3
+ queueing of reflection tasks.
4
+ """
5
+ from djrs.worker.utils import get_celery
6
+
7
+ celery_app = get_celery()
djrs/worker/tasks.py ADDED
@@ -0,0 +1,133 @@
1
+ """Reflection service celery tasks."""
2
+ import datetime
3
+ import json
4
+ from abc import ABC
5
+ from typing import Dict
6
+
7
+ import celery
8
+ import requests
9
+ from celery import shared_task
10
+ from celery.utils.log import get_task_logger
11
+ from djopenapi.model.availability_state_base import AvailabilityStateBase
12
+ from djopenapi.model.update_node import UpdateNode
13
+
14
+ from djrs.worker.app import celery_app
15
+ from djrs.worker.utils import get_dj_client, get_settings
16
+
17
+ logger = get_task_logger(__name__)
18
+
19
+
20
class ReflectionServiceTask(celery.Task, ABC):
    """
    Base reflection service task.

    Shared base for the tasks in this module; its only addition over
    ``celery.Task`` is logging task failures to the worker log.
    """

    # Celery marker: this base class itself is never registered as a task.
    abstract = True

    def on_failure(self, exc, task_id, *args, **kwargs):
        # Celery failure hook — log the exception with traceback so failures
        # show up in the worker log, not only in the result backend.
        logger.exception("%s failed: %s", task_id, exc)
29
+
30
+
31
+ @shared_task(queue="celery", name="djrs.worker.app.refresh", base=ReflectionServiceTask)
32
+ def refresh():
33
+ """
34
+ Find available DJ nodes and kick off reflection tasks for
35
+ nodes with associated tables.
36
+ """
37
+ dj_api = get_dj_client()
38
+
39
+ catalogs_to_engines = {
40
+ catalog["name"]: catalog["engines"][0] if catalog["engines"] else None
41
+ for catalog in json.loads(
42
+ dj_api.list_catalogs_catalogs_get(skip_deserialization=True).response.data,
43
+ )
44
+ }
45
+
46
+ all_nodes = {
47
+ node["name"]: node
48
+ for node in json.loads(
49
+ dj_api.read_nodes_nodes_get(skip_deserialization=True).response.data,
50
+ )
51
+ }
52
+
53
+ tasks = []
54
+ for node_name, node in all_nodes.items():
55
+ if node["catalog"] and node["schema_"] and node["table"]:
56
+ task = celery_app.send_task(
57
+ "djrs.worker.tasks.reflect",
58
+ (
59
+ node_name,
60
+ node["catalog"]["name"],
61
+ node["schema_"],
62
+ node["table"],
63
+ catalogs_to_engines[node["catalog"]["name"]],
64
+ ),
65
+ )
66
+ tasks.append(task)
67
+
68
+
69
@shared_task(
    queue="celery",
    name="djrs.worker.tasks.reflect",
    base=ReflectionServiceTask,
)
def reflect(
    node_name: str,
    catalog: str,
    schema: str,
    table: str,
    engine: Dict[str, str],
):
    """
    This reflects the state of the node's associated table, whether
    external or materialized, back to the DJ core service.

    :param node_name: name of the DJ node being reflected
    :param catalog: catalog the node's table lives in
    :param schema: schema of the node's table
    :param table: name of the node's table
    :param engine: engine descriptor with ``name`` and ``version`` keys,
        or ``None``/empty when the catalog has no engines
    """
    logger.info("Reflecting node=%s, table=%s to DJ core", node_name, table)
    settings = get_settings()
    dj_api = get_dj_client()

    # Ask the query service for the table's current columns. Only pass the
    # engine parameters when an engine was actually supplied — previously a
    # missing engine crashed here with a TypeError.
    params = {}
    if engine:
        params = {
            "engine": engine["name"],
            "engine_version": engine["version"],
        }
    response = requests.get(
        f"{settings.query_service}/table/{catalog}.{schema}.{table}/columns/",
        params=params,
        timeout=30,
    )
    # Fail loudly on HTTP errors (handled by on_failure) rather than raising
    # an opaque KeyError while indexing an error payload below.
    response.raise_for_status()
    table_columns = response.json()["columns"]

    # Push the refreshed column schema to DJ core.
    update_columns_response = dj_api.update_node_nodes_name_patch(
        body=UpdateNode(
            columns={col["name"]: {"type": col["type"]} for col in table_columns},
        ),
        path_params={"name": node_name},
        skip_deserialization=True,
    ).response

    logger.info(
        "Update node columns for `%s` response: %s",
        node_name,
        update_columns_response.reason,
    )

    # pylint: disable=fixme
    # TODO: Post actual availability state when information available
    availability_state_response = (
        dj_api.add_availability_data_node_name_availability_post(
            body=AvailabilityStateBase(
                catalog=catalog,
                max_partition=[],
                min_partition=[],
                table=table,
                # Use a timezone-aware UTC timestamp so "valid through" does
                # not depend on the worker host's local timezone settings.
                valid_through_ts=int(
                    datetime.datetime.now(tz=datetime.timezone.utc).timestamp(),
                ),
                schema_=schema,
            ),
            path_params={"node_name": node_name},
            skip_deserialization=True,
        ).response
    )

    logger.info(
        "Post availability state for `%s` response: %s",
        node_name,
        availability_state_response.reason,
    )
djrs/worker/utils.py ADDED
@@ -0,0 +1,49 @@
1
+ """Utility functions for retrieving API clients."""
2
+ import os
3
+ from typing import Optional
4
+
5
+ import djopenapi
6
+ from celery import Celery
7
+ from djopenapi.apis.tags import default_api
8
+
9
+ from djrs.config import get_settings
10
+
11
+
12
def get_dj_client(
    core_service: Optional[str] = None,
) -> default_api.DefaultApi:
    """
    Construct a DJ core API client.

    When ``core_service`` is not given, the host configured in the
    service settings is used instead.
    """
    host = core_service if core_service else get_settings().core_service
    configuration = djopenapi.Configuration(host=host)
    return default_api.DefaultApi(djopenapi.ApiClient(configuration))
24
+
25
+
26
def get_celery() -> Celery:
    """
    Build the core Celery application, configured from the service settings
    with environment-variable overrides for the broker and result backend,
    and a beat schedule that triggers `refresh` every polling interval.
    """
    settings = get_settings()
    task_modules = ["djrs.worker.app", "djrs.worker.tasks"]

    app = Celery(__name__, include=task_modules)
    app.conf.update(
        broker_url=os.environ.get("CELERY_BROKER_URL", settings.celery_broker),
        result_backend=os.environ.get(
            "CELERY_RESULT_BACKEND",
            settings.celery_results_backend,
        ),
        imports=task_modules,
        beat_schedule={
            "refresh": {
                "task": "djrs.worker.app.refresh",
                "schedule": settings.polling_interval,
            },
        },
    )
    return app