datajunction-reflection 0.0.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datajunction-reflection might be problematic. Click here for more details.
- datajunction_reflection-0.0.1a1.dist-info/METADATA +35 -0
- datajunction_reflection-0.0.1a1.dist-info/RECORD +11 -0
- datajunction_reflection-0.0.1a1.dist-info/WHEEL +4 -0
- datajunction_reflection-0.0.1a1.dist-info/licenses/LICENSE +21 -0
- djrs/__about__.py +4 -0
- djrs/__init__.py +0 -0
- djrs/config.py +26 -0
- djrs/worker/__init__.py +0 -0
- djrs/worker/app.py +7 -0
- djrs/worker/tasks.py +133 -0
- djrs/worker/utils.py +49 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datajunction-reflection
|
|
3
|
+
Version: 0.0.1a1
|
|
4
|
+
Summary: OSS Implementation of a DataJunction Reflection Service
|
|
5
|
+
Project-URL: repository, https://github.com/DataJunction/dj
|
|
6
|
+
Author-email: DataJunction Authors <roberto@dealmeida.net>
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Python: <4.0,>=3.8
|
|
17
|
+
Requires-Dist: celery[redis]>=5.2.3
|
|
18
|
+
Requires-Dist: djopenapi
|
|
19
|
+
Requires-Dist: importlib-metadata
|
|
20
|
+
Requires-Dist: pydantic
|
|
21
|
+
Requires-Dist: python-dotenv==0.19.2
|
|
22
|
+
Requires-Dist: requests>=2.26.0
|
|
23
|
+
Provides-Extra: uvicorn
|
|
24
|
+
Requires-Dist: uvicorn[standard]>=0.21.1; extra == 'uvicorn'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# DJ Reflection Service
|
|
28
|
+
|
|
29
|
+
The reflection service polls the DJ core service for all nodes with associated tables, whether source
|
|
30
|
+
tables or materialized tables. For each node, it refreshes the node's schema based on the associated
|
|
31
|
+
table's schema that it retrieves from the query service. It also retrieves the available partitions and
|
|
32
|
+
the valid through timestamp of these tables and reflects them accordingly to DJ core.
|
|
33
|
+
|
|
34
|
+
This service uses a celery beat scheduler, with a configurable polling interval that defaults to once per
|
|
35
|
+
hour and async tasks for each node's reflection.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
djrs/__about__.py,sha256=mlybO8bzNO9GojXJH7D8hqtzoQxcupTM5lXnSjq6_f8,50
|
|
2
|
+
djrs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
djrs/config.py,sha256=ImLCyAULcoNFmKSmNXuGlP6tw4J6AHKI4ultb6qGiho,613
|
|
4
|
+
djrs/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
djrs/worker/app.py,sha256=XdaZshykwg2CypN10XefPhBPaYSRkzdXeB1cRzQpjDk,174
|
|
6
|
+
djrs/worker/tasks.py,sha256=MyKita0jeJpL1h6lNJYOQzUZzht1azw4n72ZGSKmgsg,3871
|
|
7
|
+
djrs/worker/utils.py,sha256=VEfVEPmrGc86j2GY7Bal5NTQQSAfiuszf90xf-vDLZ8,1276
|
|
8
|
+
datajunction_reflection-0.0.1a1.dist-info/METADATA,sha256=nC-K4XdFR29AWRK9ywsUCI7LKdo6h6nq4ZUsPD7-CgU,1555
|
|
9
|
+
datajunction_reflection-0.0.1a1.dist-info/WHEEL,sha256=KGYbc1zXlYddvwxnNty23BeaKzh7YuoSIvIMO4jEhvw,87
|
|
10
|
+
datajunction_reflection-0.0.1a1.dist-info/licenses/LICENSE,sha256=Q5EUOoPvfa4k1Az3B6y-bddOppd_h7sa30WgFDZOxsU,1059
|
|
11
|
+
datajunction_reflection-0.0.1a1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 DJ
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
djrs/__about__.py
ADDED
djrs/__init__.py
ADDED
|
File without changes
|
djrs/config.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Reflection service settings."""
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseSettings
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Settings(BaseSettings):
    """
    Default settings for the reflection service.

    Each attribute below is a typed field with a built-in default value.
    """

    # Base URL of the DJ core service (used as the host of the DJ API client).
    core_service: str = "http://dj:8000"
    # Base URL of the query service that table columns are requested from.
    query_service: str = "http://djqs:8001"
    # Default Celery broker URL.
    celery_broker: str = "redis://djrs-redis:6379/1"
    # Default Celery result backend URL.
    celery_results_backend: str = "redis://djrs-redis:6379/2"

    # Number of seconds to wait between two polling runs (3600 = one hour).
    polling_interval: int = 3600
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@lru_cache
def get_settings() -> Settings:
    """
    Build the settings object on first use and return the cached
    instance on every later call.
    """
    settings = Settings()
    return settings
|
djrs/worker/__init__.py
ADDED
|
File without changes
|
djrs/worker/app.py
ADDED
djrs/worker/tasks.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Reflection service celery tasks."""
|
|
2
|
+
import datetime
|
|
3
|
+
import json
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from typing import Dict
|
|
6
|
+
|
|
7
|
+
import celery
|
|
8
|
+
import requests
|
|
9
|
+
from celery import shared_task
|
|
10
|
+
from celery.utils.log import get_task_logger
|
|
11
|
+
from djopenapi.model.availability_state_base import AvailabilityStateBase
|
|
12
|
+
from djopenapi.model.update_node import UpdateNode
|
|
13
|
+
|
|
14
|
+
from djrs.worker.app import celery_app
|
|
15
|
+
from djrs.worker.utils import get_dj_client, get_settings
|
|
16
|
+
|
|
17
|
+
logger = get_task_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ReflectionServiceTask(celery.Task, ABC):
    """
    Base reflection service task.

    Used as the ``base`` of the reflection tasks so that every failure is
    written to the task logger.
    """

    # NOTE(review): presumably keeps this base class itself from being
    # registered as a runnable task -- confirm against the Celery docs.
    abstract = True

    def on_failure(self, exc, task_id, *args, **kwargs):
        """
        Log a failed task at ERROR level together with its traceback.

        Args:
            exc: The exception raised by the task.
            task_id: Identifier of the failed task.
            *args: Remaining positional arguments supplied by the caller (unused).
            **kwargs: Remaining keyword arguments supplied by the caller (unused).
        """
        # Pass the exception explicitly instead of relying on an exception
        # being "currently handled" (which is what ``logger.exception`` does),
        # so the traceback is logged no matter where this hook is called from.
        logger.error("%s failed: %s", task_id, exc, exc_info=exc)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@shared_task(queue="celery", name="djrs.worker.app.refresh", base=ReflectionServiceTask)
def refresh():
    """
    Find available DJ nodes and kick off reflection tasks for
    nodes with associated tables.

    A node is reflected only when it has a catalog, a schema and a table.
    Nodes whose catalog has no usable engine are skipped with a warning,
    because the reflection task needs the engine's name and version.
    """
    dj_api = get_dj_client()

    # Map every catalog name to the first engine listed for it (or None).
    catalogs_to_engines = {
        catalog["name"]: catalog["engines"][0] if catalog["engines"] else None
        for catalog in json.loads(
            dj_api.list_catalogs_catalogs_get(skip_deserialization=True).response.data,
        )
    }

    # Keyed by node name, so a repeated name is only dispatched once.
    all_nodes = {
        node["name"]: node
        for node in json.loads(
            dj_api.read_nodes_nodes_get(skip_deserialization=True).response.data,
        )
    }

    for node_name, node in all_nodes.items():
        if not (node["catalog"] and node["schema_"] and node["table"]):
            continue

        catalog_name = node["catalog"]["name"]
        # ``.get`` keeps one unknown catalog from aborting the whole refresh.
        engine = catalogs_to_engines.get(catalog_name)
        if not engine:
            # Dispatching without an engine would only fail inside ``reflect``.
            logger.warning(
                "Skipping reflection of node `%s`: catalog `%s` has no engine",
                node_name,
                catalog_name,
            )
            continue

        celery_app.send_task(
            "djrs.worker.tasks.reflect",
            (
                node_name,
                catalog_name,
                node["schema_"],
                node["table"],
                engine,
            ),
        )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@shared_task(
    queue="celery",
    name="djrs.worker.tasks.reflect",
    base=ReflectionServiceTask,
)
def reflect(
    node_name: str,
    catalog: str,
    schema: str,
    table: str,
    engine: Dict[str, str],
):
    """
    This reflects the state of the node's associated table, whether
    external or materialized, back to the DJ core service.

    Args:
        node_name: Name of the DJ node to update.
        catalog: Catalog of the node's table.
        schema: Schema of the node's table.
        table: Name of the node's table.
        engine: Mapping with the ``name`` and ``version`` of the engine
            used when asking the query service for the table's columns.

    Raises:
        requests.HTTPError: If the query service answers the columns
            request with an error status.
    """
    logger.info("Reflecting node=%s, table=%s to DJ core", node_name, table)
    settings = get_settings()
    dj_api = get_dj_client()

    # Update table columns
    response = requests.get(
        f"{settings.query_service}/table/{catalog}.{schema}.{table}/columns/",
        params={
            "engine": engine["name"],
            "engine_version": engine["version"],
        },
        timeout=30,
    )
    # Fail with the HTTP error itself rather than with a confusing
    # KeyError/JSON error while parsing an error payload.
    response.raise_for_status()
    table_columns = response.json()["columns"]
    update_columns_response = dj_api.update_node_nodes_name_patch(
        body=UpdateNode(
            columns={col["name"]: {"type": col["type"]} for col in table_columns},
        ),
        path_params={"name": node_name},
        skip_deserialization=True,
    ).response

    logger.info(
        "Update node columns for `%s` response: %s",
        node_name,
        update_columns_response.reason,
    )

    # pylint: disable=fixme
    # TODO: Post actual availability state when information available
    # Until then the partitions are posted empty and the current time is
    # used as the valid-through timestamp.
    availability_state_response = (
        dj_api.add_availability_data_node_name_availability_post(
            body=AvailabilityStateBase(
                catalog=catalog,
                max_partition=[],
                min_partition=[],
                table=table,
                valid_through_ts=int(datetime.datetime.now().timestamp()),
                schema_=schema,
            ),
            path_params={"node_name": node_name},
            skip_deserialization=True,
        ).response
    )

    logger.info(
        "Post availability state for `%s` response: %s",
        node_name,
        availability_state_response.reason,
    )
|
djrs/worker/utils.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Utility functions for retrieving API clients."""
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import djopenapi
|
|
6
|
+
from celery import Celery
|
|
7
|
+
from djopenapi.apis.tags import default_api
|
|
8
|
+
|
|
9
|
+
from djrs.config import get_settings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_dj_client(
    core_service: Optional[str] = None,
) -> default_api.DefaultApi:
    """
    Build a DJ API client.

    Args:
        core_service: Host of the DJ core service. When omitted (or empty),
            the ``core_service`` value from the settings is used.

    Returns:
        A ``DefaultApi`` wrapping an ``ApiClient`` configured for that host.
    """
    settings = get_settings()
    host = core_service if core_service else settings.core_service
    client = djopenapi.ApiClient(djopenapi.Configuration(host=host))
    return default_api.DefaultApi(client)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_celery() -> Celery:
    """
    Create and configure the core celery app.

    The broker and result backend come from the ``CELERY_BROKER_URL`` and
    ``CELERY_RESULT_BACKEND`` environment variables when they are set, and
    from the settings otherwise. A ``refresh`` beat entry is scheduled with
    the configured polling interval.
    """
    settings = get_settings()

    app = Celery(__name__, include=["djrs.worker.app", "djrs.worker.tasks"])
    conf = app.conf

    # Environment variables take precedence over the settings defaults.
    broker = os.environ.get("CELERY_BROKER_URL", settings.celery_broker)
    conf.broker_url = broker
    backend = os.environ.get(
        "CELERY_RESULT_BACKEND",
        settings.celery_results_backend,
    )
    conf.result_backend = backend

    conf.imports = ["djrs.worker.app", "djrs.worker.tasks"]

    # Periodic entry that triggers the refresh task.
    refresh_entry = {
        "task": "djrs.worker.app.refresh",
        "schedule": settings.polling_interval,
    }
    conf.beat_schedule = {"refresh": refresh_entry}
    return app
|