dagster-sling 0.25.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-sling might be problematic. Click here for more details.
- dagster_sling/__init__.py +18 -0
- dagster_sling/asset_decorator.py +147 -0
- dagster_sling/asset_defs.py +0 -0
- dagster_sling/dagster_sling_translator.py +297 -0
- dagster_sling/py.typed +1 -0
- dagster_sling/resources.py +437 -0
- dagster_sling/sling_event_iterator.py +238 -0
- dagster_sling/sling_replication.py +33 -0
- dagster_sling/version.py +1 -0
- dagster_sling-0.25.9.dist-info/LICENSE +201 -0
- dagster_sling-0.25.9.dist-info/METADATA +21 -0
- dagster_sling-0.25.9.dist-info/RECORD +14 -0
- dagster_sling-0.25.9.dist-info/WHEEL +5 -0
- dagster_sling-0.25.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import tempfile
|
|
8
|
+
import time
|
|
9
|
+
import uuid
|
|
10
|
+
from collections.abc import Generator, Iterator, Sequence
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from subprocess import PIPE, STDOUT, Popen
|
|
13
|
+
from typing import IO, Any, AnyStr, Optional, Union
|
|
14
|
+
|
|
15
|
+
import sling
|
|
16
|
+
from dagster import (
|
|
17
|
+
AssetExecutionContext,
|
|
18
|
+
AssetMaterialization,
|
|
19
|
+
ConfigurableResource,
|
|
20
|
+
EnvVar,
|
|
21
|
+
MaterializeResult,
|
|
22
|
+
OpExecutionContext,
|
|
23
|
+
PermissiveConfig,
|
|
24
|
+
get_dagster_logger,
|
|
25
|
+
)
|
|
26
|
+
from dagster._annotations import public
|
|
27
|
+
from dagster._core.definitions.metadata import TableMetadataSet
|
|
28
|
+
from dagster._utils.env import environ
|
|
29
|
+
from pydantic import Field
|
|
30
|
+
|
|
31
|
+
from dagster_sling.asset_decorator import (
|
|
32
|
+
METADATA_KEY_REPLICATION_CONFIG,
|
|
33
|
+
METADATA_KEY_TRANSLATOR,
|
|
34
|
+
get_streams_from_replication,
|
|
35
|
+
streams_with_default_dagster_meta,
|
|
36
|
+
)
|
|
37
|
+
from dagster_sling.dagster_sling_translator import DagsterSlingTranslator
|
|
38
|
+
from dagster_sling.sling_event_iterator import SlingEventIterator, SlingEventType
|
|
39
|
+
from dagster_sling.sling_replication import SlingReplicationParam, validate_replication
|
|
40
|
+
|
|
41
|
+
# Module-level logger obtained from Dagster; used by SlingResource for warnings and debug output.
logger = get_dagster_logger()

# Pattern used by SlingResource._clean_line to strip ANSI escape sequences from CLI output:
# ESC followed by either a single character in the ranges `@-Z` / `\-_`, or a
# `[`-introduced sequence of parameter bytes, intermediate bytes and one final byte.
ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@public
class SlingMode(str, Enum):
    """The mode to use when syncing.

    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/run/configuration#modes
    """

    # The class mixes in `str`, so every member compares equal to its literal value
    # (e.g. SlingMode.FULL_REFRESH == "full-refresh").
    INCREMENTAL = "incremental"
    TRUNCATE = "truncate"
    FULL_REFRESH = "full-refresh"
    SNAPSHOT = "snapshot"
    BACKFILL = "backfill"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@public
class SlingConnectionResource(PermissiveConfig):
    """A representation of a connection to a database or file to be used by Sling. This resource can be used as a source or a target for a Sling sync.

    Reference the Sling docs for more information on possible connection types and parameters: https://docs.slingdata.io/connections

    The name of the connection is passed to Sling and must match the name of the connection provided in the replication configuration: https://docs.slingdata.io/sling-cli/run/configuration/replication
    You may provide either a connection string or keyword arguments for the connection.

    Examples:
        Creating a Sling Connection for a file, such as CSV or JSON:

        .. code-block:: python

            source = SlingConnectionResource(name="MY_FILE", type="file")

        Create a Sling Connection for a Postgres or MySQL database, using a connection string:

        .. code-block:: python

            postgres_conn = SlingConnectionResource(name="MY_POSTGRES", type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))
            mysql_conn = SlingConnectionResource(name="MY_MYSQL", type="mysql", connection_string="mysql://user:password@host:port/schema")

        Create a Sling Connection for a Postgres or Snowflake database, using keyword arguments:

        .. code-block:: python

            postgres_conn = SlingConnectionResource(
                name="MY_OTHER_POSTGRES",
                type="postgres",
                host="host",
                user="hunter42",
                password=EnvVar("POSTGRES_PASSWORD")
            )

            snowflake_conn = SlingConnectionResource(
                name="MY_SNOWFLAKE",
                type="snowflake",
                host=EnvVar("SNOWFLAKE_HOST"),
                user=EnvVar("SNOWFLAKE_USER"),
                database=EnvVar("SNOWFLAKE_DATABASE"),
                password=EnvVar("SNOWFLAKE_PASSWORD"),
                role=EnvVar("SNOWFLAKE_ROLE")
            )
    """

    # Required: connection name; SlingResource.prepare_environment uses it as the
    # environment-variable name under which the connection is exported.
    name: str = Field(
        description="The name of the connection, must match the name in your Sling replication configuration."
    )
    # Required: Sling connection type (e.g. "postgres", "snowflake", "file").
    type: str = Field(
        description="Type of the source connection, must match the Sling connection types. Use 'file' for local storage."
    )
    # Optional: full connection string; SlingResource._clean_connection_dict renames a
    # non-empty value to the `url` key before the connection is serialized.
    connection_string: Optional[str] = Field(
        description="The optional connection string for the source database, if not using keyword arguments.",
        default=None,
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class SlingResource(ConfigurableResource):
    """Resource for interacting with the Sling package. This resource can be used to run Sling replications.

    Args:
        connections (List[SlingConnectionResource]): A list of connections to use for the replication.

    Examples:
        .. code-block:: python

            from dagster_sling.resources import SlingResource, SlingConnectionResource

            sling_resource = SlingResource(
                connections=[
                    SlingConnectionResource(
                        name="MY_POSTGRES",
                        type="postgres",
                        connection_string=EnvVar("POSTGRES_CONNECTION_STRING"),
                    ),
                    SlingConnectionResource(
                        name="MY_SNOWFLAKE",
                        type="snowflake",
                        host=EnvVar("SNOWFLAKE_HOST"),
                        user=EnvVar("SNOWFLAKE_USER"),
                        database=EnvVar("SNOWFLAKE_DATABASE"),
                        password=EnvVar("SNOWFLAKE_PASSWORD"),
                        role=EnvVar("SNOWFLAKE_ROLE"),
                    ),
                ]
            )
    """

    connections: list[SlingConnectionResource] = []
    # Cleaned CLI output lines collected by `_replicate`; replayed by `stream_raw_logs`.
    _stdout: list[str] = []

    @staticmethod
    def _get_replication_streams_for_context(
        context: Union[OpExecutionContext, AssetExecutionContext],
    ) -> dict[str, Any]:
        """Computes the sling replication streams config for a given execution context with an
        assets def, possibly involving a subset selection of sling assets.

        Returns:
            dict[str, Any]: Mapping of stream name to stream config. Empty when the context has no
            assets definition, or when a run config is present but carries no `context_streams`
            entry for the op.
        """
        if not context.has_assets_def:
            no_assets_def_message = """
            The current execution context has no backing AssetsDefinition object. Therefore, no
            sling assets subsetting will be performed...
            """
            # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
            logger.warning(no_assets_def_message)
            return {}

        context_streams = {}
        assets_def = context.assets_def
        run_config = context.run_config
        if run_config:  # triggered via sensor
            run_config_ops = run_config.get("ops", {})
            if isinstance(run_config_ops, dict):
                assets_op_config = run_config_ops.get(assets_def.op.name, {}).get("config", {})
            else:
                assets_op_config = {}
            context_streams = assets_op_config.get("context_streams", {})
            if not context_streams:
                no_context_streams_config_message = f"""
                It was expected that your `run_config` would provide a `context_streams` config for
                the op {assets_def.op.name}. Instead, the received value for this op config was
                {assets_op_config}.

                NO ASSET SUBSETTING WILL BE PERFORMED!

                If that was your intention, you can safely ignore this message. Otherwise, provide
                the mentioned `context_streams` config for executing only your desired asset subset.
                """
                logger.warning(no_context_streams_config_message)
        else:
            # No run config: derive the subset from the replication config and translator stored
            # on the metadata of the first asset of the assets definition.
            metadata_by_key = assets_def.metadata_by_key
            first_asset_metadata = next(iter(metadata_by_key.values()))
            replication_config: dict[str, Any] = first_asset_metadata.get(
                METADATA_KEY_REPLICATION_CONFIG, {}
            )
            dagster_sling_translator: DagsterSlingTranslator = first_asset_metadata.get(
                METADATA_KEY_TRANSLATOR, DagsterSlingTranslator()
            )
            raw_streams = get_streams_from_replication(replication_config)
            streams = streams_with_default_dagster_meta(raw_streams, replication_config)
            selected_asset_keys = context.selected_asset_keys
            for stream in streams:
                asset_key = dagster_sling_translator.get_asset_key(stream)
                if asset_key in selected_asset_keys:
                    context_streams[stream["name"]] = stream["config"]

        return context_streams

    @classmethod
    def _is_dagster_maintained(cls) -> bool:
        return True

    def _clean_connection_dict(self, d: dict[str, Any]) -> dict[str, Any]:
        """Resolves `{"env": ...}` values and renames a non-empty `connection_string` to `url`.

        The `connection_string` key is always removed from the result. A missing key is treated
        like an empty one instead of raising `KeyError`.
        """
        d = _process_env_vars(d)
        connection_string = d.pop("connection_string", None)
        if connection_string:
            d["url"] = connection_string
        return d

    def prepare_environment(self) -> dict[str, Any]:
        """Builds a mapping of connection name to the JSON-serialized, cleaned connection dict."""
        env = {}

        for conn in self.connections:
            d = self._clean_connection_dict(dict(conn))
            env[conn.name] = json.dumps(d)

        return env

    @contextlib.contextmanager
    def _setup_config(self) -> Generator[None, None, None]:
        """Uses environment variables to set the Sling source and target connections."""
        prepared_environment = self.prepare_environment()
        with environ(prepared_environment):
            yield

    def _clean_line(self, line: str) -> str:
        """Removes ANSI escape sequences from a line of output."""
        # NOTE(review): the replace also removes every literal "INF" substring, including ones
        # inside longer tokens (e.g. "INFO"); left unchanged because log parsing may rely on it.
        return ANSI_ESCAPE.sub("", line).replace("INF", "")

    def _process_stdout(self, stdout: IO[AnyStr], encoding="utf8") -> Iterator[str]:
        """Process stdout from the Sling CLI.

        Byte lines are decoded with `encoding` (undecodable bytes are replaced); lines that are
        already text are passed through. Every line is then cleaned with `_clean_line`.
        """
        for line in stdout:
            if isinstance(line, bytes):
                fmt_line = line.decode(encoding, errors="replace")
            else:
                fmt_line = line
            yield self._clean_line(fmt_line)

    def _exec_sling_cmd(
        self, cmd, stdin=None, stdout=PIPE, stderr=STDOUT, encoding="utf8"
    ) -> Generator[str, None, None]:
        """Runs `cmd` in a subprocess and yields its cleaned output lines.

        Raises:
            Exception: If the process exits with a non-zero return code.
        """
        # NOTE(review): shell=True means `cmd` is interpreted by the shell; callers must not
        # build it from untrusted input.
        with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:
            if proc.stdout:
                yield from self._process_stdout(proc.stdout, encoding=encoding)

            proc.wait()
            if proc.returncode != 0:
                # Interpolate the code into the message; passing it as a second positional
                # argument to Exception would leave the "%s" placeholder unformatted.
                raise Exception(f"Sling command failed with error code {proc.returncode}")

    def _parse_json_table_output(self, table_output: dict[str, Any]) -> list[dict[str, str]]:
        """Converts a `{"fields": [...], "rows": [[...], ...]}` payload into one dict per row."""
        column_keys: list[str] = table_output["fields"]
        rows: list[list[str]] = table_output["rows"]

        return [dict(zip(column_keys, row)) for row in rows]

    def get_column_info_for_table(self, target_name: str, table_name: str) -> list[dict[str, str]]:
        """Fetches column metadata for a given table in a Sling target and parses it into a list of
        dictionaries, keyed by column name.

        Args:
            target_name (str): The name of the target connection to use.
            table_name (str): The name of the table to fetch column metadata for.

        Returns:
            List[Dict[str, str]]: A list of dictionaries, keyed by column name, containing column metadata.
        """
        output = self.run_sling_cli(
            ["conns", "discover", target_name, "--pattern", table_name, "--columns"],
            force_json=True,
        )
        return self._parse_json_table_output(json.loads(output.strip()))

    def get_row_count_for_table(self, target_name: str, table_name: str) -> int:
        """Queries the target connection to get the row count for a given table.

        Args:
            target_name (str): The name of the target connection to use.
            table_name (str): The name of the table to fetch the row count for.

        Returns:
            int: The number of rows in the table.
        """
        # NOTE(review): `table_name` is interpolated into the statement verbatim; it must come
        # from a trusted source.
        select_stmt: str = f"select count(*) as ct from {table_name}"
        output = self.run_sling_cli(
            ["conns", "exec", target_name, select_stmt],
            force_json=True,
        )
        # The count is the first value of the first (only) result row.
        return int(
            next(iter(self._parse_json_table_output(json.loads(output.strip()))[0].values()))
        )

    def run_sling_cli(self, args: Sequence[str], force_json: bool = False) -> str:
        """Runs the Sling CLI with the given arguments and returns the output.

        Args:
            args (Sequence[str]): The arguments to pass to the Sling CLI.
            force_json (bool): When True, the `SLING_OUTPUT` environment variable is set to
                `json` for the duration of the call.

        Returns:
            str: The output from the Sling CLI.
        """
        with environ({"SLING_OUTPUT": "json"}) if force_json else contextlib.nullcontext():
            return subprocess.check_output(args=[sling.SLING_BIN, *args], text=True)

    def replicate(
        self,
        *,
        context: Union[OpExecutionContext, AssetExecutionContext],
        replication_config: Optional[SlingReplicationParam] = None,
        dagster_sling_translator: Optional[DagsterSlingTranslator] = None,
        debug: bool = False,
    ) -> SlingEventIterator[SlingEventType]:
        """Runs a Sling replication from the given replication config.

        Args:
            context: Asset or Op execution context.
            replication_config: The Sling replication config to use for the replication.
            dagster_sling_translator: The translator to use for the replication.
            debug: Whether to run the replication in debug mode.

        Returns:
            SlingEventIterator[SlingEventType]: An iterator of MaterializeResult events (when the
            context has an assets definition) or AssetMaterialization events (otherwise).
        """
        # When neither argument is supplied, fall back to the values stored on the metadata of
        # the first asset of the context's assets definition.
        if not (replication_config or dagster_sling_translator):
            metadata_by_key = context.assets_def.metadata_by_key
            first_asset_metadata = next(iter(metadata_by_key.values()))
            dagster_sling_translator = first_asset_metadata.get(METADATA_KEY_TRANSLATOR)
            replication_config = first_asset_metadata.get(METADATA_KEY_REPLICATION_CONFIG)

        dagster_sling_translator = dagster_sling_translator or DagsterSlingTranslator()
        replication_config_dict = dict(validate_replication(replication_config))
        return SlingEventIterator(
            self._replicate(
                context=context,
                replication_config=replication_config_dict,
                dagster_sling_translator=dagster_sling_translator,
                debug=debug,
            ),
            sling_cli=self,
            replication_config=replication_config_dict,
            context=context,
        )

    def _replicate(
        self,
        *,
        context: Union[OpExecutionContext, AssetExecutionContext],
        replication_config: dict[str, Any],
        dagster_sling_translator: DagsterSlingTranslator,
        debug: bool,
    ) -> Iterator[SlingEventType]:
        """Runs the replication through the Sling CLI and yields one event per stream.

        The replication config is written to a temporary JSON file that is handed to the CLI.
        All cleaned output lines are echoed to stdout and appended to `self._stdout`.
        """
        # Restrict the replication to the streams selected for this context, if any.
        context_streams = self._get_replication_streams_for_context(context)
        if context_streams:
            replication_config["streams"] = context_streams
        stream_definitions = get_streams_from_replication(replication_config)

        # extract the destination name from the replication config
        destination_name = replication_config.get("target")

        with self._setup_config():
            uid = uuid.uuid4()
            temp_dir = tempfile.gettempdir()
            temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
            # Copied inside `_setup_config` so the connection variables are included.
            env = os.environ.copy()

            with open(temp_file, "w") as file:
                json.dump(replication_config, file, cls=sling.JsonEncoder)

            logger.debug(f"Replication config: {replication_config}")

            debug_str = "-d" if debug else ""

            cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"

            logger.debug(f"Running Sling replication with command: {cmd}")

            # Get start time from wall clock
            start_time = time.time()
            results = sling._run(  # noqa
                cmd=cmd,
                temp_file=temp_file,
                return_output=True,
                env=env,
            )
            for row in results.split("\n"):
                clean_line = self._clean_line(row)
                sys.stdout.write(clean_line + "\n")
                self._stdout.append(clean_line)

            end_time = time.time()

            # TODO: In the future, it'd be nice to yield these materializations as they come in
            # rather than waiting until the end of the replication
            for stream in stream_definitions:
                asset_key = dagster_sling_translator.get_asset_key(stream)

                # Prefer the configured target object; fall back to the stream name.
                object_key = (stream.get("config") or {}).get("object")
                destination_stream_name = object_key or stream["name"]
                table_name = None
                if destination_name and destination_stream_name:
                    table_name = ".".join([destination_name, destination_stream_name])

                metadata = {
                    # Duration of the whole CLI run, not of this individual stream.
                    "elapsed_time": end_time - start_time,
                    "stream_name": stream["name"],
                    **TableMetadataSet(
                        table_name=table_name,
                    ),
                }

                if context.has_assets_def:
                    yield MaterializeResult(asset_key=asset_key, metadata=metadata)
                else:
                    yield AssetMaterialization(asset_key=asset_key, metadata=metadata)

    def stream_raw_logs(self) -> Generator[str, None, None]:
        """Returns a generator of raw logs from the Sling CLI."""
        yield from self._stdout
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _process_env_vars(config: dict[str, Any]) -> dict[str, Any]:
|
|
431
|
+
out = {}
|
|
432
|
+
for key, value in config.items():
|
|
433
|
+
if isinstance(value, dict) and len(value) == 1 and next(iter(value.keys())) == "env":
|
|
434
|
+
out[key] = EnvVar(next(iter(value.values()))).get_value()
|
|
435
|
+
else:
|
|
436
|
+
out[key] = value
|
|
437
|
+
return out
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections.abc import Iterator, Sequence
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
|
4
|
+
|
|
5
|
+
from dagster import (
|
|
6
|
+
AssetMaterialization,
|
|
7
|
+
MaterializeResult,
|
|
8
|
+
_check as check,
|
|
9
|
+
)
|
|
10
|
+
from dagster._annotations import experimental, public
|
|
11
|
+
from dagster._core.definitions.metadata.metadata_set import TableMetadataSet
|
|
12
|
+
from dagster._core.definitions.metadata.table import (
|
|
13
|
+
TableColumn,
|
|
14
|
+
TableColumnDep,
|
|
15
|
+
TableColumnLineage,
|
|
16
|
+
TableSchema,
|
|
17
|
+
)
|
|
18
|
+
from dagster._core.execution.context.asset_execution_context import AssetExecutionContext
|
|
19
|
+
from dagster._core.execution.context.op_execution_context import OpExecutionContext
|
|
20
|
+
from typing_extensions import TypeVar
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from dagster_sling.resources import SlingResource
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# The two Dagster event types that a Sling replication can emit.
SlingEventType = Union[AssetMaterialization, MaterializeResult]

# We define SlingEventIterator as a generic type for the sake of type hinting.
# This is so that users who inspect the type of the return value of `SlingResource.replicate()`
# will be able to see the inner type of the iterator, rather than just `SlingEventIterator`.
T = TypeVar("T", bound=SlingEventType)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _get_logs_for_stream(
|
|
35
|
+
stream_name: str,
|
|
36
|
+
sling_cli: "SlingResource",
|
|
37
|
+
) -> Sequence[str]:
|
|
38
|
+
"""Parses out the logs for a specific stream from the raw logs returned by the Sling CLI."""
|
|
39
|
+
corresponding_logs = []
|
|
40
|
+
recording_logs = False
|
|
41
|
+
for log in sling_cli.stream_raw_logs():
|
|
42
|
+
if (f"running stream {stream_name}") in log:
|
|
43
|
+
corresponding_logs.append(log)
|
|
44
|
+
recording_logs = True
|
|
45
|
+
elif recording_logs:
|
|
46
|
+
if len(log.strip()) == 0:
|
|
47
|
+
break
|
|
48
|
+
corresponding_logs.append(log)
|
|
49
|
+
return corresponding_logs
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _strip_quotes_target_table_name(target_table_name: str) -> str:
|
|
53
|
+
return target_table_name.replace('"', "")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Matches an "inserted <N> rows into <target> in ..." log line; group 1 is the row count and
# group 2 the (possibly quoted) target table name.
INSERT_REGEX = re.compile(r".*inserted (\d+) rows into (.*) in.*")


def _get_target_table_name(stream_name: str, sling_cli: "SlingResource") -> Optional[str]:
    """Return the unquoted target table named in the stream's first insert log line.

    Returns None when none of the stream's log lines matches `INSERT_REGEX`.
    """
    for log_line in _get_logs_for_stream(stream_name, sling_cli):
        insert_match = re.match(INSERT_REGEX, log_line)
        if insert_match is not None:
            return _strip_quotes_target_table_name(insert_match.group(2))
    return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Keys read from each column-info dict returned by `SlingResource.get_column_info_for_table`
# when `fetch_column_metadata` builds its column-name -> column-type map.
COLUMN_NAME_COL = "Column"
COLUMN_TYPE_COL = "General Type"

# Columns whose names start with this prefix are left out of the inferred column lineage
# (they are still listed in the column schema).
SLING_COLUMN_PREFIX = "_sling_"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def fetch_row_count_metadata(
    materialization: SlingEventType,
    sling_cli: "SlingResource",
    replication_config: dict[str, Any],
    context: Union[OpExecutionContext, AssetExecutionContext],
) -> dict[str, Any]:
    """Build row-count metadata for the table a materialization event was written to.

    Args:
        materialization: Event whose metadata carries the `stream_name` entry.
        sling_cli: Resource used to read the raw logs and query the row count.
        replication_config: Replication config; its `target` entry names the connection to query.
        context: Execution context, used only for logging a warning on failure.

    Returns:
        dict[str, Any]: The `TableMetadataSet(row_count=...)` entries, or an empty dict when the
        target table cannot be determined from the logs or the row-count query fails.

    Raises:
        KeyError: If `replication_config` has no `target` entry.
        Exception: If the event carries no metadata.
    """
    target_name = replication_config["target"]
    if not materialization.metadata:
        raise Exception("Missing required metadata to retrieve stream_name")

    stream_name = cast(str, materialization.metadata["stream_name"])
    target_table_name = _get_target_table_name(stream_name, sling_cli)

    if target_table_name:
        try:
            row_count = sling_cli.get_row_count_for_table(target_name, target_table_name)
            return dict(TableMetadataSet(row_count=row_count))
        except Exception as e:
            # Best effort: a failed lookup must not fail the run. The exception is passed as a
            # lazy %-argument rather than interpolated into the format string, so a "%" inside
            # its text cannot break log formatting.
            context.log.warning(
                "Failed to fetch row count for stream %s\nException: %s",
                stream_name,
                e,
                exc_info=True,
            )

    return {}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def fetch_column_metadata(
    materialization: SlingEventType,
    sling_cli: "SlingResource",
    replication_config: dict[str, Any],
    context: Union[OpExecutionContext, AssetExecutionContext],
) -> dict[str, Any]:
    """Build column schema (and, when inferable, column lineage) metadata for an event.

    Args:
        materialization: Event whose metadata carries the `stream_name` entry.
        sling_cli: Resource used to read the raw logs and discover the target table's columns.
        replication_config: Replication config; its `target` entry names the connection to query.
        context: Execution context; used for logging and, when it is an AssetExecutionContext,
            for looking up the event's upstream assets.

    Returns:
        dict[str, Any]: The `TableMetadataSet` entries for column schema and lineage, or an empty
        dict when the target table cannot be determined from the logs or discovery fails.

    Raises:
        KeyError: If `replication_config` has no `target` entry.
        Exception: If the event carries no metadata.
    """
    target_name = replication_config["target"]

    if not materialization.metadata:
        raise Exception("Missing required metadata to retrieve stream_name")

    stream_name = cast(str, materialization.metadata["stream_name"])

    upstream_assets = set()
    if isinstance(context, AssetExecutionContext):
        if materialization.asset_key:
            upstream_assets = context.assets_def.asset_deps[materialization.asset_key]

    target_table_name = _get_target_table_name(stream_name, sling_cli)

    if target_table_name:
        try:
            column_info = sling_cli.get_column_info_for_table(target_name, target_table_name)
            column_type_map = {
                col[COLUMN_NAME_COL]: col[COLUMN_TYPE_COL]
                for col in column_info
                if COLUMN_TYPE_COL in col
            }

            column_lineage = None
            # If there is only one upstream asset (typical case), we can infer column lineage
            # from the single upstream asset which is being replicated exactly.
            if len(upstream_assets) == 1:
                upstream_asset_key = next(iter(upstream_assets))
                column_lineage = TableColumnLineage(
                    deps_by_column={
                        column_name.lower(): [
                            TableColumnDep(
                                asset_key=upstream_asset_key,
                                column_name=column_name.lower(),
                            )
                        ]
                        for column_name in column_type_map
                        if not column_name.startswith(SLING_COLUMN_PREFIX)
                    }
                )

            return dict(
                TableMetadataSet(
                    column_schema=TableSchema(
                        columns=[
                            TableColumn(name=column_name.lower(), type=column_type)
                            for column_name, column_type in column_type_map.items()
                        ]
                    ),
                    column_lineage=column_lineage,
                )
            )
        except Exception as e:
            # Best effort: a failed lookup must not fail the run. The exception is passed as a
            # lazy %-argument rather than interpolated into the format string, so a "%" inside
            # its text cannot break log formatting.
            context.log.warning(
                "Failed to fetch column metadata for stream %s\nException: %s",
                stream_name,
                e,
                exc_info=True,
            )

    return {}
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class SlingEventIterator(Iterator[T]):
    """Iterator over Sling events that can chain metadata-enriching post-processing steps.

    It wraps an inner event iterator together with the Sling resource, replication config and
    execution context that the enrichment steps need.
    """

    def __init__(
        self,
        events: Iterator[T],
        sling_cli: "SlingResource",
        replication_config: dict[str, Any],
        context: Union[OpExecutionContext, AssetExecutionContext],
    ) -> None:
        self._inner_iterator = events
        self._sling_cli = sling_cli
        self._replication_config = replication_config
        self._context = context

    def __next__(self) -> T:
        return next(self._inner_iterator)

    def __iter__(self) -> "SlingEventIterator[T]":
        return self

    def _enriched_with(self, fetch_metadata) -> "SlingEventIterator[T]":
        """Wrap this iterator so each event also carries the entries from `fetch_metadata`.

        `fetch_metadata` is called as `fetch_metadata(event, sling_cli, replication_config,
        context)`. Keys already present on the event take precedence over fetched ones.
        """

        def _enriched_events() -> Iterator[T]:
            for event in self:
                extra_metadata = fetch_metadata(
                    event, self._sling_cli, self._replication_config, self._context
                )
                if event.metadata:
                    yield event._replace(metadata={**extra_metadata, **event.metadata})

        return SlingEventIterator[T](
            _enriched_events(), self._sling_cli, self._replication_config, self._context
        )

    @experimental
    @public
    def fetch_column_metadata(self) -> "SlingEventIterator":
        """Fetches column metadata for each table synced by the Sling CLI.

        Retrieves the column schema and lineage for each target table.

        Returns:
            SlingEventIterator: An iterator of Dagster events with column metadata attached.
        """
        # Inside a method body this name resolves to the module-level function, not the method.
        return self._enriched_with(fetch_column_metadata)

    @experimental
    @public
    def fetch_row_count(self) -> "SlingEventIterator":
        """Fetches row count metadata for each table synced by the Sling CLI.

        Retrieves the row count for each target table.

        Returns:
            SlingEventIterator: An iterator of Dagster events with row count metadata attached.
        """
        return self._enriched_with(fetch_row_count_metadata)
|