dagster-sling 0.25.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-sling might be problematic. Click here for more details.

@@ -0,0 +1,437 @@
1
+ import contextlib
2
+ import json
3
+ import os
4
+ import re
5
+ import subprocess
6
+ import sys
7
+ import tempfile
8
+ import time
9
+ import uuid
10
+ from collections.abc import Generator, Iterator, Sequence
11
+ from enum import Enum
12
+ from subprocess import PIPE, STDOUT, Popen
13
+ from typing import IO, Any, AnyStr, Optional, Union
14
+
15
+ import sling
16
+ from dagster import (
17
+ AssetExecutionContext,
18
+ AssetMaterialization,
19
+ ConfigurableResource,
20
+ EnvVar,
21
+ MaterializeResult,
22
+ OpExecutionContext,
23
+ PermissiveConfig,
24
+ get_dagster_logger,
25
+ )
26
+ from dagster._annotations import public
27
+ from dagster._core.definitions.metadata import TableMetadataSet
28
+ from dagster._utils.env import environ
29
+ from pydantic import Field
30
+
31
+ from dagster_sling.asset_decorator import (
32
+ METADATA_KEY_REPLICATION_CONFIG,
33
+ METADATA_KEY_TRANSLATOR,
34
+ get_streams_from_replication,
35
+ streams_with_default_dagster_meta,
36
+ )
37
+ from dagster_sling.dagster_sling_translator import DagsterSlingTranslator
38
+ from dagster_sling.sling_event_iterator import SlingEventIterator, SlingEventType
39
+ from dagster_sling.sling_replication import SlingReplicationParam, validate_replication
40
+
41
# Module-wide Dagster logger, used for warnings and debug output in this file.
logger = get_dagster_logger()

# Matches ANSI terminal escape sequences: ESC followed either by a single byte in the
# ranges `@-Z` / `\-_`, or by a `[`-introduced sequence made of parameter bytes (`0-?`),
# intermediate bytes (space through `/`) and one final byte (`@-~`).
ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
44
+
45
+
46
@public
class SlingMode(str, Enum):
    """The mode to use when syncing.

    The enum mixes in ``str``, so every member compares equal to the plain string value
    assigned to it below.

    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/run/configuration#modes
    """

    INCREMENTAL = "incremental"
    TRUNCATE = "truncate"
    FULL_REFRESH = "full-refresh"
    SNAPSHOT = "snapshot"
    BACKFILL = "backfill"
58
+
59
+
60
@public
class SlingConnectionResource(PermissiveConfig):
    """A representation of a connection to a database or file to be used by Sling. This resource can be used as a source or a target for a Sling sync.

    Reference the Sling docs for more information on possible connection types and parameters: https://docs.slingdata.io/connections

    The name of the connection is passed to Sling and must match the name of the connection provided in the replication configuration: https://docs.slingdata.io/sling-cli/run/configuration/replication
    You may provide either a connection string or keyword arguments for the connection.

    Examples:
        Creating a Sling Connection for a file, such as CSV or JSON:

        .. code-block:: python

             source = SlingConnectionResource(name="MY_FILE", type="file")

        Create a Sling Connection for a Postgres or MySQL database, using a connection string:

        .. code-block:: python

            postgres_conn = SlingConnectionResource(name="MY_POSTGRES", type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))
            mysql_conn = SlingConnectionResource(name="MY_MYSQL", type="mysql", connection_string="mysql://user:password@host:port/schema")

        Create a Sling Connection for a Postgres or Snowflake database, using keyword arguments:

        .. code-block:: python

            postgres_conn = SlingConnectionResource(
                name="MY_OTHER_POSTGRES",
                type="postgres",
                host="host",
                user="hunter42",
                password=EnvVar("POSTGRES_PASSWORD")
            )

            snowflake_conn = SlingConnectionResource(
                name="MY_SNOWFLAKE",
                type="snowflake",
                host=EnvVar("SNOWFLAKE_HOST"),
                user=EnvVar("SNOWFLAKE_USER"),
                database=EnvVar("SNOWFLAKE_DATABASE"),
                password=EnvVar("SNOWFLAKE_PASSWORD"),
                role=EnvVar("SNOWFLAKE_ROLE")
            )
    """

    # Connection name; also used as the environment-variable name under which the
    # serialized connection is exported by `SlingResource.prepare_environment`.
    name: str = Field(
        description="The name of the connection, must match the name in your Sling replication configuration."
    )
    type: str = Field(
        description="Type of the source connection, must match the Sling connection types. Use 'file' for local storage."
    )
    # When set, `SlingResource._clean_connection_dict` forwards this value to Sling
    # under the key `url` and drops the `connection_string` key.
    connection_string: Optional[str] = Field(
        description="The optional connection string for the source database, if not using keyword arguments.",
        default=None,
    )
116
+
117
+
118
class SlingResource(ConfigurableResource):
    """Resource for interacting with the Sling package. This resource can be used to run Sling replications.

    Args:
        connections (List[SlingConnectionResource]): A list of connections to use for the replication.

    Examples:
        .. code-block:: python

            from dagster_sling.resources import SlingResource, SlingConnectionResource

            sling_resource = SlingResource(
                connections=[
                    SlingConnectionResource(
                        name="MY_POSTGRES",
                        type="postgres",
                        connection_string=EnvVar("POSTGRES_CONNECTION_STRING"),
                    ),
                    SlingConnectionResource(
                        name="MY_SNOWFLAKE",
                        type="snowflake",
                        host=EnvVar("SNOWFLAKE_HOST"),
                        user=EnvVar("SNOWFLAKE_USER"),
                        database=EnvVar("SNOWFLAKE_DATABASE"),
                        password=EnvVar("SNOWFLAKE_PASSWORD"),
                        role=EnvVar("SNOWFLAKE_ROLE"),
                    ),
                ]
            )
    """

    connections: list[SlingConnectionResource] = []
    # Cleaned lines of Sling CLI output. `_replicate` appends to this list and
    # `stream_raw_logs` replays it.
    # NOTE(review): nothing in this class ever clears the list, so output from repeated
    # replications on the same resource instance appears to accumulate - confirm intended.
    _stdout: list[str] = []

    @staticmethod
    def _get_replication_streams_for_context(
        context: Union[OpExecutionContext, AssetExecutionContext],
    ) -> dict[str, Any]:
        """Computes the sling replication streams config for a given execution context with an
        assets def, possibly involving a subset selection of sling assets.

        Args:
            context: Asset or Op execution context.

        Returns:
            A mapping of stream name to stream config. It is empty when the context has no
            assets definition, or when a run config is present but carries no
            ``context_streams`` entry for the op; callers then perform no subsetting.
        """
        if not context.has_assets_def:
            no_assets_def_message = (
                "\n"
                "The current execution context has no backing AssetsDefinition object. Therefore, no\n"
                "sling assets subsetting will be performed...\n"
            )
            logger.warning(no_assets_def_message)
            return {}

        context_streams = {}
        assets_def = context.assets_def
        run_config = context.run_config
        if run_config:  # triggered via sensor
            run_config_ops = run_config.get("ops", {})
            if isinstance(run_config_ops, dict):
                assets_op_config = run_config_ops.get(assets_def.op.name, {}).get("config", {})
            else:
                assets_op_config = {}
            context_streams = assets_op_config.get("context_streams", {})
            if not context_streams:
                no_context_streams_config_message = (
                    "\n"
                    "It was expected that your `run_config` would provide a `context_streams` config for\n"
                    f"the op {assets_def.op.name}. Instead, the received value for this op config was\n"
                    f"{assets_op_config}.\n"
                    "\n"
                    "NO ASSET SUBSETTING WILL BE PERFORMED!\n"
                    "\n"
                    "If that was your intention, you can safely ignore this message. Otherwise, provide\n"
                    "the mentioned `context_streams` config for executing only your desired asset subset.\n"
                )
                logger.warning(no_context_streams_config_message)
        else:
            # No run config: derive the streams from the replication config stored on the
            # first asset's metadata, keeping only streams whose asset key is selected.
            metadata_by_key = assets_def.metadata_by_key
            first_asset_metadata = next(iter(metadata_by_key.values()))
            replication_config: dict[str, Any] = first_asset_metadata.get(
                METADATA_KEY_REPLICATION_CONFIG, {}
            )
            dagster_sling_translator: DagsterSlingTranslator = first_asset_metadata.get(
                METADATA_KEY_TRANSLATOR, DagsterSlingTranslator()
            )
            raw_streams = get_streams_from_replication(replication_config)
            streams = streams_with_default_dagster_meta(raw_streams, replication_config)
            selected_asset_keys = context.selected_asset_keys
            for stream in streams:
                asset_key = dagster_sling_translator.get_asset_key(stream)
                if asset_key in selected_asset_keys:
                    context_streams[stream["name"]] = stream["config"]

        return context_streams

    @classmethod
    def _is_dagster_maintained(cls) -> bool:
        return True

    def _clean_connection_dict(self, d: dict[str, Any]) -> dict[str, Any]:
        """Resolves env-var references in a connection dict and renames its connection string.

        Args:
            d: The connection fields as a plain dict.

        Returns:
            A new dict in which ``connection_string`` has been removed and, when it held a
            truthy value, re-exposed under the key ``url``.
        """
        d = _process_env_vars(d)
        # `pop` with a default tolerates dicts that have no `connection_string` key at all.
        connection_string = d.pop("connection_string", None)
        if connection_string:
            d["url"] = connection_string
        return d

    def prepare_environment(self) -> dict[str, Any]:
        """Builds the environment variables that describe the configured connections.

        Returns:
            A mapping of connection name to the JSON-serialized, cleaned connection dict.
        """
        return {
            conn.name: json.dumps(self._clean_connection_dict(dict(conn)))
            for conn in self.connections
        }

    @contextlib.contextmanager
    def _setup_config(self) -> Generator[None, None, None]:
        """Uses environment variables to set the Sling source and target connections."""
        prepared_environment = self.prepare_environment()
        with environ(prepared_environment):
            yield

    def _clean_line(self, line: str) -> str:
        """Removes ANSI escape sequences and the stand-alone ``INF`` token from a line of output.

        Only ``INF`` delimited by word boundaries is dropped, so identifiers that merely
        contain those letters (for example ``INFORMATION_SCHEMA``) are left intact.
        """
        return re.sub(r"\bINF\b", "", ANSI_ESCAPE.sub("", line))

    def _process_stdout(self, stdout: IO[AnyStr], encoding="utf8") -> Iterator[str]:
        """Process stdout from the Sling CLI.

        Args:
            stdout: The stream to read lines from.
            encoding: Encoding used to decode ``bytes`` lines; undecodable bytes are replaced.

        Yields:
            Each line, decoded when necessary and passed through ``_clean_line``.
        """
        for line in stdout:
            # Decode only when needed; a text-mode stream already yields `str` lines.
            text = line.decode(encoding, errors="replace") if isinstance(line, bytes) else line
            yield self._clean_line(text)

    def _exec_sling_cmd(
        self, cmd, stdin=None, stdout=PIPE, stderr=STDOUT, encoding="utf8"
    ) -> Generator[str, None, None]:
        """Runs ``cmd`` through the shell and yields its cleaned output line by line.

        Raises:
            Exception: If the process exits with a non-zero return code.
        """
        # NOTE(review): `shell=True` hands `cmd` to the shell verbatim; callers must not
        # build it from untrusted input.
        with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:
            if proc.stdout:
                yield from self._process_stdout(proc.stdout, encoding=encoding)

            proc.wait()
            if proc.returncode != 0:
                raise Exception(f"Sling command failed with error code {proc.returncode}")

    def _parse_json_table_output(self, table_output: dict[str, Any]) -> list[dict[str, str]]:
        """Converts a ``{"fields": [...], "rows": [[...], ...]}`` payload into one dict per row."""
        column_keys: list[str] = table_output["fields"]
        rows: list[list[str]] = table_output["rows"]

        return [dict(zip(column_keys, row)) for row in rows]

    def get_column_info_for_table(self, target_name: str, table_name: str) -> list[dict[str, str]]:
        """Fetches column metadata for a given table in a Sling target and parses it into a list of
        dictionaries, keyed by column name.

        Args:
            target_name (str): The name of the target connection to use.
            table_name (str): The name of the table to fetch column metadata for.

        Returns:
            List[Dict[str, str]]: A list of dictionaries, keyed by column name, containing column metadata.
        """
        output = self.run_sling_cli(
            ["conns", "discover", target_name, "--pattern", table_name, "--columns"],
            force_json=True,
        )
        return self._parse_json_table_output(json.loads(output.strip()))

    def get_row_count_for_table(self, target_name: str, table_name: str) -> int:
        """Queries the target connection to get the row count for a given table.

        Args:
            target_name (str): The name of the target connection to use.
            table_name (str): The name of the table to fetch the row count for.

        Returns:
            int: The number of rows in the table.
        """
        # NOTE(review): `table_name` is interpolated into the statement as-is, so it must
        # come from a trusted source.
        select_stmt: str = f"select count(*) as ct from {table_name}"
        output = self.run_sling_cli(
            ["conns", "exec", target_name, select_stmt],
            force_json=True,
        )
        first_row = self._parse_json_table_output(json.loads(output.strip()))[0]
        # The query selects a single column, so the first value of the first row is the count.
        return int(next(iter(first_row.values())))

    def run_sling_cli(self, args: Sequence[str], force_json: bool = False) -> str:
        """Runs the Sling CLI with the given arguments and returns the output.

        Args:
            args (Sequence[str]): The arguments to pass to the Sling CLI.
            force_json (bool): When True, ``SLING_OUTPUT=json`` is set in the environment
                for the duration of the call.

        Returns:
            str: The output from the Sling CLI.
        """
        with environ({"SLING_OUTPUT": "json"}) if force_json else contextlib.nullcontext():
            return subprocess.check_output(args=[sling.SLING_BIN, *args], text=True)

    def replicate(
        self,
        *,
        context: Union[OpExecutionContext, AssetExecutionContext],
        replication_config: Optional[SlingReplicationParam] = None,
        dagster_sling_translator: Optional[DagsterSlingTranslator] = None,
        debug: bool = False,
    ) -> SlingEventIterator[SlingEventType]:
        """Runs a Sling replication from the given replication config.

        Args:
            context: Asset or Op execution context.
            replication_config: The Sling replication config to use for the replication.
            dagster_sling_translator: The translator to use for the replication.
            debug: Whether to run the replication in debug mode.

        Returns:
            SlingEventIterator[SlingEventType]: An iterator of MaterializeResult events when
            the context has an assets definition, and of AssetMaterialization events otherwise.
        """
        if not (replication_config or dagster_sling_translator):
            # Neither was passed explicitly: fall back to what is stored on the metadata
            # of the first asset in the assets definition.
            metadata_by_key = context.assets_def.metadata_by_key
            first_asset_metadata = next(iter(metadata_by_key.values()))
            dagster_sling_translator = first_asset_metadata.get(METADATA_KEY_TRANSLATOR)
            replication_config = first_asset_metadata.get(METADATA_KEY_REPLICATION_CONFIG)

        # If the translator was defined neither on metadata nor through the parameter,
        # use the default constructor.
        dagster_sling_translator = dagster_sling_translator or DagsterSlingTranslator()
        # Convert to a dict so that `_replicate` can update the streams in place.
        replication_config_dict = dict(validate_replication(replication_config))
        return SlingEventIterator(
            self._replicate(
                context=context,
                replication_config=replication_config_dict,
                dagster_sling_translator=dagster_sling_translator,
                debug=debug,
            ),
            sling_cli=self,
            replication_config=replication_config_dict,
            context=context,
        )

    def _replicate(
        self,
        *,
        context: Union[OpExecutionContext, AssetExecutionContext],
        replication_config: dict[str, Any],
        dagster_sling_translator: DagsterSlingTranslator,
        debug: bool,
    ) -> Iterator[SlingEventType]:
        """Runs the replication and yields one materialization event per stream.

        The replication config is written to a temporary JSON file and passed to the Sling
        binary. The output is cleaned, echoed to ``sys.stdout`` and recorded in
        ``self._stdout``. Events are yielded only after the whole run has finished.
        """
        # Restrict the replication to the streams selected for this context, if any.
        context_streams = self._get_replication_streams_for_context(context)
        if context_streams:
            replication_config.update({"streams": context_streams})
        stream_definitions = get_streams_from_replication(replication_config)

        # extract the destination name from the replication config
        destination_name = replication_config.get("target")

        with self._setup_config():
            uid = uuid.uuid4()
            temp_dir = tempfile.gettempdir()
            temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
            # Copied inside `_setup_config` so the connection variables are included.
            env = os.environ.copy()

            with open(temp_file, "w") as file:
                json.dump(replication_config, file, cls=sling.JsonEncoder)

            logger.debug(f"Replication config: {replication_config}")

            debug_str = "-d" if debug else ""

            cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"

            logger.debug(f"Running Sling replication with command: {cmd}")

            # Get start time from wall clock
            start_time = time.time()
            # NOTE(review): `temp_file` is handed to `sling._run`; presumably that helper
            # removes the file afterwards - verify, since nothing here deletes it.
            results = sling._run(  # noqa
                cmd=cmd,
                temp_file=temp_file,
                return_output=True,
                env=env,
            )
            for row in results.split("\n"):
                clean_line = self._clean_line(row)
                sys.stdout.write(clean_line + "\n")
                self._stdout.append(clean_line)

            end_time = time.time()

        # TODO: In the future, it'd be nice to yield these materializations as they come in
        # rather than waiting until the end of the replication
        for stream in stream_definitions:
            asset_key = dagster_sling_translator.get_asset_key(stream)

            object_key = (stream.get("config") or {}).get("object")
            destination_stream_name = object_key or stream["name"]
            table_name = None
            if destination_name and destination_stream_name:
                table_name = ".".join([destination_name, destination_stream_name])

            metadata = {
                "elapsed_time": end_time - start_time,
                "stream_name": stream["name"],
                **TableMetadataSet(
                    table_name=table_name,
                ),
            }

            if context.has_assets_def:
                yield MaterializeResult(asset_key=asset_key, metadata=metadata)
            else:
                yield AssetMaterialization(asset_key=asset_key, metadata=metadata)

    def stream_raw_logs(self) -> Generator[str, None, None]:
        """Returns a generator of raw logs from the Sling CLI."""
        yield from self._stdout
428
+
429
+
430
+ def _process_env_vars(config: dict[str, Any]) -> dict[str, Any]:
431
+ out = {}
432
+ for key, value in config.items():
433
+ if isinstance(value, dict) and len(value) == 1 and next(iter(value.keys())) == "env":
434
+ out[key] = EnvVar(next(iter(value.values()))).get_value()
435
+ else:
436
+ out[key] = value
437
+ return out
@@ -0,0 +1,238 @@
1
+ import re
2
+ from collections.abc import Iterator, Sequence
3
+ from typing import TYPE_CHECKING, Any, Optional, Union, cast
4
+
5
+ from dagster import (
6
+ AssetMaterialization,
7
+ MaterializeResult,
8
+ _check as check,
9
+ )
10
+ from dagster._annotations import experimental, public
11
+ from dagster._core.definitions.metadata.metadata_set import TableMetadataSet
12
+ from dagster._core.definitions.metadata.table import (
13
+ TableColumn,
14
+ TableColumnDep,
15
+ TableColumnLineage,
16
+ TableSchema,
17
+ )
18
+ from dagster._core.execution.context.asset_execution_context import AssetExecutionContext
19
+ from dagster._core.execution.context.op_execution_context import OpExecutionContext
20
+ from typing_extensions import TypeVar
21
+
22
+ if TYPE_CHECKING:
23
+ from dagster_sling.resources import SlingResource
24
+
25
+
26
# The two kinds of Dagster event that a Sling replication iterator can carry.
SlingEventType = Union[AssetMaterialization, MaterializeResult]

# We define SlingEventIterator as a generic type for the sake of type hinting.
# This is so that users who inspect the type of the return value of `SlingResource.replicate()`
# will be able to see the inner type of the iterator, rather than just `SlingEventIterator`.
T = TypeVar("T", bound=SlingEventType)
32
+
33
+
34
+ def _get_logs_for_stream(
35
+ stream_name: str,
36
+ sling_cli: "SlingResource",
37
+ ) -> Sequence[str]:
38
+ """Parses out the logs for a specific stream from the raw logs returned by the Sling CLI."""
39
+ corresponding_logs = []
40
+ recording_logs = False
41
+ for log in sling_cli.stream_raw_logs():
42
+ if (f"running stream {stream_name}") in log:
43
+ corresponding_logs.append(log)
44
+ recording_logs = True
45
+ elif recording_logs:
46
+ if len(log.strip()) == 0:
47
+ break
48
+ corresponding_logs.append(log)
49
+ return corresponding_logs
50
+
51
+
52
+ def _strip_quotes_target_table_name(target_table_name: str) -> str:
53
+ return target_table_name.replace('"', "")
54
+
55
+
56
# Captures the row count (group 1) and the table name (group 2) from an "inserted N rows
# into <table> in ..." log line. Group 2 is greedy, so it extends to the last " in" on the line.
INSERT_REGEX = re.compile(r".*inserted (\d+) rows into (.*) in.*")


def _get_target_table_name(stream_name: str, sling_cli: "SlingResource") -> Optional[str]:
    """Extracts the target table name from the logs for a specific stream.

    Returns the table name from the first of the stream's log lines that matches
    ``INSERT_REGEX``, with double quotes stripped. Returns ``None`` when no line matches.
    """
    for entry in _get_logs_for_stream(stream_name, sling_cli):
        found = INSERT_REGEX.match(entry)
        if found:
            return _strip_quotes_target_table_name(found.group(2))
    return None
68
+
69
+
70
# Keys read from each column-info dict returned by `SlingResource.get_column_info_for_table`:
# the column's name and its type.
COLUMN_NAME_COL = "Column"
COLUMN_TYPE_COL = "General Type"

# Columns whose name starts with this prefix are left out of the inferred column lineage.
SLING_COLUMN_PREFIX = "_sling_"
74
+
75
+
76
def fetch_row_count_metadata(
    materialization: SlingEventType,
    sling_cli: "SlingResource",
    replication_config: dict[str, Any],
    context: Union[OpExecutionContext, AssetExecutionContext],
) -> dict[str, Any]:
    """Looks up the row count of the table a stream was replicated into.

    Args:
        materialization: The event for the stream; its metadata must hold ``stream_name``.
        sling_cli: The resource used to read the run's logs and to query the target.
        replication_config: The replication config; its ``target`` names the connection.
        context: Execution context, used only to log a warning when the lookup fails.

    Returns:
        The row-count table metadata as a dict. The dict is empty when the target table
        name cannot be found in the logs or when the row-count query fails.

    Raises:
        KeyError: If ``replication_config`` has no ``target`` entry.
        Exception: If the materialization carries no metadata.
    """
    target_name = replication_config["target"]
    if not materialization.metadata:
        raise Exception("Missing required metadata to retrieve stream_name")

    stream_name = cast(str, materialization.metadata["stream_name"])
    target_table_name = _get_target_table_name(stream_name, sling_cli)
    if not target_table_name:
        return {}

    try:
        row_count = sling_cli.get_row_count_for_table(target_name, target_table_name)
        return dict(TableMetadataSet(row_count=row_count))
    except Exception as e:
        # Best effort: a failed row count must not fail the materialization itself.
        # The exception is passed as a lazy argument so that a `%` in its text cannot
        # break the log format string.
        context.log.warning(
            "Failed to fetch row count for stream %s\nException: %s",
            stream_name,
            e,
            exc_info=True,
        )

    return {}
101
+
102
+
103
def fetch_column_metadata(
    materialization: SlingEventType,
    sling_cli: "SlingResource",
    replication_config: dict[str, Any],
    context: Union[OpExecutionContext, AssetExecutionContext],
) -> dict[str, Any]:
    """Builds column schema (and, when possible, column lineage) metadata for a stream.

    Args:
        materialization: The event for the stream; its metadata must hold ``stream_name``.
        sling_cli: The resource used to read the run's logs and to query the target.
        replication_config: The replication config; its ``target`` names the connection.
        context: Execution context. For an ``AssetExecutionContext`` it also supplies the
            upstream assets of the materialized asset. It is used for logging in all cases.

    Returns:
        The column schema / lineage table metadata as a dict, with column names
        lower-cased. The dict is empty when the target table name cannot be found in the
        logs or when fetching the column information fails.

    Raises:
        KeyError: If ``replication_config`` has no ``target`` entry.
        Exception: If the materialization carries no metadata.
    """
    target_name = replication_config["target"]

    if not materialization.metadata:
        raise Exception("Missing required metadata to retrieve stream_name")

    stream_name = cast(str, materialization.metadata["stream_name"])

    upstream_assets = set()
    if isinstance(context, AssetExecutionContext) and materialization.asset_key:
        upstream_assets = context.assets_def.asset_deps[materialization.asset_key]

    target_table_name = _get_target_table_name(stream_name, sling_cli)
    if not target_table_name:
        return {}

    try:
        column_info = sling_cli.get_column_info_for_table(target_name, target_table_name)
        column_type_map = {
            col[COLUMN_NAME_COL]: col[COLUMN_TYPE_COL]
            for col in column_info
            if COLUMN_TYPE_COL in col
        }

        column_lineage = None
        # If there is only one upstream asset (typical case), we can infer column lineage
        # from the single upstream asset which is being replicated exactly.
        if len(upstream_assets) == 1:
            upstream_asset_key = next(iter(upstream_assets))
            column_lineage = TableColumnLineage(
                deps_by_column={
                    column_name.lower(): [
                        TableColumnDep(
                            asset_key=upstream_asset_key,
                            column_name=column_name.lower(),
                        )
                    ]
                    for column_name in column_type_map
                    # Compare case-insensitively, matching the lower-casing applied to
                    # column names everywhere else in this function.
                    if not column_name.lower().startswith(SLING_COLUMN_PREFIX)
                }
            )

        return dict(
            TableMetadataSet(
                column_schema=TableSchema(
                    columns=[
                        TableColumn(name=column_name.lower(), type=column_type)
                        for column_name, column_type in column_type_map.items()
                    ]
                ),
                column_lineage=column_lineage,
            )
        )
    except Exception as e:
        # Best effort: failing to describe the table must not fail the materialization.
        # The exception is passed as a lazy argument so that a `%` in its text cannot
        # break the log format string.
        context.log.warning(
            "Failed to fetch column metadata for stream %s\nException: %s",
            stream_name,
            e,
            exc_info=True,
        )

    return {}
169
+
170
+
171
class SlingEventIterator(Iterator[T]):
    """Iterator over Sling events that can also post-process them.

    It wraps an inner iterator of events and keeps the resource, replication config and
    execution context needed to look up extra metadata (column schema, row counts) for
    each event.
    """

    def __init__(
        self,
        events: Iterator[T],
        sling_cli: "SlingResource",
        replication_config: dict[str, Any],
        context: Union[OpExecutionContext, AssetExecutionContext],
    ) -> None:
        self._inner_iterator = events
        self._sling_cli = sling_cli
        self._replication_config = replication_config
        self._context = context

    def __next__(self) -> T:
        return next(self._inner_iterator)

    def __iter__(self) -> "SlingEventIterator[T]":
        return self

    def _with_extra_metadata(self, fetch_extra) -> "SlingEventIterator":
        """Wraps this iterator so that each event gains the metadata ``fetch_extra`` returns.

        Keys already present on the event win over fetched keys with the same name. Events
        are consumed lazily from this iterator.
        """

        def _enriched() -> Iterator[T]:
            for event in self:
                extra = fetch_extra(
                    event, self._sling_cli, self._replication_config, self._context
                )
                if event.metadata:
                    yield event._replace(metadata={**extra, **event.metadata})

        return SlingEventIterator[T](
            _enriched(), self._sling_cli, self._replication_config, self._context
        )

    @experimental
    @public
    def fetch_column_metadata(self) -> "SlingEventIterator":
        """Fetches column metadata for each table synced by the Sling CLI.

        Retrieves the column schema and lineage for each target table.

        Returns:
            SlingEventIterator: An iterator of Dagster events with column metadata attached.
        """
        # Inside a method body this name resolves to the module-level function,
        # not to this method.
        return self._with_extra_metadata(fetch_column_metadata)

    @experimental
    @public
    def fetch_row_count(self) -> "SlingEventIterator":
        """Fetches row count metadata for each table synced by the Sling CLI.

        Retrieves the row count for each target table.

        Returns:
            SlingEventIterator: An iterator of Dagster events with row count metadata attached.
        """
        return self._with_extra_metadata(fetch_row_count_metadata)