dagster-sling 0.26.5__py3-none-any.whl → 0.26.6rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-sling might be problematic. Click here for more details.
- dagster_sling/resources.py +248 -45
- dagster_sling/version.py +1 -1
- {dagster_sling-0.26.5.dist-info → dagster_sling-0.26.6rc0.dist-info}/METADATA +2 -2
- {dagster_sling-0.26.5.dist-info → dagster_sling-0.26.6rc0.dist-info}/RECORD +7 -7
- {dagster_sling-0.26.5.dist-info → dagster_sling-0.26.6rc0.dist-info}/LICENSE +0 -0
- {dagster_sling-0.26.5.dist-info → dagster_sling-0.26.6rc0.dist-info}/WHEEL +0 -0
- {dagster_sling-0.26.5.dist-info → dagster_sling-0.26.6rc0.dist-info}/top_level.txt +0 -0
dagster_sling/resources.py
CHANGED
|
@@ -3,7 +3,6 @@ import json
|
|
|
3
3
|
import os
|
|
4
4
|
import re
|
|
5
5
|
import subprocess
|
|
6
|
-
import sys
|
|
7
6
|
import tempfile
|
|
8
7
|
import time
|
|
9
8
|
import uuid
|
|
@@ -216,6 +215,53 @@ class SlingResource(ConfigurableResource):
|
|
|
216
215
|
del d["connection_string"]
|
|
217
216
|
return d
|
|
218
217
|
|
|
218
|
+
def _query_metadata(
|
|
219
|
+
self, metadata_string: str, start_time: float, base_metadata: Union[list, None] = None
|
|
220
|
+
):
|
|
221
|
+
"""Metadata quering using regular expression from standard sling log.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
metadata_string (str): raw log string containing log/metadata from sling cli run
|
|
225
|
+
start_time (float): start time that will be assign to calculate elapse
|
|
226
|
+
base_metadata (list, Null): list of metadata to be query from string
|
|
227
|
+
|
|
228
|
+
Return:
|
|
229
|
+
final_dict (dict): Final metadata idct contain metadata query from string
|
|
230
|
+
"""
|
|
231
|
+
if base_metadata is None:
|
|
232
|
+
base_metadata = [
|
|
233
|
+
"stream_name",
|
|
234
|
+
"row_count",
|
|
235
|
+
"destination_table",
|
|
236
|
+
"destination_file",
|
|
237
|
+
"elapsed_time",
|
|
238
|
+
]
|
|
239
|
+
|
|
240
|
+
tmp = None
|
|
241
|
+
tmp_metadata = {}
|
|
242
|
+
end_time = time.time()
|
|
243
|
+
target_type = re.findall(r"writing to target ([\w\s]*) ", metadata_string)
|
|
244
|
+
if target_type and target_type[0] == "database":
|
|
245
|
+
tmp = re.findall(r"inserted ([0-9]*) rows .*into ([\w.:/;-_\"\'{}]*)", metadata_string)
|
|
246
|
+
elif target_type and target_type[0] == "file system":
|
|
247
|
+
tmp = re.findall(r"wrote ([0-9]*) rows .*to ([\w.:/;-_\"\'{}]*)", metadata_string)
|
|
248
|
+
else:
|
|
249
|
+
tmp = re.findall(r"inserted ([0-9]*) rows .*into ([\w.:/;-_\"\'{}]*)", metadata_string)
|
|
250
|
+
|
|
251
|
+
if tmp:
|
|
252
|
+
if target_type and target_type[0] == "database":
|
|
253
|
+
tmp_metadata["destination_table"] = re.sub(r"[^\w\s.]", "", tmp[0][1])
|
|
254
|
+
if target_type and target_type[0] == "file system":
|
|
255
|
+
tmp_metadata["destination_file"] = re.sub(r"[^\w\s.]", "", tmp[0][1])
|
|
256
|
+
tmp_metadata["elapsed_time"] = end_time - start_time
|
|
257
|
+
tmp_metadata["row_count"] = tmp[0][0]
|
|
258
|
+
|
|
259
|
+
final_dict = {}
|
|
260
|
+
for k in base_metadata:
|
|
261
|
+
if tmp_metadata.get(k):
|
|
262
|
+
final_dict[k] = tmp_metadata.get(k)
|
|
263
|
+
return final_dict
|
|
264
|
+
|
|
219
265
|
def prepare_environment(self) -> dict[str, Any]:
|
|
220
266
|
env = {}
|
|
221
267
|
|
|
@@ -236,6 +282,23 @@ class SlingResource(ConfigurableResource):
|
|
|
236
282
|
"""Removes ANSI escape sequences from a line of output."""
|
|
237
283
|
return ANSI_ESCAPE.sub("", line).replace("INF", "")
|
|
238
284
|
|
|
285
|
+
def _clean_timestamp_log(self, line: str):
|
|
286
|
+
"""Remove timestamp from log gather from sling cli to reduce redundency in dagster log.
|
|
287
|
+
|
|
288
|
+
Args:
|
|
289
|
+
line (str): line of log gather from cli to be cleaned
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
text: cleaned log consist only of log data
|
|
293
|
+
|
|
294
|
+
"""
|
|
295
|
+
tmp = self._clean_line(line)
|
|
296
|
+
try:
|
|
297
|
+
text = tmp.split(" ")[1]
|
|
298
|
+
except:
|
|
299
|
+
text = tmp
|
|
300
|
+
return text
|
|
301
|
+
|
|
239
302
|
def _process_stdout(self, stdout: IO[AnyStr], encoding="utf8") -> Iterator[str]:
|
|
240
303
|
"""Process stdout from the Sling CLI."""
|
|
241
304
|
for line in stdout:
|
|
@@ -315,6 +378,7 @@ class SlingResource(ConfigurableResource):
|
|
|
315
378
|
replication_config: Optional[SlingReplicationParam] = None,
|
|
316
379
|
dagster_sling_translator: Optional[DagsterSlingTranslator] = None,
|
|
317
380
|
debug: bool = False,
|
|
381
|
+
stream: bool = False,
|
|
318
382
|
) -> SlingEventIterator[SlingEventType]:
|
|
319
383
|
"""Runs a Sling replication from the given replication config.
|
|
320
384
|
|
|
@@ -341,6 +405,7 @@ class SlingResource(ConfigurableResource):
|
|
|
341
405
|
replication_config=replication_config_dict,
|
|
342
406
|
dagster_sling_translator=dagster_sling_translator,
|
|
343
407
|
debug=debug,
|
|
408
|
+
stream=stream,
|
|
344
409
|
),
|
|
345
410
|
sling_cli=self,
|
|
346
411
|
replication_config=replication_config_dict,
|
|
@@ -354,11 +419,47 @@ class SlingResource(ConfigurableResource):
|
|
|
354
419
|
replication_config: dict[str, Any],
|
|
355
420
|
dagster_sling_translator: DagsterSlingTranslator,
|
|
356
421
|
debug: bool,
|
|
422
|
+
stream: bool = False,
|
|
357
423
|
) -> Iterator[SlingEventType]:
|
|
358
424
|
# if translator has not been defined on metadata _or_ through param, then use the default constructor
|
|
359
425
|
|
|
426
|
+
with self._setup_config():
|
|
427
|
+
env = os.environ.copy()
|
|
428
|
+
|
|
429
|
+
if not stream:
|
|
430
|
+
##### Old method use _run which is not streamable #####
|
|
431
|
+
generator = self._batch_sling_replicate(
|
|
432
|
+
context=context,
|
|
433
|
+
replication_config=replication_config,
|
|
434
|
+
dagster_sling_translator=dagster_sling_translator,
|
|
435
|
+
env=env,
|
|
436
|
+
debug=debug,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
else:
|
|
440
|
+
#### New method use sling _exec_cmd to stream log from sling to dagster log
|
|
441
|
+
generator = self._stream_sling_replicate(
|
|
442
|
+
context=context,
|
|
443
|
+
replication_config=replication_config,
|
|
444
|
+
dagster_sling_translator=dagster_sling_translator,
|
|
445
|
+
env=env,
|
|
446
|
+
debug=debug,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
yield from generator
|
|
450
|
+
|
|
451
|
+
def _batch_sling_replicate(
|
|
452
|
+
self,
|
|
453
|
+
context: Union[OpExecutionContext, AssetExecutionContext],
|
|
454
|
+
replication_config: dict[str, Any],
|
|
455
|
+
dagster_sling_translator: DagsterSlingTranslator,
|
|
456
|
+
env: dict,
|
|
457
|
+
debug: bool,
|
|
458
|
+
) -> Generator[Union[MaterializeResult, AssetMaterialization], None, None]:
|
|
459
|
+
"""Underlying function to run replication and fetch metadata in batch mode."""
|
|
360
460
|
# convert to dict to enable updating the index
|
|
361
461
|
context_streams = self._get_replication_streams_for_context(context)
|
|
462
|
+
|
|
362
463
|
if context_streams:
|
|
363
464
|
replication_config.update({"streams": context_streams})
|
|
364
465
|
stream_definitions = get_streams_from_replication(replication_config)
|
|
@@ -366,61 +467,163 @@ class SlingResource(ConfigurableResource):
|
|
|
366
467
|
# extract the destination name from the replication config
|
|
367
468
|
destination_name = replication_config.get("target")
|
|
368
469
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
|
|
373
|
-
env = os.environ.copy()
|
|
470
|
+
uid = uuid.uuid4()
|
|
471
|
+
temp_dir = tempfile.gettempdir()
|
|
472
|
+
temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
|
|
374
473
|
|
|
375
|
-
|
|
376
|
-
|
|
474
|
+
with open(temp_file, "w") as file:
|
|
475
|
+
json.dump(replication_config, file, cls=sling.JsonEncoder)
|
|
377
476
|
|
|
378
|
-
|
|
477
|
+
logger.debug(f"Replication config: {replication_config}")
|
|
379
478
|
|
|
380
|
-
|
|
479
|
+
debug_str = "-d" if debug else ""
|
|
381
480
|
|
|
382
|
-
|
|
481
|
+
cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"
|
|
383
482
|
|
|
384
|
-
|
|
483
|
+
logger.debug(f"Running Sling replication with command: {cmd}")
|
|
385
484
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
results = sling._run( # noqa
|
|
389
|
-
cmd=cmd,
|
|
390
|
-
temp_file=temp_file,
|
|
391
|
-
return_output=True,
|
|
392
|
-
env=env,
|
|
393
|
-
)
|
|
394
|
-
for row in results.split("\n"):
|
|
395
|
-
clean_line = self._clean_line(row)
|
|
396
|
-
sys.stdout.write(clean_line + "\n")
|
|
397
|
-
self._stdout.append(clean_line)
|
|
485
|
+
# Get start time from wall clock
|
|
486
|
+
start_time = time.time()
|
|
398
487
|
|
|
399
|
-
|
|
488
|
+
results = sling._run( # noqa
|
|
489
|
+
cmd=cmd,
|
|
490
|
+
temp_file=temp_file,
|
|
491
|
+
return_output=True,
|
|
492
|
+
env=env,
|
|
493
|
+
)
|
|
400
494
|
|
|
401
|
-
|
|
402
|
-
# rather than waiting until the end of the replication
|
|
403
|
-
for stream in stream_definitions:
|
|
404
|
-
asset_key = dagster_sling_translator.get_asset_spec(stream).key
|
|
495
|
+
end_time = time.time()
|
|
405
496
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
497
|
+
for row in results.split("\n"):
|
|
498
|
+
clean_line = self._clean_line(row)
|
|
499
|
+
logger.debug(clean_line + "\n")
|
|
500
|
+
self._stdout.append(clean_line)
|
|
501
|
+
|
|
502
|
+
for stream_definition in stream_definitions:
|
|
503
|
+
asset_key = dagster_sling_translator.get_asset_spec(stream_definition).key
|
|
504
|
+
|
|
505
|
+
object_key = (stream_definition.get("config") or {}).get("object")
|
|
506
|
+
destination_stream_name = object_key or stream_definition["name"]
|
|
507
|
+
table_name = None
|
|
508
|
+
if destination_name and destination_stream_name:
|
|
509
|
+
table_name = ".".join([destination_name, destination_stream_name])
|
|
510
|
+
|
|
511
|
+
metadata = {
|
|
512
|
+
"elapsed_time": end_time - start_time,
|
|
513
|
+
"stream_name": stream_definition["name"],
|
|
514
|
+
**TableMetadataSet(
|
|
515
|
+
table_name=table_name,
|
|
516
|
+
),
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
if context.has_assets_def:
|
|
520
|
+
yield MaterializeResult(asset_key=asset_key, metadata=metadata)
|
|
521
|
+
else:
|
|
522
|
+
yield AssetMaterialization(asset_key=asset_key, metadata=metadata)
|
|
523
|
+
|
|
524
|
+
def _stream_sling_replicate(
    self,
    context: Union[OpExecutionContext, AssetExecutionContext],
    replication_config: dict[str, Any],
    dagster_sling_translator: DagsterSlingTranslator,
    env: dict,
    debug: bool,
) -> Generator[Union[MaterializeResult, AssetMaterialization], None, None]:
    """Run a replication in stream mode, materializing each stream as it finishes.

    The replication config is written to a temporary JSON file, the Sling CLI is run on
    it through ``sling._exec_cmd``, and every non-empty output line is logged. Log lines
    are scanned for the start of a stream and for "execution succeeded"; when a stream
    succeeds, metadata is parsed from the lines collected so far and one event is yielded.

    Args:
        context: Execution context; selects the streams to run and decides whether a
            ``MaterializeResult`` or an ``AssetMaterialization`` is yielded.
        replication_config: Replication config. NOTE: updated in place with the streams
            selected for ``context`` when there are any.
        dagster_sling_translator: Translator used to derive the asset key of a stream.
        env: Environment variables passed to the Sling CLI.
        debug: When true, passes ``-d`` to the CLI and logs stream-matching details.

    Yields:
        One ``MaterializeResult`` (when ``context.has_assets_def``) or
        ``AssetMaterialization`` per stream whose execution succeeded.
    """
    # State carried across log lines while a stream is in flight.
    current_stream = None
    asset_key = None
    metadata_text = []

    context_streams = self._get_replication_streams_for_context(context)
    if context_streams:
        replication_config.update({"streams": context_streams})

    uid = uuid.uuid4()
    temp_dir = tempfile.gettempdir()
    temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")

    try:
        with open(temp_file, "w") as file:
            json.dump(replication_config, file, cls=sling.JsonEncoder)

        logger.debug(f"Replication config: {replication_config}")

        debug_str = "-d" if debug else ""
        cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"
        logger.debug(f"Running Sling replication with command: {cmd}")

        # Wall-clock start; reset after every completed stream so that elapsed_time is
        # measured per stream.
        start_time = time.time()

        for line in sling._exec_cmd(cmd, env=env):  # noqa
            if line == "":
                continue
            text = self._clean_timestamp_log(line)
            logger.info(text)

            if current_stream is None:
                # Look for the start of a stream: first the "stream <name>" form, then
                # the single-replication header form.
                matched = re.findall("stream (.*)$", text) or re.findall(
                    r"Sling Replication [|] .* [|] (\S*)$", text
                )
                if matched:
                    current_stream = matched[0]
                    current_config = replication_config.get("streams", {}).get(
                        current_stream, {}
                    )
                    asset_key = dagster_sling_translator.get_asset_spec(
                        {"name": current_stream, "config": current_config}
                    ).key
                    if debug:
                        logger.debug(current_stream)
                        logger.debug(current_config)
                        logger.debug(asset_key)
                elif debug:
                    # Normal for a few lines; many in a row may mean the CLI uses a log
                    # pattern that is not recognized here.
                    logger.debug("no match stream name")
            elif re.findall("execution succeeded", text):
                # The current stream finished: parse its metadata and materialize it.
                metadata = self._query_metadata("\n".join(metadata_text), start_time=start_time)
                start_time = time.time()
                metadata["stream_name"] = current_stream
                logger.debug(metadata)
                if context.has_assets_def:
                    yield MaterializeResult(asset_key=asset_key, metadata=metadata)
                else:
                    yield AssetMaterialization(asset_key=asset_key, metadata=metadata)

                current_stream = None
                metadata_text = []

            metadata_text.append(text)
    finally:
        # Remove the temporary config even when the CLI raises or the consumer stops
        # iterating early; previously it was only removed after a fully successful run.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
|
|
424
627
|
|
|
425
628
|
def stream_raw_logs(self) -> Generator[str, None, None]:
|
|
426
629
|
"""Returns a generator of raw logs from the Sling CLI."""
|
dagster_sling/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.26.
|
|
1
|
+
__version__ = "0.26.6rc0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dagster-sling
|
|
3
|
-
Version: 0.26.5
|
|
3
|
+
Version: 0.26.6rc0
|
|
4
4
|
Summary: Package for performing ETL/ELT tasks with Sling in Dagster.
|
|
5
5
|
Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-sling
|
|
6
6
|
Author: Dagster Labs
|
|
@@ -14,7 +14,7 @@ Classifier: License :: OSI Approved :: Apache Software License
|
|
|
14
14
|
Classifier: Operating System :: OS Independent
|
|
15
15
|
Requires-Python: >=3.9,<3.13
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: dagster ==1.10.
|
|
17
|
+
Requires-Dist: dagster ==1.10.6rc0
|
|
18
18
|
Requires-Dist: sling >=1.1.5
|
|
19
19
|
Requires-Dist: sling-mac-arm64 ; platform_system == "Darwin" and platform_machine == "arm64"
|
|
20
20
|
Provides-Extra: test
|
|
@@ -3,12 +3,12 @@ dagster_sling/asset_decorator.py,sha256=GUr2_azLV4lzu68IkObhq4NjOxr4LSd7vJW0Ax-2
|
|
|
3
3
|
dagster_sling/asset_defs.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
dagster_sling/dagster_sling_translator.py,sha256=66brFssMpkEccvW0AT_juakKglJLJZmYAs1QjOR5k-U,23921
|
|
5
5
|
dagster_sling/py.typed,sha256=la67KBlbjXN-_-DfGNcdOcjYumVpKG_Tkw-8n5dnGB4,8
|
|
6
|
-
dagster_sling/resources.py,sha256=
|
|
6
|
+
dagster_sling/resources.py,sha256=h_OwjnJSOeq_Ykd0h2iJ-NYhQDbRCo0NaWOz2yLL0jw,25794
|
|
7
7
|
dagster_sling/sling_event_iterator.py,sha256=rVJwngxZATM_dbBcZ9W0EM9N29i8vF9h4l8Y6YNJ6h8,8456
|
|
8
8
|
dagster_sling/sling_replication.py,sha256=TFaJsi0C4yUWtYgCG2N4JnAkTdmYXE1ewoaGuqOCNTk,1111
|
|
9
|
-
dagster_sling/version.py,sha256=
|
|
10
|
-
dagster_sling-0.26.
|
|
11
|
-
dagster_sling-0.26.
|
|
12
|
-
dagster_sling-0.26.
|
|
13
|
-
dagster_sling-0.26.
|
|
14
|
-
dagster_sling-0.26.
|
|
9
|
+
dagster_sling/version.py,sha256=RNuIp4ediZH4C3GxBWeJYLDOclzjRe4kdN3UHYJqNYo,26
|
|
10
|
+
dagster_sling-0.26.6rc0.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
|
|
11
|
+
dagster_sling-0.26.6rc0.dist-info/METADATA,sha256=QgQnZAunSQ91S980q50iZ-vZYknsd-zcUcqVXsnuR5g,890
|
|
12
|
+
dagster_sling-0.26.6rc0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
|
13
|
+
dagster_sling-0.26.6rc0.dist-info/top_level.txt,sha256=eoJKEGsD6fqIEmF6xaF8tj5Kq9a7riWyRHbZn6oHTk8,14
|
|
14
|
+
dagster_sling-0.26.6rc0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|