dagster-sling 0.26.5__py3-none-any.whl → 0.26.6rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-sling might be problematic. Click here for more details.

@@ -3,7 +3,6 @@ import json
3
3
  import os
4
4
  import re
5
5
  import subprocess
6
- import sys
7
6
  import tempfile
8
7
  import time
9
8
  import uuid
@@ -216,6 +215,53 @@ class SlingResource(ConfigurableResource):
216
215
  del d["connection_string"]
217
216
  return d
218
217
 
218
+ def _query_metadata(
219
+ self, metadata_string: str, start_time: float, base_metadata: Union[list, None] = None
220
+ ):
221
+ """Metadata quering using regular expression from standard sling log.
222
+
223
+ Args:
224
+ metadata_string (str): raw log string containing log/metadata from sling cli run
225
+ start_time (float): start time that will be assign to calculate elapse
226
+ base_metadata (list, Null): list of metadata to be query from string
227
+
228
+ Return:
229
+ final_dict (dict): Final metadata idct contain metadata query from string
230
+ """
231
+ if base_metadata is None:
232
+ base_metadata = [
233
+ "stream_name",
234
+ "row_count",
235
+ "destination_table",
236
+ "destination_file",
237
+ "elapsed_time",
238
+ ]
239
+
240
+ tmp = None
241
+ tmp_metadata = {}
242
+ end_time = time.time()
243
+ target_type = re.findall(r"writing to target ([\w\s]*) ", metadata_string)
244
+ if target_type and target_type[0] == "database":
245
+ tmp = re.findall(r"inserted ([0-9]*) rows .*into ([\w.:/;-_\"\'{}]*)", metadata_string)
246
+ elif target_type and target_type[0] == "file system":
247
+ tmp = re.findall(r"wrote ([0-9]*) rows .*to ([\w.:/;-_\"\'{}]*)", metadata_string)
248
+ else:
249
+ tmp = re.findall(r"inserted ([0-9]*) rows .*into ([\w.:/;-_\"\'{}]*)", metadata_string)
250
+
251
+ if tmp:
252
+ if target_type and target_type[0] == "database":
253
+ tmp_metadata["destination_table"] = re.sub(r"[^\w\s.]", "", tmp[0][1])
254
+ if target_type and target_type[0] == "file system":
255
+ tmp_metadata["destination_file"] = re.sub(r"[^\w\s.]", "", tmp[0][1])
256
+ tmp_metadata["elapsed_time"] = end_time - start_time
257
+ tmp_metadata["row_count"] = tmp[0][0]
258
+
259
+ final_dict = {}
260
+ for k in base_metadata:
261
+ if tmp_metadata.get(k):
262
+ final_dict[k] = tmp_metadata.get(k)
263
+ return final_dict
264
+
219
265
  def prepare_environment(self) -> dict[str, Any]:
220
266
  env = {}
221
267
 
@@ -236,6 +282,23 @@ class SlingResource(ConfigurableResource):
236
282
  """Removes ANSI escape sequences from a line of output."""
237
283
  return ANSI_ESCAPE.sub("", line).replace("INF", "")
238
284
 
285
+ def _clean_timestamp_log(self, line: str):
286
+ """Remove timestamp from log gather from sling cli to reduce redundency in dagster log.
287
+
288
+ Args:
289
+ line (str): line of log gather from cli to be cleaned
290
+
291
+ Returns:
292
+ text: cleaned log consist only of log data
293
+
294
+ """
295
+ tmp = self._clean_line(line)
296
+ try:
297
+ text = tmp.split(" ")[1]
298
+ except:
299
+ text = tmp
300
+ return text
301
+
239
302
  def _process_stdout(self, stdout: IO[AnyStr], encoding="utf8") -> Iterator[str]:
240
303
  """Process stdout from the Sling CLI."""
241
304
  for line in stdout:
@@ -315,6 +378,7 @@ class SlingResource(ConfigurableResource):
315
378
  replication_config: Optional[SlingReplicationParam] = None,
316
379
  dagster_sling_translator: Optional[DagsterSlingTranslator] = None,
317
380
  debug: bool = False,
381
+ stream: bool = False,
318
382
  ) -> SlingEventIterator[SlingEventType]:
319
383
  """Runs a Sling replication from the given replication config.
320
384
 
@@ -341,6 +405,7 @@ class SlingResource(ConfigurableResource):
341
405
  replication_config=replication_config_dict,
342
406
  dagster_sling_translator=dagster_sling_translator,
343
407
  debug=debug,
408
+ stream=stream,
344
409
  ),
345
410
  sling_cli=self,
346
411
  replication_config=replication_config_dict,
@@ -354,11 +419,47 @@ class SlingResource(ConfigurableResource):
354
419
  replication_config: dict[str, Any],
355
420
  dagster_sling_translator: DagsterSlingTranslator,
356
421
  debug: bool,
422
+ stream: bool = False,
357
423
  ) -> Iterator[SlingEventType]:
358
424
  # if translator has not been defined on metadata _or_ through param, then use the default constructor
359
425
 
426
+ with self._setup_config():
427
+ env = os.environ.copy()
428
+
429
+ if not stream:
430
+ ##### Old method use _run which is not streamable #####
431
+ generator = self._batch_sling_replicate(
432
+ context=context,
433
+ replication_config=replication_config,
434
+ dagster_sling_translator=dagster_sling_translator,
435
+ env=env,
436
+ debug=debug,
437
+ )
438
+
439
+ else:
440
+ #### New method use sling _exec_cmd to stream log from sling to dagster log
441
+ generator = self._stream_sling_replicate(
442
+ context=context,
443
+ replication_config=replication_config,
444
+ dagster_sling_translator=dagster_sling_translator,
445
+ env=env,
446
+ debug=debug,
447
+ )
448
+
449
+ yield from generator
450
+
451
+ def _batch_sling_replicate(
452
+ self,
453
+ context: Union[OpExecutionContext, AssetExecutionContext],
454
+ replication_config: dict[str, Any],
455
+ dagster_sling_translator: DagsterSlingTranslator,
456
+ env: dict,
457
+ debug: bool,
458
+ ) -> Generator[Union[MaterializeResult, AssetMaterialization], None, None]:
459
+ """Underlying function to run replication and fetch metadata in batch mode."""
360
460
  # convert to dict to enable updating the index
361
461
  context_streams = self._get_replication_streams_for_context(context)
462
+
362
463
  if context_streams:
363
464
  replication_config.update({"streams": context_streams})
364
465
  stream_definitions = get_streams_from_replication(replication_config)
@@ -366,61 +467,163 @@ class SlingResource(ConfigurableResource):
366
467
  # extract the destination name from the replication config
367
468
  destination_name = replication_config.get("target")
368
469
 
369
- with self._setup_config():
370
- uid = uuid.uuid4()
371
- temp_dir = tempfile.gettempdir()
372
- temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
373
- env = os.environ.copy()
470
+ uid = uuid.uuid4()
471
+ temp_dir = tempfile.gettempdir()
472
+ temp_file = os.path.join(temp_dir, f"sling-replication-{uid}.json")
374
473
 
375
- with open(temp_file, "w") as file:
376
- json.dump(replication_config, file, cls=sling.JsonEncoder)
474
+ with open(temp_file, "w") as file:
475
+ json.dump(replication_config, file, cls=sling.JsonEncoder)
377
476
 
378
- logger.debug(f"Replication config: {replication_config}")
477
+ logger.debug(f"Replication config: {replication_config}")
379
478
 
380
- debug_str = "-d" if debug else ""
479
+ debug_str = "-d" if debug else ""
381
480
 
382
- cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"
481
+ cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"
383
482
 
384
- logger.debug(f"Running Sling replication with command: {cmd}")
483
+ logger.debug(f"Running Sling replication with command: {cmd}")
385
484
 
386
- # Get start time from wall clock
387
- start_time = time.time()
388
- results = sling._run( # noqa
389
- cmd=cmd,
390
- temp_file=temp_file,
391
- return_output=True,
392
- env=env,
393
- )
394
- for row in results.split("\n"):
395
- clean_line = self._clean_line(row)
396
- sys.stdout.write(clean_line + "\n")
397
- self._stdout.append(clean_line)
485
+ # Get start time from wall clock
486
+ start_time = time.time()
398
487
 
399
- end_time = time.time()
488
+ results = sling._run( # noqa
489
+ cmd=cmd,
490
+ temp_file=temp_file,
491
+ return_output=True,
492
+ env=env,
493
+ )
400
494
 
401
- # TODO: In the future, it'd be nice to yield these materializations as they come in
402
- # rather than waiting until the end of the replication
403
- for stream in stream_definitions:
404
- asset_key = dagster_sling_translator.get_asset_spec(stream).key
495
+ end_time = time.time()
405
496
 
406
- object_key = (stream.get("config") or {}).get("object")
407
- destination_stream_name = object_key or stream["name"]
408
- table_name = None
409
- if destination_name and destination_stream_name:
410
- table_name = ".".join([destination_name, destination_stream_name])
411
-
412
- metadata = {
413
- "elapsed_time": end_time - start_time,
414
- "stream_name": stream["name"],
415
- **TableMetadataSet(
416
- table_name=table_name,
417
- ),
418
- }
497
+ for row in results.split("\n"):
498
+ clean_line = self._clean_line(row)
499
+ logger.debug(clean_line + "\n")
500
+ self._stdout.append(clean_line)
501
+
502
+ for stream_definition in stream_definitions:
503
+ asset_key = dagster_sling_translator.get_asset_spec(stream_definition).key
504
+
505
+ object_key = (stream_definition.get("config") or {}).get("object")
506
+ destination_stream_name = object_key or stream_definition["name"]
507
+ table_name = None
508
+ if destination_name and destination_stream_name:
509
+ table_name = ".".join([destination_name, destination_stream_name])
510
+
511
+ metadata = {
512
+ "elapsed_time": end_time - start_time,
513
+ "stream_name": stream_definition["name"],
514
+ **TableMetadataSet(
515
+ table_name=table_name,
516
+ ),
517
+ }
518
+
519
+ if context.has_assets_def:
520
+ yield MaterializeResult(asset_key=asset_key, metadata=metadata)
521
+ else:
522
+ yield AssetMaterialization(asset_key=asset_key, metadata=metadata)
523
+
524
def _stream_sling_replicate(
    self,
    context: Union[OpExecutionContext, AssetExecutionContext],
    replication_config: dict[str, Any],
    dagster_sling_translator: DagsterSlingTranslator,
    env: dict,
    debug: bool,
) -> Generator[Union[MaterializeResult, AssetMaterialization], None, None]:
    """Run a replication in stream mode, yielding a materialization per finished stream.

    The replication config is written to a temporary JSON file and passed to the Sling
    CLI through ``sling._exec_cmd``. Every non-empty output line is cleaned, logged at
    info level and scanned: a line naming a stream starts a new stream, and a later
    "execution succeeded" line closes it, at which point metadata is parsed from the
    lines collected so far and a materialization event is yielded.

    Args:
        context: Execution context; selects the streams to run and decides which event
            type is yielded (``MaterializeResult`` when ``has_assets_def`` is true,
            ``AssetMaterialization`` otherwise).
        replication_config: Replication config. It is updated in place when the context
            selects a subset of streams.
        dagster_sling_translator: Translator used to derive the asset key of each stream.
        env: Environment variables passed to the Sling CLI.
        debug: When true, run the CLI with ``-d`` and emit extra debug logging.

    Yields:
        One materialization event per stream reported as succeeded.
    """
    # State carried across log lines while a stream is in progress.
    current_stream = None
    asset_key = None
    metadata_text = []

    context_streams = self._get_replication_streams_for_context(context)
    if context_streams:
        replication_config.update({"streams": context_streams})

    uid = uuid.uuid4()
    temp_file = os.path.join(tempfile.gettempdir(), f"sling-replication-{uid}.json")

    try:
        with open(temp_file, "w") as file:
            json.dump(replication_config, file, cls=sling.JsonEncoder)

        logger.debug(f"Replication config: {replication_config}")

        debug_str = "-d" if debug else ""
        cmd = f"{sling.SLING_BIN} run {debug_str} -r {temp_file}"

        logger.debug(f"Running Sling replication with command: {cmd}")

        # Wall-clock start; reset after each stream so elapsed time is per stream.
        start_time = time.time()

        for line in sling._exec_cmd(cmd, env=env):  # noqa
            if line == "":
                continue
            text = self._clean_timestamp_log(line)
            logger.info(text)

            if current_stream is None:
                # Look for the multi-stream marker first, then the single-replication
                # header format.
                matched = re.findall("stream (.*)$", text) or re.findall(
                    r"Sling Replication [|] .* [|] (\S*)$", text
                )
                if matched:
                    current_stream = matched[0]
                    current_config = replication_config.get("streams", {}).get(
                        current_stream, {}
                    )
                    asset_key = dagster_sling_translator.get_asset_spec(
                        {"name": current_stream, "config": current_config}
                    ).key
                    if debug:
                        logger.debug(current_stream)
                        logger.debug(current_config)
                        logger.debug(asset_key)
                elif debug:
                    # Expected for a few lines; many in a row may indicate a log
                    # format that the patterns above do not cover.
                    logger.debug("no match stream name")
            elif re.findall("execution succeeded", text):
                # The current stream finished: parse its metadata and materialize it.
                metadata = self._query_metadata("\n".join(metadata_text), start_time=start_time)
                start_time = time.time()
                metadata["stream_name"] = current_stream
                logger.debug(metadata)
                if context.has_assets_def:
                    yield MaterializeResult(asset_key=asset_key, metadata=metadata)
                else:
                    yield AssetMaterialization(asset_key=asset_key, metadata=metadata)

                current_stream = None
                metadata_text = []

            metadata_text.append(text)
    finally:
        # Always remove the temporary config, including when the CLI raises or the
        # consumer stops iterating early. A missing file is ignored so that cleanup
        # never masks the original error.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
424
627
 
425
628
  def stream_raw_logs(self) -> Generator[str, None, None]:
426
629
  """Returns a generator of raw logs from the Sling CLI."""
dagster_sling/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.26.5"
1
+ __version__ = "0.26.6rc0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dagster-sling
3
- Version: 0.26.5
3
+ Version: 0.26.6rc0
4
4
  Summary: Package for performing ETL/ELT tasks with Sling in Dagster.
5
5
  Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-sling
6
6
  Author: Dagster Labs
@@ -14,7 +14,7 @@ Classifier: License :: OSI Approved :: Apache Software License
14
14
  Classifier: Operating System :: OS Independent
15
15
  Requires-Python: >=3.9,<3.13
16
16
  License-File: LICENSE
17
- Requires-Dist: dagster ==1.10.5
17
+ Requires-Dist: dagster ==1.10.6rc0
18
18
  Requires-Dist: sling >=1.1.5
19
19
  Requires-Dist: sling-mac-arm64 ; platform_system == "Darwin" and platform_machine == "arm64"
20
20
  Provides-Extra: test
@@ -3,12 +3,12 @@ dagster_sling/asset_decorator.py,sha256=GUr2_azLV4lzu68IkObhq4NjOxr4LSd7vJW0Ax-2
3
3
  dagster_sling/asset_defs.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  dagster_sling/dagster_sling_translator.py,sha256=66brFssMpkEccvW0AT_juakKglJLJZmYAs1QjOR5k-U,23921
5
5
  dagster_sling/py.typed,sha256=la67KBlbjXN-_-DfGNcdOcjYumVpKG_Tkw-8n5dnGB4,8
6
- dagster_sling/resources.py,sha256=AiFEmFXfimm_76ERyjxZmdosyZ5MPYTigCqOELLsLeg,17613
6
+ dagster_sling/resources.py,sha256=h_OwjnJSOeq_Ykd0h2iJ-NYhQDbRCo0NaWOz2yLL0jw,25794
7
7
  dagster_sling/sling_event_iterator.py,sha256=rVJwngxZATM_dbBcZ9W0EM9N29i8vF9h4l8Y6YNJ6h8,8456
8
8
  dagster_sling/sling_replication.py,sha256=TFaJsi0C4yUWtYgCG2N4JnAkTdmYXE1ewoaGuqOCNTk,1111
9
- dagster_sling/version.py,sha256=Q5QQA3kVN0Uxvuxo_emQJjjelCa7wzxU2n7kQ-gW-aM,23
10
- dagster_sling-0.26.5.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
11
- dagster_sling-0.26.5.dist-info/METADATA,sha256=PCVHJn-G9BmtbAhB2fp1hoqVumLZXXiCQsxn-5hNinc,884
12
- dagster_sling-0.26.5.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
13
- dagster_sling-0.26.5.dist-info/top_level.txt,sha256=eoJKEGsD6fqIEmF6xaF8tj5Kq9a7riWyRHbZn6oHTk8,14
14
- dagster_sling-0.26.5.dist-info/RECORD,,
9
+ dagster_sling/version.py,sha256=RNuIp4ediZH4C3GxBWeJYLDOclzjRe4kdN3UHYJqNYo,26
10
+ dagster_sling-0.26.6rc0.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
11
+ dagster_sling-0.26.6rc0.dist-info/METADATA,sha256=QgQnZAunSQ91S980q50iZ-vZYknsd-zcUcqVXsnuR5g,890
12
+ dagster_sling-0.26.6rc0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
13
+ dagster_sling-0.26.6rc0.dist-info/top_level.txt,sha256=eoJKEGsD6fqIEmF6xaF8tj5Kq9a7riWyRHbZn6oHTk8,14
14
+ dagster_sling-0.26.6rc0.dist-info/RECORD,,