datapipelab 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # This node should be merged with source/spark_api_node.py
2
+
1
3
  from datapipelab.app.node.tnode import TNode
2
4
  from datapipelab.logger import logger
3
5
 
@@ -1,3 +1,5 @@
1
+ # This node should be renamed to spark_sql_node
2
+
1
3
  from datapipelab.app.node.tnode import TNode
2
4
 
3
5
 
@@ -1,5 +1,5 @@
1
1
  from datapipelab.app.node.tnode import TNode
2
-
2
+ from datapipelab.logger import logger
3
3
 
4
4
  class CSVSinkNode(TNode):
5
5
  def __init__(self, spark, tnode_config, t_df):
@@ -1,10 +1,10 @@
1
- from pyspark.sql import DataFrame
2
1
  from datapipelab.app.node.tnode import TNode
3
-
2
+ from datapipelab.logger import logger
4
3
 
5
4
  class DeltaSinkNode(TNode):
6
5
  def __init__(self, spark, tnode_config, t_df):
7
6
  from delta.tables import DeltaTable
7
+ from pyspark.sql import DataFrame
8
8
  super().__init__(spark=spark)
9
9
  self.mode = tnode_config['options']['mode'] # Can be 'append', 'overwrite', or 'upsert'
10
10
  self.partition_by = tnode_config['options'].get('partition_by')
@@ -0,0 +1,38 @@
1
from datapipelab.app.node.tnode import TNode
from datapipelab.logger import logger


# NOTE(review): this class lives in the sink package and writes a DataFrame,
# yet is named SparkApiSourceNode. The name is kept because pipeline_handler
# imports it under this identifier; it should eventually become SparkApiSinkNode.
class SparkApiSourceNode(TNode):
    """Sink node that writes a Spark DataFrame via the DataFrameWriter API.

    Recognized keys under ``tnode_config['options']`` (all optional):
        format         -> writer format (e.g. 'bigquery', 'parquet')
        mode           -> save mode passed to ``DataFrameWriter.mode``
        parent_project -> mapped to writer option 'parentProject'
        table          -> mapped to writer option 'table'
        write_method   -> mapped to writer option 'writeMethod'
    """

    def __init__(self, spark, tnode_config, df):
        super().__init__(spark=spark)
        self.df = df
        self.__load_options(tnode_config)

    def __load_options(self, tnode_config):
        """Parse writer settings from the node config.

        BUG FIX: the original only assigned ``self.format`` / ``self.mode``
        when the key was present, so ``__write_df`` raised AttributeError for
        any config omitting them. Default both to None so the truthiness
        checks in ``__write_df`` are always safe.
        """
        self.spark_options = tnode_config.get('options', {})
        self.format = self.spark_options.get('format')
        self.mode = self.spark_options.get('mode')
        self.options = {}
        # Map snake_case config keys to the camelCase Spark writer options.
        key_map = {
            'parent_project': 'parentProject',
            'table': 'table',
            'write_method': 'writeMethod',
        }
        for cfg_key, option_key in key_map.items():
            if cfg_key in self.spark_options:
                self.options[option_key] = self.spark_options.get(cfg_key)

    def __write_df(self):
        """Build a DataFrameWriter from the parsed options and save."""
        writer = self.df.write
        if self.format:
            writer = writer.format(self.format)
        for key, value in self.options.items():
            # Only forward truthy option values, as the original did.
            if value:
                writer = writer.option(key, value)
        if self.mode:
            writer = writer.mode(self.mode)
        writer.save()

    def _process(self):
        # Side-effect only: writes the DataFrame; returns None.
        self.__write_df()
@@ -0,0 +1,41 @@
1
from datapipelab.app.node.tnode import TNode
from datapipelab.logger import logger


class SparkApiSourceNode(TNode):
    """Source node that loads a DataFrame via the Spark DataFrameReader API.

    Recognized keys under ``tnode_config['options']`` (all optional):
        format                  -> reader format (e.g. 'bigquery')
        query                   -> SQL query string (currently unused, see note)
        materialization_dataset -> mapped to reader option 'materializationDataset'
        parent_project          -> mapped to reader option 'parentProject'
        table                   -> mapped to reader option 'table'
        path                    -> mapped to reader option 'path'
    """

    def __init__(self, spark, tnode_config):
        super().__init__(spark=spark)
        self.__load_options(tnode_config)

    def __load_options(self, tnode_config):
        """Parse reader settings from the node config.

        BUG FIX: the original only assigned ``self.format`` / ``self.query``
        when the key was present, so ``__load_df`` raised AttributeError for
        any config omitting 'format'. Default both to None so the truthiness
        check is always safe.
        """
        self.spark_options = tnode_config.get('options', {})
        self.format = self.spark_options.get('format')
        # NOTE(review): self.query is captured but never applied to the
        # reader in __load_df — confirm whether it should be passed as the
        # 'query' reader option (e.g. for the BigQuery connector).
        self.query = self.spark_options.get('query')
        self.options = {}
        # Map snake_case config keys to the camelCase Spark reader options.
        key_map = {
            'materialization_dataset': 'materializationDataset',
            'parent_project': 'parentProject',
            'table': 'table',
            'path': 'path',
        }
        for cfg_key, option_key in key_map.items():
            if cfg_key in self.spark_options:
                self.options[option_key] = self.spark_options.get(cfg_key)

    def __load_df(self):
        """Build a DataFrameReader from the parsed options and load."""
        reader = self.spark.read
        if self.format:
            reader = reader.format(self.format)
        for key, value in self.options.items():
            # Only forward truthy option values, as the original did.
            if value:
                reader = reader.option(key, value)
        self.node = reader.load()

    def _process(self):
        self.__load_df()
        # Register the loaded DataFrame as a temp view (TNode helper).
        self._createOrReplaceTempView()
        return self.node
@@ -1,3 +1,5 @@
1
+ # This node should be renamed to spark_sql_node.py
2
+
1
3
  from datapipelab.app.node.tnode import TNode
2
4
 
3
5
 
datapipelab/engine.py CHANGED
@@ -3,7 +3,7 @@ from datapipelab.logger import logger
3
3
 
4
4
 
5
5
  class Engine:
6
- def __init__(self, engine_config_path, spark, params=None):
6
+ def __init__(self, engine_config_path, spark=None, params=None):
7
7
  self.engine_config_path = engine_config_path
8
8
  self.params = params
9
9
  self.pipeline = None
@@ -3,6 +3,7 @@ from datapipelab.app.node.processor.shell_node import ShellProcessorNode
3
3
  from datapipelab.app.node.source.hive_node import HiveSourceNode
4
4
  from datapipelab.app.node.source.spark_node import SparkSourceNode
5
5
  from datapipelab.app.node.source.delta_node import DeltaSourceNode
6
+ from datapipelab.app.node.source.spark_api_node import SparkApiSourceNode
6
7
  from datapipelab.app.node.processor.spark_node import SparkProcessorNode
7
8
  from datapipelab.app.node.sink.delta_node import DeltaSinkNode
8
9
  from datapipelab.app.node.sink.csv_node import CSVSinkNode
@@ -43,6 +44,9 @@ class PipelineHandler:
43
44
  source_df = DeltaSourceNode(self.spark, tnode_config).run()
44
45
  if input_type == 'custom':
45
46
  source_df = CustomNode(self.spark, tnode_config).run()
47
+ if input_type == 'spark':
48
+ if input_format == 'api':
49
+ source_df = SparkApiSourceNode(self.spark, tnode_config).run()
46
50
 
47
51
  return source_df
48
52
 
@@ -96,6 +100,10 @@ class PipelineHandler:
96
100
  if tnode_format == 'spark':
97
101
  from datapipelab.app.node.sink import spark_node
98
102
  processor_df = spark_node.SparkSinkNode(self.spark, tnode_config, t_df[tnode_name_df]).run()
103
+ if tnode_type == 'spark':
104
+ if tnode_format == 'api':
105
+ from datapipelab.app.node.sink import spark_api_node
106
+ processor_df = spark_api_node.SparkApiSourceNode(self.spark, tnode_config, t_df[tnode_name_df]).run()
99
107
 
100
108
 
101
109
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datapipelab
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: A data pipeline library with connectors, sources, processors, and sinks.
5
5
  Requires-Dist: json5
6
6
  Requires-Dist: loguru
@@ -1,9 +1,9 @@
1
1
  datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- datapipelab/engine.py,sha256=3QRsedRYNov6xIDOZ1tukinFE-SKv39Fn3sNCnD3L6g,442
2
+ datapipelab/engine.py,sha256=Dt8oM7RvIMkllPhFpUr1fynJD01ZG-hr6eqt5OSRh-Y,447
3
3
  datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
4
4
  datapipelab/pipeline.py,sha256=dw9D9KM_hztt9g_YzqoNgQBRyCYR92cRZwrU5duP_Pg,1464
5
5
  datapipelab/pipeline_config.py,sha256=2bFAJepViE7rT7CaRANZU07aeQpOYcZ954ISujm9pXA,3816
6
- datapipelab/pipeline_handler.py,sha256=Q1AzuPgOb9bElclfX-E8PiTGrdOKzFshQUjbNNXp5m0,4980
6
+ datapipelab/pipeline_handler.py,sha256=LrVhYFAPf1FVLlDBACmQu-cJkVX-X8r4eIavwxJlAGo,5464
7
7
  datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -11,25 +11,27 @@ datapipelab/app/node/custom_node.py,sha256=3Se4DweMvm5VK4MTZ-pQSQ_lE_fOm6cGj-wzc
11
11
  datapipelab/app/node/tnode.py,sha256=-2hnQkIuLwEy7xVTig54TByO7L2l7UujolXMQL0CQJA,484
12
12
  datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  datapipelab/app/node/processor/bigquery_api_node.py,sha256=IclDkGxo9ltGJVkBaHKFPFCSlEEyzefgalaAOLA17bE,1752
14
- datapipelab/app/node/processor/bigquery_spark_node.py,sha256=S9kIYW0RE5b0RjniKFFBTzA3Tx4_plFdkFQXzhl1xTY,1039
14
+ datapipelab/app/node/processor/bigquery_spark_node.py,sha256=pklpsqYqztidCIECkl3rpjfY6LiB0p4thvE7-PzBodE,1099
15
15
  datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
16
16
  datapipelab/app/node/processor/gcp_bucket_node.py,sha256=bzV2c89-g5S0OH5bcKKQ-9yKOGwlmOR7h7_5uO6Gnq0,1904
17
17
  datapipelab/app/node/processor/shell_node.py,sha256=s3dKgfEqbpUIEiwORERgvp7FNDE5JkFHBo7EnJYBPnA,669
18
- datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
18
+ datapipelab/app/node/processor/spark_node.py,sha256=ROSp_gpqHhtS4jog7z64jEcVPaCLFaELyIhb1A2UVe8,532
19
19
  datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- datapipelab/app/node/sink/csv_node.py,sha256=d2hyufP5_Nmql0pfD0KeC4rFu1wXTnBxVsoGl7sWbhM,1681
21
- datapipelab/app/node/sink/delta_node.py,sha256=iKEdiTjJ7SHJZMrbm0jR5tms5JZ5iCFfQklZbI-Yr2o,2044
20
+ datapipelab/app/node/sink/csv_node.py,sha256=ZcrMZXIwJ_ln4ZZbpCAT-iMDAZIDFI9eSHNENx4wMpA,1718
21
+ datapipelab/app/node/sink/delta_node.py,sha256=4ajvMyz3cpXbd29_mZq0MW-gwpLJqdj6F9urmP8uHJw,2089
22
22
  datapipelab/app/node/sink/hive_node.py,sha256=ycknOPBBwZGH3oHram_6LjHy-ygFjhuFNvVoPaNGaCU,1220
23
23
  datapipelab/app/node/sink/pandas_csv_node.py,sha256=JsJFt2XRpwxGeJyt_PDUgqZafiQROf1Sk5TUhQPxh4c,870
24
+ datapipelab/app/node/sink/spark_api_node.py,sha256=Uu25EtQEXJkqzm3eBEwvpuqSfVXpYUBbWRaybC_BoQQ,1406
24
25
  datapipelab/app/node/sink/spark_node.py,sha256=tP3tZae2jzQtAtfIm8C-166WWSLdZs54mqoIyZOSy58,1221
25
26
  datapipelab/app/node/sink/teams_notification_node.py,sha256=6ZufdbhVvRXi3QTQafLo5uKl9kLyDnkYIE_VZFT0QNw,3581
26
27
  datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
28
  datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
28
29
  datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
29
- datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
30
+ datapipelab/app/node/source/spark_api_node.py,sha256=HQQAkFpzIh_oA17aaQX6TVyfXtCdILGvzx4FIWXZyo8,1496
31
+ datapipelab/app/node/source/spark_node.py,sha256=TDfezmlk8Ts2YTGkB92-God_AyGVUslTUoevXolN7W8,532
30
32
  datapipelab/app/wrapper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
33
  datapipelab/app/wrapper/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
- datapipelab-0.3.0.dist-info/METADATA,sha256=uXwxvNNpO_FSDUi8rC5bxPNImGtLuZuSMUV9QDNptT0,220
33
- datapipelab-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
34
- datapipelab-0.3.0.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
35
- datapipelab-0.3.0.dist-info/RECORD,,
34
+ datapipelab-0.3.2.dist-info/METADATA,sha256=ztMAegNhlrEhJcmbHJAF7qFPMwuNvyf_CKEUlrmqsZQ,220
35
+ datapipelab-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
+ datapipelab-0.3.2.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
37
+ datapipelab-0.3.2.dist-info/RECORD,,