datapipelab 0.3.0__tar.gz → 0.3.2__tar.gz

This diff compares publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (43)
  1. {datapipelab-0.3.0 → datapipelab-0.3.2}/PKG-INFO +1 -1
  2. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/bigquery_spark_node.py +2 -0
  3. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/spark_node.py +2 -0
  4. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/csv_node.py +1 -1
  5. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/delta_node.py +2 -2
  6. datapipelab-0.3.2/datapipelab/app/node/sink/spark_api_node.py +38 -0
  7. datapipelab-0.3.2/datapipelab/app/node/source/spark_api_node.py +41 -0
  8. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/source/spark_node.py +2 -0
  9. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/engine.py +1 -1
  10. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/pipeline_handler.py +8 -0
  11. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/PKG-INFO +1 -1
  12. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/SOURCES.txt +2 -0
  13. {datapipelab-0.3.0 → datapipelab-0.3.2}/setup.py +1 -1
  14. {datapipelab-0.3.0 → datapipelab-0.3.2}/MANIFEST.in +0 -0
  15. {datapipelab-0.3.0 → datapipelab-0.3.2}/README.md +0 -0
  16. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/__init__.py +0 -0
  17. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/__init__.py +0 -0
  18. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/connector_node/__init__.py +0 -0
  19. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/__init__.py +0 -0
  20. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/custom_node.py +0 -0
  21. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/__init__.py +0 -0
  22. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/bigquery_api_node.py +0 -0
  23. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/custom_node.py +0 -0
  24. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/gcp_bucket_node.py +0 -0
  25. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/shell_node.py +0 -0
  26. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/__init__.py +0 -0
  27. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/hive_node.py +0 -0
  28. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/pandas_csv_node.py +0 -0
  29. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/spark_node.py +0 -0
  30. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/teams_notification_node.py +0 -0
  31. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/source/__init__.py +0 -0
  32. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/source/delta_node.py +0 -0
  33. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/source/hive_node.py +0 -0
  34. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/tnode.py +0 -0
  35. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/wrapper/__init__.py +0 -0
  36. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/wrapper/source/__init__.py +0 -0
  37. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/logger.py +0 -0
  38. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/pipeline.py +0 -0
  39. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/pipeline_config.py +0 -0
  40. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/dependency_links.txt +0 -0
  41. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/requires.txt +0 -0
  42. {datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/top_level.txt +0 -0
  43. {datapipelab-0.3.0 → datapipelab-0.3.2}/setup.cfg +0 -0
{datapipelab-0.3.0 → datapipelab-0.3.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datapipelab
-Version: 0.3.0
+Version: 0.3.2
 Summary: A data pipeline library with connectors, sources, processors, and sinks.
 Requires-Dist: json5
 Requires-Dist: loguru
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/bigquery_spark_node.py
@@ -1,3 +1,5 @@
+# This node should be merged with source/spark_api_node.py
+
 from datapipelab.app.node.tnode import TNode
 from datapipelab.logger import logger
 
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/processor/spark_node.py
@@ -1,3 +1,5 @@
+# This node should be renamed to spark_sql_node
+
 from datapipelab.app.node.tnode import TNode
 
 
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/csv_node.py
@@ -1,5 +1,5 @@
 from datapipelab.app.node.tnode import TNode
-
+from datapipelab.logger import logger
 
 class CSVSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/sink/delta_node.py
@@ -1,10 +1,10 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
-
+from datapipelab.logger import logger
 
 class DeltaSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
         from delta.tables import DeltaTable
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.mode = tnode_config['options']['mode']  # Can be 'append', 'overwrite', or 'upsert'
         self.partition_by = tnode_config['options'].get('partition_by')
datapipelab-0.3.2/datapipelab/app/node/sink/spark_api_node.py
@@ -0,0 +1,38 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+
+class SparkApiSourceNode(TNode):
+    def __init__(self, spark, tnode_config, df):
+        from pyspark.sql import DataFrame
+        super().__init__(spark=spark)
+        self.df = df
+        self.__load_options(tnode_config)
+
+    def __load_options(self, tnode_config):
+        self.spark_options = tnode_config.get('options', {})
+        self.options = {}
+        if 'format' in self.spark_options:
+            self.format = self.spark_options.get('format')
+        if 'mode' in self.spark_options:
+            self.mode = self.spark_options.get('mode')
+        if 'parent_project' in self.spark_options:
+            self.options['parentProject'] = self.spark_options.get('parent_project')
+        if 'table' in self.spark_options:
+            self.options['table'] = self.spark_options.get('table')
+        if 'write_method' in self.spark_options:
+            self.options['writeMethod'] = self.spark_options.get('write_method')
+
+    def __write_df(self):
+        writer = self.df.write
+        if self.format:
+            writer = writer.format(self.format)
+        for key, value in self.options.items():
+            if value:
+                writer = writer.option(key, value)
+        if self.mode:
+            writer = writer.mode(self.mode)
+        writer.save()
+
+    def _process(self):
+        self.__write_df()
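The sink's options follow a simple snake_case-to-camelCase mapping onto Spark writer options. For reference, a minimal sketch of the options block this sink expects; the keys are taken from __load_options above, the values are purely illustrative, and the 'bigquery' format plus 'direct' write method assume the Spark BigQuery connector is available:

# Hypothetical sink tnode_config; keys mirror __load_options, values are placeholders.
tnode_config = {
    'options': {
        'format': 'bigquery',                # -> writer.format('bigquery')
        'mode': 'overwrite',                 # -> writer.mode('overwrite')
        'parent_project': 'my-gcp-project',  # -> writer.option('parentProject', ...)
        'table': 'my_dataset.my_table',      # -> writer.option('table', ...)
        'write_method': 'direct',            # -> writer.option('writeMethod', ...)
    }
}

Note that self.format and self.mode are assigned only when the corresponding keys are present, while __write_df reads both unconditionally, so as written the node expects 'format' and 'mode' to be supplied.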
datapipelab-0.3.2/datapipelab/app/node/source/spark_api_node.py
@@ -0,0 +1,41 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+
+class SparkApiSourceNode(TNode):
+    def __init__(self, spark, tnode_config):
+        super().__init__(spark=spark)
+        self.__load_options(tnode_config)
+
+
+    def __load_options(self, tnode_config):
+        self.spark_options = tnode_config.get('options', {})
+        self.options = {}
+        if 'format' in self.spark_options:
+            self.format = self.spark_options.get('format')
+        if 'query' in self.spark_options:
+            self.query = self.spark_options.get('query')
+        if 'materialization_dataset' in self.spark_options:
+            self.options['materializationDataset'] = self.spark_options.get('materialization_dataset')
+        if 'parent_project' in self.spark_options:
+            self.options['parentProject'] = self.spark_options.get('parent_project')
+        if 'table' in self.spark_options:
+            self.options['table'] = self.spark_options.get('table')
+        if 'path' in self.spark_options:
+            self.options['path'] = self.spark_options.get('path')
+
+
+
+    def __load_df(self):
+        reader = self.spark.read
+        if self.format:
+            reader = reader.format(self.format)
+        for key, value in self.options.items():
+            if value:
+                reader = reader.option(key, value)
+        self.node = reader.load()
+
+    def _process(self):
+        self.__load_df()
+        self._createOrReplaceTempView()
+        return self.node
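A matching sketch for the source side, again with keys taken from __load_options and purely illustrative values; reading BigQuery this way assumes the Spark BigQuery connector is available:

# Hypothetical source tnode_config; keys mirror __load_options, values are placeholders.
tnode_config = {
    'options': {
        'format': 'bigquery',                      # -> reader.format('bigquery')
        'parent_project': 'my-gcp-project',        # -> reader.option('parentProject', ...)
        'table': 'my_dataset.my_table',            # -> reader.option('table', ...)
        'materialization_dataset': 'tmp_dataset',  # -> reader.option('materializationDataset', ...)
    }
}

Two details worth noting: 'query' is captured on self.query but never applied in __load_df, and self.format is assigned only when 'format' is present while __load_df reads it unconditionally.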
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/app/node/source/spark_node.py
@@ -1,3 +1,5 @@
+# This node should be renamed to spark_sql_node.py
+
 from datapipelab.app.node.tnode import TNode
 
 
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/engine.py
@@ -3,7 +3,7 @@ from datapipelab.logger import logger
 
 
 class Engine:
-    def __init__(self, engine_config_path, spark, params=None):
+    def __init__(self, engine_config_path, spark=None, params=None):
         self.engine_config_path = engine_config_path
         self.params = params
         self.pipeline = None
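Since spark now defaults to None, an Engine can be constructed without a SparkSession. A minimal usage sketch; the config path and params here are hypothetical, with json5 assumed from the package's Requires-Dist:

from datapipelab.engine import Engine

# Hypothetical pipeline config path; spark may now be omitted entirely.
engine = Engine('pipeline_config.json5', params={'run_date': '2024-01-01'})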
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab/pipeline_handler.py
@@ -3,6 +3,7 @@ from datapipelab.app.node.processor.shell_node import ShellProcessorNode
 from datapipelab.app.node.source.hive_node import HiveSourceNode
 from datapipelab.app.node.source.spark_node import SparkSourceNode
 from datapipelab.app.node.source.delta_node import DeltaSourceNode
+from datapipelab.app.node.source.spark_api_node import SparkApiSourceNode
 from datapipelab.app.node.processor.spark_node import SparkProcessorNode
 from datapipelab.app.node.sink.delta_node import DeltaSinkNode
 from datapipelab.app.node.sink.csv_node import CSVSinkNode
@@ -43,6 +44,9 @@ class PipelineHandler:
             source_df = DeltaSourceNode(self.spark, tnode_config).run()
         if input_type == 'custom':
             source_df = CustomNode(self.spark, tnode_config).run()
+        if input_type == 'spark':
+            if input_format == 'api':
+                source_df = SparkApiSourceNode(self.spark, tnode_config).run()
 
         return source_df
 
@@ -96,6 +100,10 @@ class PipelineHandler:
            if tnode_format == 'spark':
                from datapipelab.app.node.sink import spark_node
                processor_df = spark_node.SparkSinkNode(self.spark, tnode_config, t_df[tnode_name_df]).run()
+        if tnode_type == 'spark':
+            if tnode_format == 'api':
+                from datapipelab.app.node.sink import spark_api_node
+                processor_df = spark_api_node.SparkApiSourceNode(self.spark, tnode_config, t_df[tnode_name_df]).run()
 
 
 
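Together with the source-side dispatch above, a node with type 'spark' and format 'api' now routes to the new spark_api modules. A hedged sketch of what such config entries might look like; the field names mirror input_type/input_format and tnode_type/tnode_format in the handler, but the exact config schema is not shown in this diff:

# Hypothetical config entries; 'options' as sketched after the new node files above.
source_entry = {'type': 'spark', 'format': 'api', 'options': {}}
sink_entry = {'type': 'spark', 'format': 'api', 'options': {}}

Note that both the source and sink modules define a class named SparkApiSourceNode; the sink dispatch here instantiates the sink-side class under that name.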
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datapipelab
-Version: 0.3.0
+Version: 0.3.2
 Summary: A data pipeline library with connectors, sources, processors, and sinks.
 Requires-Dist: json5
 Requires-Dist: loguru
{datapipelab-0.3.0 → datapipelab-0.3.2}/datapipelab.egg-info/SOURCES.txt
@@ -29,11 +29,13 @@ datapipelab/app/node/sink/csv_node.py
 datapipelab/app/node/sink/delta_node.py
 datapipelab/app/node/sink/hive_node.py
 datapipelab/app/node/sink/pandas_csv_node.py
+datapipelab/app/node/sink/spark_api_node.py
 datapipelab/app/node/sink/spark_node.py
 datapipelab/app/node/sink/teams_notification_node.py
 datapipelab/app/node/source/__init__.py
 datapipelab/app/node/source/delta_node.py
 datapipelab/app/node/source/hive_node.py
+datapipelab/app/node/source/spark_api_node.py
 datapipelab/app/node/source/spark_node.py
 datapipelab/app/wrapper/__init__.py
 datapipelab/app/wrapper/source/__init__.py
{datapipelab-0.3.0 → datapipelab-0.3.2}/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='datapipelab',
-    version='0.3.0',
+    version='0.3.2',
     description='A data pipeline library with connectors, sources, processors, and sinks.',
     packages=find_packages(),
     include_package_data=True,