datapipelab 0.1.4__tar.gz → 0.1.6__tar.gz

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (38)
  1. {datapipelab-0.1.4 → datapipelab-0.1.6}/PKG-INFO +1 -4
  2. datapipelab-0.1.6/datapipelab/app/node/custom_node.py +22 -0
  3. datapipelab-0.1.6/datapipelab/app/node/processor/bigquery_api_node.py +33 -0
  4. datapipelab-0.1.6/datapipelab/app/node/processor/bigquery_spark_node.py +30 -0
  5. datapipelab-0.1.6/datapipelab/app/node/processor/shell_node.py +23 -0
  6. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/csv_node.py +1 -1
  7. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/delta_node.py +1 -1
  8. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/hive_node.py +1 -1
  9. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/pandas_csv_node.py +1 -1
  10. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/teams_notification_node.py +3 -2
  11. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/engine.py +1 -1
  12. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/pipeline_config.py +11 -3
  13. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/pipeline_handler.py +8 -2
  14. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/PKG-INFO +1 -4
  15. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/SOURCES.txt +4 -0
  16. datapipelab-0.1.6/datapipelab.egg-info/requires.txt +2 -0
  17. {datapipelab-0.1.4 → datapipelab-0.1.6}/setup.py +4 -4
  18. datapipelab-0.1.4/datapipelab.egg-info/requires.txt +0 -5
  19. {datapipelab-0.1.4 → datapipelab-0.1.6}/MANIFEST.in +0 -0
  20. {datapipelab-0.1.4 → datapipelab-0.1.6}/README.md +0 -0
  21. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/__init__.py +0 -0
  22. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/__init__.py +0 -0
  23. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/connector_node/__init__.py +0 -0
  24. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/__init__.py +0 -0
  25. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/processor/__init__.py +0 -0
  26. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/processor/custom_node.py +0 -0
  27. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/processor/spark_node.py +0 -0
  28. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/__init__.py +0 -0
  29. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/source/__init__.py +0 -0
  30. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/source/delta_node.py +0 -0
  31. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/source/hive_node.py +0 -0
  32. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/source/spark_node.py +0 -0
  33. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/tnode.py +0 -0
  34. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/logger.py +0 -0
  35. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/pipeline.py +0 -0
  36. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/dependency_links.txt +0 -0
  37. {datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/top_level.txt +0 -0
  38. {datapipelab-0.1.4 → datapipelab-0.1.6}/setup.cfg +0 -0
{datapipelab-0.1.4 → datapipelab-0.1.6}/PKG-INFO
@@ -1,11 +1,8 @@
  Metadata-Version: 2.4
  Name: datapipelab
- Version: 0.1.4
+ Version: 0.1.6
  Summary: A data pipeline library with connectors, sources, processors, and sinks.
  Requires-Dist: json5
  Requires-Dist: loguru
- Requires-Dist: azure-storage-blob
- Requires-Dist: google-cloud-storage
- Requires-Dist: pandas
  Dynamic: requires-dist
  Dynamic: summary
datapipelab-0.1.6/datapipelab/app/node/custom_node.py
@@ -0,0 +1,22 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class CustomNode(TNode):
+     def __init__(self, spark, tnode_config, t_df):
+         super().__init__(spark=spark)
+         self.tnode_config = tnode_config
+         self.spark = spark
+         self.t_df = t_df
+         module_name = tnode_config['options']['module_name']
+         module_path = tnode_config['options']['module_path']
+         class_name = tnode_config['options']['class_name']
+         self.custom_processor = self.import_module(module_name, module_path, class_name)
+
+     def import_module(self, module_name, module_path, class_name):
+         custom_module = __import__(module_path, fromlist=[module_name])
+         custom_class = getattr(custom_module, class_name)
+         return custom_class(self.spark, self.tnode_config) # .create_instance(self.t_df)
+
+     def _process(self):
+         logger.info(f"Custom node type is {self.tnode_config.get('custom_node_type', 'N/A!')}")
+         return self.custom_processor.process()
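For orientation, a minimal sketch of how this node is wired (illustrative only; the module, class, and config values below are hypothetical, not part of the package): CustomNode reads options.module_path, options.module_name, and options.class_name, imports the class, instantiates it with (spark, tnode_config), and calls its process() method.

# Hypothetical user module my_pipeline/processors.py
class MyProcessor:
    def __init__(self, spark, tnode_config):
        self.spark = spark
        self.tnode_config = tnode_config

    def process(self):
        # Return whatever the pipeline should treat as this node's output.
        return self.spark.sql("SELECT 1 AS ok")

# Config entry CustomNode would resolve (keys taken from the constructor above,
# values are placeholders)
tnode_config = {
    'custom_node_type': 'example',  # only used for logging
    'options': {
        'module_path': 'my_pipeline.processors',  # dotted import path
        'module_name': 'processors',
        'class_name': 'MyProcessor',
    },
}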
datapipelab-0.1.6/datapipelab/app/node/processor/bigquery_api_node.py
@@ -0,0 +1,33 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class BigQueryAPIProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+         from google.cloud import bigquery
+         from google.oauth2 import service_account
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+         self.credentials_path = tnode_config['options']['credentials_path']
+         self.return_as_spark_df = tnode_config['options']['return_as_spark_df']
+         self.project_name = tnode_config['options']['project_name']
+
+     def __sql_query(self, sql_query):
+         credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+         client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+         # run the job
+         query_job = client.query(sql_query)
+
+         results = query_job.result()
+         rows = [dict(row) for row in results]
+         if self.return_as_spark_df:
+             self.node = self.spark.createDataFrame(rows)
+         else:
+             self.node = None
+             logger.info(rows)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
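A minimal sketch of the options this processor reads (keys mirror the constructor above; values are placeholders). Note that google-cloud-bigquery and google-auth are imported lazily inside __init__ and are not declared in the package's dependencies, so they have to be installed separately.

tnode_config = {
    'name': 'bq_orders',  # hypothetical node name
    'options': {
        'query': 'SELECT customer_id, total FROM `my_project.sales.orders` LIMIT 100',
        'credentials_path': '/secrets/bq_service_account.json',
        'project_name': 'my_project',
        'return_as_spark_df': True,  # if False, the rows are only logged and the node is None
    },
}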
datapipelab-0.1.6/datapipelab/app/node/processor/bigquery_spark_node.py
@@ -0,0 +1,30 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class BigQuerySparkProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+         self.credentials_path = tnode_config['options']['materialization_dataset'] # materializationDataset
+         self.return_as_spark_df = tnode_config['options']['parent_project'] # parentProject
+
+     def __sql_query(self, sql_query):
+         credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+         client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+         # run the job
+         query_job = client.query(sql_query)
+
+         results = query_job.result()
+         rows = [dict(row) for row in results]
+         if self.return_as_spark_df:
+             self.node = self.spark.createDataFrame(rows)
+         else:
+             self.node = None
+             logger.info(rows)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
datapipelab-0.1.6/datapipelab/app/node/processor/shell_node.py
@@ -0,0 +1,23 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class ShellProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+
+         super().__init__(spark=spark)
+         self.shell_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+
+     def __shell_query(self):
+         import subprocess
+         # run the job
+         result = subprocess.run(
+             f"{self.shell_query}",
+             shell=True, check=True, executable='/bin/bash'
+         )
+         logger.info(result)
+
+     def _process(self):
+         self.__shell_query()
+         self._createOrReplaceTempView()
+         return self.node
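A minimal sketch of a shell processor config (the command is a placeholder). The query is executed through /bin/bash with check=True, so a non-zero exit status raises subprocess.CalledProcessError and fails the node.

tnode_config = {
    'name': 'list_exports',  # hypothetical node name
    'options': {
        'query': 'hdfs dfs -ls /data/exports && echo "export listing ok"',
    },
}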
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/csv_node.py
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode


  class CSVSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.output_path = tnode_config['options']['path']
          self.partition_by = tnode_config['options'].get('partition_by')
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/delta_node.py
@@ -1,10 +1,10 @@
  from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
- from delta.tables import DeltaTable


  class DeltaSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from delta.tables import DeltaTable
          super().__init__(spark=spark)
          self.mode = tnode_config['options']['mode'] # Can be 'append', 'overwrite', or 'upsert'
          self.partition_by = tnode_config['options'].get('partition_by')
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/hive_node.py
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode


  class HiveSinkNode(TNode):
      def __init__(self, spark, tnode_config, df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.mode = tnode_config['mode']
          self.stream = tnode_config['stream']
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/pandas_csv_node.py
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode


  class PandasCSVSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.mode = tnode_config['options'].get('mode', 'w')
          # self.stream = tnode_config['stream']
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/app/node/sink/teams_notification_node.py
@@ -1,10 +1,11 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
- import json
+


  class TeamsNotificationSinkNode(TNode):
      def __init__(self, spark, tnode_config, df=None):
+         from pyspark.sql import DataFrame
+         import json
          super().__init__(spark=spark)
          self.teams_msg_body = tnode_config['options']['teams_msg_body']
          self.teams_msg_title = tnode_config['options'].get('teams_msg_title', 'Notification')
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/engine.py
@@ -3,7 +3,7 @@ from datapipelab.logger import logger


  class Engine:
-     def __init__(self, engine_config_path, spark, params):
+     def __init__(self, engine_config_path, spark, params=None):
          self.engine_config_path = engine_config_path
          self.params = params
          self.pipeline = None
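With params now defaulting to None, an Engine can be constructed without template parameters. A minimal sketch (assumes an existing SparkSession named spark; the config path is a placeholder):

from datapipelab.engine import Engine

engine = Engine("pipeline.json5", spark)  # params defaults to None
engine_with_params = Engine("pipeline.json5", spark, params={"env": "dev"})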
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/pipeline_config.py
@@ -6,14 +6,14 @@ class PipelineConfig:
          print(config_file)
          self.params = params
          self.config_file = config_file
-         # If config file is instance of string, it is a path to the config file
+         # If the config file is an instance of string, it is a path to the config file
          self.config_file = config_file
          if isinstance(self.config_file, str):
              self.load_json_config_file()
-         elif isinstance(self.config_file, dict):
+         elif isinstance(self.config_file, list):
              self.pipeline_settings = config_file
          else:
-             raise ValueError("Invalid config file type. Must be a string or a dictionary.")
+             raise ValueError("Invalid config file type. Must be a string or a list.")
          self.sources = {}
          self.processors = {}
          self.sinks = {}
@@ -27,6 +27,14 @@ class PipelineConfig:
              json_config_file = json_config_file.replace(f"{{{key}}}", value)
          # Convert to JSON file
          self.pipeline_settings = json5.loads(json_config_file)
+         if len(self.pipeline_settings) > 0 and self.pipeline_settings[0]['type'] == 'import':
+             self.import_json_config_file()
+
+     def import_json_config_file(self):
+         import_pipeline_settings = []
+         for import_component in self.pipeline_settings:
+             if import_component['type'] == '': # Maybe someone wants to use import in the middle of his config?
+                 pass

      def create_pipeline_nodes(self):
          for component in self.pipeline_settings:
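Together with the constructor change above that accepts a Python list directly, a config whose first entry has type 'import' is now routed to import_json_config_file(). A minimal sketch of the triggering shape (the import handler is still a stub in this release, and any keys besides 'type' are assumptions):

pipeline_settings = [
    {'type': 'import', 'options': {'path': 'common_components.json5'}},  # hypothetical
    # ...remaining source/processor/sink components...
]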
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab/pipeline_handler.py
@@ -6,6 +6,7 @@ from datapipelab.app.node.sink.delta_node import DeltaSinkNode
  from datapipelab.app.node.sink.csv_node import CSVSinkNode
  from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
  from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+ from datapipelab.app.node.processor.bigquery_spark_node import BigQuerySparkProcessorNode


  class PipelineHandler:
@@ -42,10 +43,12 @@ class PipelineHandler:
          processor_df = None
          if tnode_format == 'custom':
              processor_df = CustomNode(self.spark, tnode_config).run()
-
          if tnode_format == 'query':
              processor_df = SparkProcessorNode(self.spark, tnode_config).run()
-
+         if tnode_format == 'bigquery_api':
+             processor_df = None
+         if tnode_format == 'bigquery_spark':
+             processor_df = datapipelab.app.node.processor.bigquery_spark.BigQuerySparkProcessorNode(self.spark, tnode_config).run()
          return processor_df

      def write_sink_node(self, tnode_config, t_df):
@@ -70,5 +73,8 @@ class PipelineHandler:
          if tnode_type == "local":
              if tnode_format == "csv":
                  PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+         if tnode_type == 'custom':
+             from datapipelab.app.node import custom_node
+             processor_df = custom_node.CustomNode(self.spark, tnode_config).run()


{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/PKG-INFO
@@ -1,11 +1,8 @@
  Metadata-Version: 2.4
  Name: datapipelab
- Version: 0.1.4
+ Version: 0.1.6
  Summary: A data pipeline library with connectors, sources, processors, and sinks.
  Requires-Dist: json5
  Requires-Dist: loguru
- Requires-Dist: azure-storage-blob
- Requires-Dist: google-cloud-storage
- Requires-Dist: pandas
  Dynamic: requires-dist
  Dynamic: summary
{datapipelab-0.1.4 → datapipelab-0.1.6}/datapipelab.egg-info/SOURCES.txt
@@ -15,9 +15,13 @@ datapipelab.egg-info/top_level.txt
  datapipelab/app/__init__.py
  datapipelab/app/connector_node/__init__.py
  datapipelab/app/node/__init__.py
+ datapipelab/app/node/custom_node.py
  datapipelab/app/node/tnode.py
  datapipelab/app/node/processor/__init__.py
+ datapipelab/app/node/processor/bigquery_api_node.py
+ datapipelab/app/node/processor/bigquery_spark_node.py
  datapipelab/app/node/processor/custom_node.py
+ datapipelab/app/node/processor/shell_node.py
  datapipelab/app/node/processor/spark_node.py
  datapipelab/app/node/sink/__init__.py
  datapipelab/app/node/sink/csv_node.py
datapipelab-0.1.6/datapipelab.egg-info/requires.txt
@@ -0,0 +1,2 @@
+ json5
+ loguru
{datapipelab-0.1.4 → datapipelab-0.1.6}/setup.py
@@ -2,15 +2,15 @@ from setuptools import setup, find_packages

  setup(
      name='datapipelab',
-     version='0.1.4',
+     version='0.1.6',
      description='A data pipeline library with connectors, sources, processors, and sinks.',
      packages=find_packages(),
      include_package_data=True,
      install_requires=[
          'json5',
          'loguru',
-         'azure-storage-blob',
-         'google-cloud-storage',
-         'pandas'
+         # 'azure-storage-blob',
+         # 'google-cloud-storage',
+         # 'pandas'
      ],
  )
datapipelab-0.1.4/datapipelab.egg-info/requires.txt
@@ -1,5 +0,0 @@
- json5
- loguru
- azure-storage-blob
- google-cloud-storage
- pandas