datapipelab 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datapipelab/app/node/custom_node.py ADDED
@@ -0,0 +1,22 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class CustomNode(TNode):
+    def __init__(self, spark, tnode_config, t_df):
+        super().__init__(spark=spark)
+        self.tnode_config = tnode_config
+        self.spark = spark
+        self.t_df = t_df
+        module_name = tnode_config['options']['module_name']
+        module_path = tnode_config['options']['module_path']
+        class_name = tnode_config['options']['class_name']
+        self.custom_processor = self.import_module(module_name, module_path, class_name)
+
+    def import_module(self, module_name, module_path, class_name):
+        custom_module = __import__(module_path, fromlist=[module_name])
+        custom_class = getattr(custom_module, class_name)
+        return custom_class(self.spark, self.tnode_config) # .create_instance(self.t_df)
+
+    def _process(self):
+        logger.info(f"Custom node type is {self.tnode_config.get('custom_node_type', 'N/A!')}")
+        return self.custom_processor.process()
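How the loader above resolves a user-supplied class: __import__(module_path, fromlist=[module_name]) imports the dotted path, getattr pulls the named class out of the module, and the instance is constructed with (spark, tnode_config) and later driven through process(). A minimal sketch of a matching config entry and processor class; the module, path, and class names are hypothetical, not package API:

    custom_step = {
        'name': 'my_custom_step',
        'format': 'custom',                               # processor format handled as 'custom'
        'options': {
            'module_name': 'my_processor',                # hypothetical module name
            'module_path': 'my_project.my_processor',     # hypothetical importable dotted path
            'class_name': 'MyProcessor',                  # class exposing process()
        },
    }

    class MyProcessor:
        def __init__(self, spark, tnode_config):
            self.spark = spark
            self.tnode_config = tnode_config

        def process(self):
            # Return whatever downstream nodes expect, e.g. a Spark DataFrame.
            return self.spark.range(10)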
datapipelab/app/node/processor/bigquery_api_node.py ADDED
@@ -0,0 +1,33 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class BigQueryAPIProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+        from google.cloud import bigquery
+        from google.oauth2 import service_account
+        super().__init__(spark=spark)
+        self.sql_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+        self.credentials_path = tnode_config['options']['credentials_path']
+        self.return_as_spark_df = tnode_config['options']['return_as_spark_df']
+        self.project_name = tnode_config['options']['project_name']
+
+    def __sql_query(self, sql_query):
+        credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+        client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+        # run the job
+        query_job = client.query(sql_query)
+
+        results = query_job.result()
+        rows = [dict(row) for row in results]
+        if self.return_as_spark_df:
+            self.node = self.spark.createDataFrame(rows)
+        else:
+            self.node = None
+        logger.info(rows)
+
+    def _process(self):
+        self.__sql_query(self.sql_query)
+        self._createOrReplaceTempView()
+        return self.node
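Note that bigquery and service_account are imported inside __init__ but referenced again in __sql_query, where those local names are no longer in scope. A minimal standalone sketch of the same client flow with module-level imports; the credentials path and project id are placeholders, not values from the package:

    from google.cloud import bigquery
    from google.oauth2 import service_account

    def run_bigquery_sql(sql, credentials_path, project):
        # Build an authenticated client and materialize the result rows as plain dicts,
        # mirroring the node's __sql_query flow.
        credentials = service_account.Credentials.from_service_account_file(credentials_path)
        client = bigquery.Client(credentials=credentials, project=project)
        return [dict(row) for row in client.query(sql).result()]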
datapipelab/app/node/processor/bigquery_spark_node.py ADDED
@@ -0,0 +1,30 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class BigQuerySparkProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+        super().__init__(spark=spark)
+        self.sql_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+        self.credentials_path = tnode_config['options']['materialization_dataset'] # materializationDataset
+        self.return_as_spark_df = tnode_config['options']['parent_project'] # parentProject
+
+    def __sql_query(self, sql_query):
+        credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+        client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+        # run the job
+        query_job = client.query(sql_query)
+
+        results = query_job.result()
+        rows = [dict(row) for row in results]
+        if self.return_as_spark_df:
+            self.node = self.spark.createDataFrame(rows)
+        else:
+            self.node = None
+        logger.info(rows)
+
+    def _process(self):
+        self.__sql_query(self.sql_query)
+        self._createOrReplaceTempView()
+        return self.node
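Despite its name, this node keeps the BigQuery client body from the API node: the materialization_dataset and parent_project options are stored into unrelated attributes, and service_account, bigquery, and self.project_name are never defined in this file. For reference, a Spark-native BigQuery query via the spark-bigquery connector usually looks like the sketch below; it assumes the connector is on the cluster, and the dataset and project values are placeholders:

    df = (
        spark.read.format("bigquery")
        .option("viewsEnabled", "true")                    # required when passing a SQL query
        .option("materializationDataset", "tmp_dataset")   # placeholder dataset for query results
        .option("parentProject", "my-billing-project")     # placeholder billing project
        .option("query", "SELECT 1 AS one")
        .load()
    )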
datapipelab/app/node/processor/shell_node.py ADDED
@@ -0,0 +1,23 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class ShellProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+
+        super().__init__(spark=spark)
+        self.shell_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+
+    def __shell_query(self):
+        import subprocess
+        # run the job
+        result = subprocess.run(
+            f"{self.shell_query}",
+            shell=True, check=True, executable='/bin/bash'
+        )
+        logger.info(result)
+
+    def _process(self):
+        self.__shell_query()
+        self._createOrReplaceTempView()
+        return self.node
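The command's output is not captured here (subprocess.run only records the return code unless capture is requested), and _process returns self.node, which this class never assigns. A standalone sketch of the same invocation with stdout captured; the command string is a placeholder:

    import subprocess

    result = subprocess.run(
        "echo 'hello from the pipeline'",     # placeholder command
        shell=True, check=True, executable='/bin/bash',
        capture_output=True, text=True,
    )
    print(result.stdout)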
datapipelab/app/node/sink/csv_node.py CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
 
 
 class CSVSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.output_path = tnode_config['options']['path']
         self.partition_by = tnode_config['options'].get('partition_by')
datapipelab/app/node/sink/delta_node.py CHANGED
@@ -1,10 +1,10 @@
 from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
-from delta.tables import DeltaTable
 
 
 class DeltaSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from delta.tables import DeltaTable
         super().__init__(spark=spark)
         self.mode = tnode_config['options']['mode'] # Can be 'append', 'overwrite', or 'upsert'
         self.partition_by = tnode_config['options'].get('partition_by')
datapipelab/app/node/sink/hive_node.py CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
 
 
 class HiveSinkNode(TNode):
     def __init__(self, spark, tnode_config, df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.mode = tnode_config['mode']
         self.stream = tnode_config['stream']
datapipelab/app/node/sink/pandas_csv_node.py CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
 
 
 class PandasCSVSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.mode = tnode_config['options'].get('mode', 'w')
         # self.stream = tnode_config['stream']
datapipelab/app/node/sink/teams_notification_node.py CHANGED
@@ -1,10 +1,11 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
-import json
+
 
 
 class TeamsNotificationSinkNode(TNode):
     def __init__(self, spark, tnode_config, df=None):
+        from pyspark.sql import DataFrame
+        import json
         super().__init__(spark=spark)
         self.teams_msg_body = tnode_config['options']['teams_msg_body']
         self.teams_msg_title = tnode_config['options'].get('teams_msg_title', 'Notification')
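The five sink hunks above share one change: imports of pyspark, delta, and json move from module scope into __init__, so merely importing a sink module no longer pulls in its dependency; this fits the leaner dependency set in the METADATA hunk below, where azure-storage-blob, google-cloud-storage, and pandas are dropped from Requires-Dist. A generic sketch of the deferred-import pattern; the class name is illustrative, not package code:

    class LazyDependencySink:
        def __init__(self):
            # Resolved only when a sink is actually constructed, so environments
            # without delta-spark can still import the module that defines this class.
            from delta.tables import DeltaTable
            self._delta_table_cls = DeltaTable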
datapipelab/engine.py CHANGED
@@ -3,7 +3,7 @@ from datapipelab.logger import logger
 
 
 class Engine:
-    def __init__(self, engine_config_path, spark, params):
+    def __init__(self, engine_config_path, spark, params=None):
         self.engine_config_path = engine_config_path
         self.params = params
         self.pipeline = None
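With params now defaulting to None, an Engine can be constructed without template parameters; when a dict is supplied, it appears to be substituted into {key} placeholders in the JSON5 config text (see the pipeline_config hunk below). The config path and spark session here are placeholders:

    engine = Engine("configs/pipeline.json5", spark)                          # no template substitution
    engine = Engine("configs/pipeline.json5", spark, params={"env": "dev"})   # {env} placeholders replaced in the config text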
datapipelab/pipeline_config.py CHANGED
@@ -6,14 +6,14 @@ class PipelineConfig:
         print(config_file)
         self.params = params
         self.config_file = config_file
-        # If config file is instance of string, it is a path to the config file
+        # If the config file is an instance of string, it is a path to the config file
         self.config_file = config_file
         if isinstance(self.config_file, str):
             self.load_json_config_file()
-        elif isinstance(self.config_file, dict):
+        elif isinstance(self.config_file, list):
             self.pipeline_settings = config_file
         else:
-            raise ValueError("Invalid config file type. Must be a string or a dictionary.")
+            raise ValueError("Invalid config file type. Must be a string or a list.")
         self.sources = {}
         self.processors = {}
         self.sinks = {}
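The in-memory form accepted here changes from dict to list, matching how the rest of the code iterates over pipeline_settings as a sequence of component dicts. A sketch of both accepted forms, assuming a (config_file, params) constructor; the component fields are illustrative, with the 'local'/'csv' pair mirroring the sink dispatch further down:

    config = PipelineConfig("configs/pipeline.json5", params={"env": "dev"})   # path to a JSON5 file
    config = PipelineConfig(
        [{"name": "events_csv", "type": "local", "format": "csv",
          "options": {"path": "/tmp/events.csv", "mode": "w"}}],               # already-parsed list of components
        params=None,
    )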
@@ -27,6 +27,14 @@ class PipelineConfig:
                 json_config_file = json_config_file.replace(f"{{{key}}}", value)
         # Convert to JSON file
         self.pipeline_settings = json5.loads(json_config_file)
+        if len(self.pipeline_settings) > 0 and self.pipeline_settings[0]['type'] == 'import':
+            self.import_json_config_file()
+
+    def import_json_config_file(self):
+        import_pipeline_settings = []
+        for import_component in self.pipeline_settings:
+            if import_component['type'] == '': # Maybe someone wants to use import in the middle of his config?
+                pass
 
     def create_pipeline_nodes(self):
         for component in self.pipeline_settings:
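The new hook only fires when the first component has type 'import', and import_json_config_file is still a stub in this version: it builds an empty list and checks for an empty type string. A config that would trigger the hook presumably starts like the sketch below; the 'path' key is a guess, since the handler does not yet read any field from the import entry:

    [
        {"type": "import", "path": "configs/shared_components.json5"},   # hypothetical import entry
        {"name": "events_csv", "type": "local", "format": "csv",
         "options": {"path": "/tmp/events.csv"}},
    ]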
datapipelab/pipeline_handler.py CHANGED
@@ -6,6 +6,7 @@ from datapipelab.app.node.sink.delta_node import DeltaSinkNode
 from datapipelab.app.node.sink.csv_node import CSVSinkNode
 from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
 from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+from datapipelab.app.node.processor.bigquery_spark_node import BigQuerySparkProcessorNode
 
 
 class PipelineHandler:
@@ -42,10 +43,12 @@ class PipelineHandler:
         processor_df = None
         if tnode_format == 'custom':
             processor_df = CustomNode(self.spark, tnode_config).run()
-
         if tnode_format == 'query':
             processor_df = SparkProcessorNode(self.spark, tnode_config).run()
-
+        if tnode_format == 'bigquery_api':
+            processor_df = None
+        if tnode_format == 'bigquery_spark':
+            processor_df = datapipelab.app.node.processor.bigquery_spark.BigQuerySparkProcessorNode(self.spark, tnode_config).run()
         return processor_df
 
     def write_sink_node(self, tnode_config, t_df):
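In this dispatch, 'bigquery_api' only sets processor_df to None (the node is never instantiated), and the 'bigquery_spark' branch spells out a fully qualified datapipelab.app.node.processor.bigquery_spark path instead of the BigQuerySparkProcessorNode name imported at the top of the file. A hypothetical processor entry for the BigQuery API node; the option keys mirror what BigQueryAPIProcessorNode reads, and the values are placeholders:

    bigquery_api_step = {
        "name": "bq_rows",
        "format": "bigquery_api",
        "options": {
            "query": "SELECT 1 AS one",
            "credentials_path": "/secrets/service_account.json",   # placeholder path
            "project_name": "my-project",                          # placeholder project id
            "return_as_spark_df": True,
        },
    }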
@@ -70,5 +73,8 @@ class PipelineHandler:
         if tnode_type == "local":
             if tnode_format == "csv":
                 PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+        if tnode_type == 'custom':
+            from datapipelab.app.node import custom_node
+            processor_df = custom_node.CustomNode(self.spark, tnode_config).run()
 
 
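This sink branch loads the new top-level custom_node module and constructs CustomNode with only (spark, tnode_config), although the CustomNode added in the first hunk above also expects a t_df argument. A hypothetical custom sink entry, mirroring the custom processor config; the module, path, and class names are illustrative:

    custom_sink_step = {
        "name": "my_custom_sink",
        "type": "custom",
        "options": {
            "module_name": "my_sink",               # hypothetical module name
            "module_path": "my_project.my_sink",    # hypothetical dotted path
            "class_name": "MySink",                 # class exposing process()
        },
    }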
datapipelab-0.1.4.dist-info/METADATA → datapipelab-0.1.6.dist-info/METADATA RENAMED
@@ -1,11 +1,8 @@
 Metadata-Version: 2.4
 Name: datapipelab
-Version: 0.1.4
+Version: 0.1.6
 Summary: A data pipeline library with connectors, sources, processors, and sinks.
 Requires-Dist: json5
 Requires-Dist: loguru
-Requires-Dist: azure-storage-blob
-Requires-Dist: google-cloud-storage
-Requires-Dist: pandas
 Dynamic: requires-dist
 Dynamic: summary
datapipelab-0.1.6.dist-info/RECORD ADDED
@@ -0,0 +1,31 @@
+datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/engine.py,sha256=3QRsedRYNov6xIDOZ1tukinFE-SKv39Fn3sNCnD3L6g,442
+datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
+datapipelab/pipeline.py,sha256=dw9D9KM_hztt9g_YzqoNgQBRyCYR92cRZwrU5duP_Pg,1464
+datapipelab/pipeline_config.py,sha256=2bFAJepViE7rT7CaRANZU07aeQpOYcZ954ISujm9pXA,3816
+datapipelab/pipeline_handler.py,sha256=HCyvyW3Yx2QYWQ-D-ArMDZdGtMRsJ348ZHSrqbRYwd0,3779
+datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/custom_node.py,sha256=VvjwkECTobRhO_fYKUrJCd117B5MoR9P6UKYZfRLhV4,1017
+datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
+datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/processor/bigquery_api_node.py,sha256=2kF6hgYOUi7te-aXXr-cTZfkvitBDEPJAHT8KG-i8fE,1293
+datapipelab/app/node/processor/bigquery_spark_node.py,sha256=t8JJqMWTZwBuJUKV7-l72ZLdLVoHHSFJHFNovAY-2nc,1179
+datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
+datapipelab/app/node/processor/shell_node.py,sha256=s3dKgfEqbpUIEiwORERgvp7FNDE5JkFHBo7EnJYBPnA,669
+datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
+datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/sink/csv_node.py,sha256=d2hyufP5_Nmql0pfD0KeC4rFu1wXTnBxVsoGl7sWbhM,1681
+datapipelab/app/node/sink/delta_node.py,sha256=iKEdiTjJ7SHJZMrbm0jR5tms5JZ5iCFfQklZbI-Yr2o,2044
+datapipelab/app/node/sink/hive_node.py,sha256=BKSSYb1AexQD1Jl6zP5ak_ibvd0wYRNzC_zhTtNTDRg,1106
+datapipelab/app/node/sink/pandas_csv_node.py,sha256=JsJFt2XRpwxGeJyt_PDUgqZafiQROf1Sk5TUhQPxh4c,870
+datapipelab/app/node/sink/teams_notification_node.py,sha256=6ZufdbhVvRXi3QTQafLo5uKl9kLyDnkYIE_VZFT0QNw,3581
+datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
+datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
+datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
+datapipelab-0.1.6.dist-info/METADATA,sha256=yuXR8PtBINVXBFVsFncHbu9cnvJ52Jp8ZCDi-Bsmrnw,220
+datapipelab-0.1.6.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+datapipelab-0.1.6.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
+datapipelab-0.1.6.dist-info/RECORD,,
datapipelab-0.1.4.dist-info/WHEEL → datapipelab-0.1.6.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (80.7.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
datapipelab-0.1.4.dist-info/RECORD DELETED
@@ -1,27 +0,0 @@
-datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/engine.py,sha256=dYm39Yb0Eqt76pwdc-ilzZNxehMKdiuidE557YexHaU,437
-datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
-datapipelab/pipeline.py,sha256=dw9D9KM_hztt9g_YzqoNgQBRyCYR92cRZwrU5duP_Pg,1464
-datapipelab/pipeline_config.py,sha256=dKVWz_FH5448a1ZE9eIu8ANagEceI_l4QNICbXrRudk,3411
-datapipelab/pipeline_handler.py,sha256=1t5wwsaVUMXXmsEa-Qt-6jtMIyAZmX1hgo2I_UgbtiM,3265
-datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
-datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
-datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
-datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
-datapipelab/app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
-datapipelab/app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
-datapipelab/app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
-datapipelab/app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
-datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
-datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
-datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
-datapipelab-0.1.4.dist-info/METADATA,sha256=O_P8alSEnX3hM-gLN6OPFfeYykpOO_-ymns8OKUtJU4,312
-datapipelab-0.1.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-datapipelab-0.1.4.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
-datapipelab-0.1.4.dist-info/RECORD,,