datapipelab 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class CustomNode(TNode):
+     def __init__(self, spark, tnode_config, t_df):
+         super().__init__(spark=spark)
+         self.tnode_config = tnode_config
+         self.spark = spark
+         self.t_df = t_df
+         module_name = tnode_config['options']['module_name']
+         module_path = tnode_config['options']['module_path']
+         class_name = tnode_config['options']['class_name']
+         self.custom_processor = self.import_module(module_name, module_path, class_name)
+
+     def import_module(self, module_name, module_path, class_name):
+         custom_module = __import__(module_path, fromlist=[module_name])
+         custom_class = getattr(custom_module, class_name)
+         return custom_class(self.spark, self.tnode_config)  # .create_instance(self.t_df)
+
+     def _process(self):
+         logger.info(f"Custom node type is {self.tnode_config.get('custom_node_type', 'N/A!')}")
+         return self.custom_processor.process()
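
This new node (datapipelab/app/node/custom_node.py, judging by the RECORD changes further down) dynamically imports a user-supplied class and delegates processing to it. As a rough sketch of how it might be used — the my_project.processors module and WordCountProcessor class below are illustrative, not part of the package — the only contract CustomNode relies on is a constructor accepting (spark, tnode_config) and a process() method:

# Hypothetical user module my_project/processors.py (names are illustrative).
class WordCountProcessor:
    def __init__(self, spark, tnode_config):
        self.spark = spark
        self.tnode_config = tnode_config

    def process(self):
        # Whatever this returns becomes the custom node's output.
        return self.spark.createDataFrame([("hello", 1)], ["word", "count"])

# A matching node entry, using only keys CustomNode.__init__ actually reads:
custom_node_config = {
    "custom_node_type": "word_count",  # only used for the log line in _process
    "options": {
        "module_name": "processors",
        "module_path": "my_project.processors",
        "class_name": "WordCountProcessor",
    },
}

The pipeline handler later instantiates this wrapper and calls .run() on it, as the pipeline_handler.py hunks below show.
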
@@ -0,0 +1,33 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class BigQueryAPIProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+         from google.cloud import bigquery
+         from google.oauth2 import service_account
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+         self.credentials_path = tnode_config['options']['credentials_path']
+         self.return_as_spark_df = tnode_config['options']['return_as_spark_df']
+         self.project_name = tnode_config['options']['project_name']
+
+     def __sql_query(self, sql_query):
+         credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+         client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+         # run the job
+         query_job = client.query(sql_query)
+
+         results = query_job.result()
+         rows = [dict(row) for row in results]
+         if self.return_as_spark_df:
+             self.node = self.spark.createDataFrame(rows)
+         else:
+             self.node = None
+             logger.info(rows)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
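
A node entry for this processor might look like the following sketch; the keys are the ones the constructor reads, while the name, query, project, and path values are placeholders:

bigquery_api_node_config = {
    "name": "bq_country_counts",  # illustrative name
    "options": {
        "query": "SELECT country, COUNT(*) AS n FROM `my-project.analytics.events` GROUP BY country",
        "credentials_path": "/secrets/bq-service-account.json",
        "return_as_spark_df": True,   # False leaves self.node as None and logs the rows instead
        "project_name": "my-project",
    },
}

Since google.cloud.bigquery and google.oauth2 are imported inside __init__, they only need to be installed where a node of this type is actually configured.
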
@@ -0,0 +1,30 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class BigQuerySparkProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+         self.credentials_path = tnode_config['options']['materialization_dataset']  # materializationDataset
+         self.return_as_spark_df = tnode_config['options']['parent_project']  # parentProject
+
+     def __sql_query(self, sql_query):
+         credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+         client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+         # run the job
+         query_job = client.query(sql_query)
+
+         results = query_job.result()
+         rows = [dict(row) for row in results]
+         if self.return_as_spark_df:
+             self.node = self.spark.createDataFrame(rows)
+         else:
+             self.node = None
+             logger.info(rows)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
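
The constructor of this variant reads Spark BigQuery connector style options (the inline comments map them to materializationDataset and parentProject). A hedged sketch of a matching entry, with placeholder values:

bigquery_spark_node_config = {
    "name": "bq_spark_orders",  # illustrative name
    "options": {
        "query": "SELECT user_id, SUM(amount) AS total FROM `my-project.sales.orders` GROUP BY user_id",
        "materialization_dataset": "tmp_materialization",  # materializationDataset
        "parent_project": "my-project",                     # parentProject
    },
}
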
@@ -0,0 +1,23 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class ShellProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+
+         super().__init__(spark=spark)
+         self.shell_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+
+     def __shell_query(self):
+         import subprocess
+         # run the job
+         result = subprocess.run(
+             f"{self.shell_query}",
+             shell=True, check=True, executable='/bin/bash'
+         )
+         logger.info(result)
+
+     def _process(self):
+         self.__shell_query()
+         self._createOrReplaceTempView()
+         return self.node
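
This node simply hands options['query'] to /bin/bash; because subprocess.run is called with check=True, a non-zero exit status raises subprocess.CalledProcessError and fails the node. A sketch of a config entry, with an illustrative name and command:

shell_node_config = {
    "name": "list_scratch_dir",  # illustrative name
    "options": {
        # Executed as: subprocess.run(query, shell=True, check=True, executable='/bin/bash')
        "query": "ls -lh /tmp && echo 'scratch dir listed'",
    },
}
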
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
  
  
  class CSVSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.output_path = tnode_config['options']['path']
          self.partition_by = tnode_config['options'].get('partition_by')
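
The same change is repeated in the delta, Hive, pandas CSV, and Teams notification sink hunks that follow: module-level imports are pushed down into __init__, so merely importing the sink module no longer requires the dependency to be present. A minimal sketch of the pattern, with an illustrative class name:

class ExampleSinkNode:
    def __init__(self, spark, tnode_config):
        # Deferred import: the dependency is only needed once a node of this
        # type is actually instantiated, not when the module is merely imported.
        from pyspark.sql import DataFrame  # noqa: F401
        self.spark = spark
        self.tnode_config = tnode_config
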
@@ -1,10 +1,10 @@
  from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
- from delta.tables import DeltaTable
  
  
  class DeltaSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from delta.tables import DeltaTable
          super().__init__(spark=spark)
          self.mode = tnode_config['options']['mode']  # Can be 'append', 'overwrite', or 'upsert'
          self.partition_by = tnode_config['options'].get('partition_by')
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
  
  
  class HiveSinkNode(TNode):
      def __init__(self, spark, tnode_config, df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.mode = tnode_config['mode']
          self.stream = tnode_config['stream']
@@ -1,9 +1,9 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
  
  
  class PandasCSVSinkNode(TNode):
      def __init__(self, spark, tnode_config, t_df):
+         from pyspark.sql import DataFrame
          super().__init__(spark=spark)
          self.mode = tnode_config['options'].get('mode', 'w')
          # self.stream = tnode_config['stream']
@@ -1,10 +1,11 @@
- from pyspark.sql import DataFrame
  from datapipelab.app.node.tnode import TNode
- import json
+
  
  
  class TeamsNotificationSinkNode(TNode):
      def __init__(self, spark, tnode_config, df=None):
+         from pyspark.sql import DataFrame
+         import json
          super().__init__(spark=spark)
          self.teams_msg_body = tnode_config['options']['teams_msg_body']
          self.teams_msg_title = tnode_config['options'].get('teams_msg_title', 'Notification')
@@ -6,7 +6,7 @@ class PipelineConfig:
          print(config_file)
          self.params = params
          self.config_file = config_file
-         # If config file is instance of string, it is a path to the config file
+         # If the config file is an instance of string, it is a path to the config file
          self.config_file = config_file
          if isinstance(self.config_file, str):
              self.load_json_config_file()
@@ -27,6 +27,14 @@ class PipelineConfig:
              json_config_file = json_config_file.replace(f"{{{key}}}", value)
          # Convert to JSON file
          self.pipeline_settings = json5.loads(json_config_file)
+         if len(self.pipeline_settings) > 0 and self.pipeline_settings[0]['type'] == 'import':
+             self.import_json_config_file()
+
+     def import_json_config_file(self):
+         import_pipeline_settings = []
+         for import_component in self.pipeline_settings:
+             if import_component['type'] == '':  # Maybe someone wants to use import in the middle of his config?
+                 pass
  
      def create_pipeline_nodes(self):
          for component in self.pipeline_settings:
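
The new hook only fires when the first component's type is 'import', and import_json_config_file is still a stub in this release. A config whose first entry would trigger it might look like the sketch below; apart from the 'type' key, which the code actually checks, the other fields are assumptions:

# Hypothetical pipeline_settings as returned by json5.loads().
pipeline_settings = [
    {"type": "import", "path": "common_nodes.json5"},  # 'path' is a guess; the stub ignores it for now
    # ... the pipeline's own node definitions would follow ...
]
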
@@ -6,6 +6,7 @@ from datapipelab.app.node.sink.delta_node import DeltaSinkNode
  from datapipelab.app.node.sink.csv_node import CSVSinkNode
  from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
  from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+ from datapipelab.app.node.processor.bigquery_spark_node import BigQuerySparkProcessorNode
  
  
  class PipelineHandler:
@@ -42,10 +43,12 @@ class PipelineHandler:
          processor_df = None
          if tnode_format == 'custom':
              processor_df = CustomNode(self.spark, tnode_config).run()
-
          if tnode_format == 'query':
              processor_df = SparkProcessorNode(self.spark, tnode_config).run()
-
+         if tnode_format == 'bigquery_api':
+             processor_df = None
+         if tnode_format == 'bigquery_spark':
+             processor_df = datapipelab.app.node.processor.bigquery_spark.BigQuerySparkProcessorNode(self.spark, tnode_config).run()
          return processor_df
  
      def write_sink_node(self, tnode_config, t_df):
@@ -70,5 +73,8 @@ class PipelineHandler:
          if tnode_type == "local":
              if tnode_format == "csv":
                  PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+         if tnode_type == 'custom':
+             from datapipelab.app.node import custom_node
+             processor_df = custom_node.CustomNode(self.spark, tnode_config).run()
  
  
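With this branch, a sink entry whose type is 'custom' is routed through the same CustomNode wrapper added at the top of this diff. A sketch of such an entry; the module and class names are illustrative, and the options keys are the ones CustomNode.__init__ reads:

custom_sink_config = {
    "type": "custom",              # routes write_sink_node() to CustomNode
    "name": "audit_log_sink",      # illustrative name
    "options": {
        "module_name": "sinks",
        "module_path": "my_project.sinks",   # hypothetical user package
        "class_name": "AuditLogSink",        # must accept (spark, tnode_config) and define process()
    },
}
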
@@ -1,11 +1,8 @@
  Metadata-Version: 2.4
  Name: datapipelab
- Version: 0.1.5
+ Version: 0.1.6
  Summary: A data pipeline library with connectors, sources, processors, and sinks.
  Requires-Dist: json5
  Requires-Dist: loguru
- Requires-Dist: azure-storage-blob
- Requires-Dist: google-cloud-storage
- Requires-Dist: pandas
  Dynamic: requires-dist
  Dynamic: summary
@@ -2,26 +2,30 @@ datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datapipelab/engine.py,sha256=3QRsedRYNov6xIDOZ1tukinFE-SKv39Fn3sNCnD3L6g,442
  datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
  datapipelab/pipeline.py,sha256=dw9D9KM_hztt9g_YzqoNgQBRyCYR92cRZwrU5duP_Pg,1464
- datapipelab/pipeline_config.py,sha256=xxasVl6nULWmVRWYiyZUgCr5Y9m0OP7kM3mCOss1AXA,3405
- datapipelab/pipeline_handler.py,sha256=1t5wwsaVUMXXmsEa-Qt-6jtMIyAZmX1hgo2I_UgbtiM,3265
+ datapipelab/pipeline_config.py,sha256=2bFAJepViE7rT7CaRANZU07aeQpOYcZ954ISujm9pXA,3816
+ datapipelab/pipeline_handler.py,sha256=HCyvyW3Yx2QYWQ-D-ArMDZdGtMRsJ348ZHSrqbRYwd0,3779
  datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/custom_node.py,sha256=VvjwkECTobRhO_fYKUrJCd117B5MoR9P6UKYZfRLhV4,1017
  datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
  datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/processor/bigquery_api_node.py,sha256=2kF6hgYOUi7te-aXXr-cTZfkvitBDEPJAHT8KG-i8fE,1293
+ datapipelab/app/node/processor/bigquery_spark_node.py,sha256=t8JJqMWTZwBuJUKV7-l72ZLdLVoHHSFJHFNovAY-2nc,1179
  datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
+ datapipelab/app/node/processor/shell_node.py,sha256=s3dKgfEqbpUIEiwORERgvp7FNDE5JkFHBo7EnJYBPnA,669
  datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
  datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datapipelab/app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
- datapipelab/app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
- datapipelab/app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
- datapipelab/app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
- datapipelab/app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
+ datapipelab/app/node/sink/csv_node.py,sha256=d2hyufP5_Nmql0pfD0KeC4rFu1wXTnBxVsoGl7sWbhM,1681
+ datapipelab/app/node/sink/delta_node.py,sha256=iKEdiTjJ7SHJZMrbm0jR5tms5JZ5iCFfQklZbI-Yr2o,2044
+ datapipelab/app/node/sink/hive_node.py,sha256=BKSSYb1AexQD1Jl6zP5ak_ibvd0wYRNzC_zhTtNTDRg,1106
+ datapipelab/app/node/sink/pandas_csv_node.py,sha256=JsJFt2XRpwxGeJyt_PDUgqZafiQROf1Sk5TUhQPxh4c,870
+ datapipelab/app/node/sink/teams_notification_node.py,sha256=6ZufdbhVvRXi3QTQafLo5uKl9kLyDnkYIE_VZFT0QNw,3581
  datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
  datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
  datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
- datapipelab-0.1.5.dist-info/METADATA,sha256=SFXOKu57wAmSkGBUHdvC2uehaCDjRu5cIhUqWpzp4e0,312
- datapipelab-0.1.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- datapipelab-0.1.5.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
- datapipelab-0.1.5.dist-info/RECORD,,
+ datapipelab-0.1.6.dist-info/METADATA,sha256=yuXR8PtBINVXBFVsFncHbu9cnvJ52Jp8ZCDi-Bsmrnw,220
+ datapipelab-0.1.6.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+ datapipelab-0.1.6.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
+ datapipelab-0.1.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (80.7.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
  