datapipelab 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipelab/app/node/custom_node.py +22 -0
- datapipelab/app/node/processor/bigquery_api_node.py +33 -0
- datapipelab/app/node/processor/bigquery_spark_node.py +30 -0
- datapipelab/app/node/processor/shell_node.py +23 -0
- datapipelab/app/node/sink/csv_node.py +1 -1
- datapipelab/app/node/sink/delta_node.py +1 -1
- datapipelab/app/node/sink/hive_node.py +1 -1
- datapipelab/app/node/sink/pandas_csv_node.py +1 -1
- datapipelab/app/node/sink/teams_notification_node.py +3 -2
- datapipelab/pipeline_config.py +9 -1
- datapipelab/pipeline_handler.py +9 -2
- {datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/METADATA +1 -4
- {datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/RECORD +15 -11
- {datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/WHEEL +1 -1
- {datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/top_level.txt +0 -0
datapipelab/app/node/custom_node.py
ADDED
@@ -0,0 +1,22 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class CustomNode(TNode):
+    def __init__(self, spark, tnode_config, t_df):
+        super().__init__(spark=spark)
+        self.tnode_config = tnode_config
+        self.spark = spark
+        self.t_df = t_df
+        module_name = tnode_config['options']['module_name']
+        module_path = tnode_config['options']['module_path']
+        class_name = tnode_config['options']['class_name']
+        self.custom_processor = self.import_module(module_name, module_path, class_name)
+
+    def import_module(self, module_name, module_path, class_name):
+        custom_module = __import__(module_path, fromlist=[module_name])
+        custom_class = getattr(custom_module, class_name)
+        return custom_class(self.spark, self.tnode_config)  # .create_instance(self.t_df)
+
+    def _process(self):
+        logger.info(f"Custom node type is {self.tnode_config.get('custom_node_type', 'N/A!')}")
+        return self.custom_processor.process()
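For orientation, here is a minimal sketch of the node configuration that `CustomNode` reads. Only the key names (`options.module_name`, `options.module_path`, `options.class_name`, and the optional `custom_node_type`) come from the code above; the values and the example module are hypothetical.

```python
# Hypothetical configuration for the new CustomNode; values are placeholders.
tnode_config = {
    'custom_node_type': 'processor',               # only logged by _process(), optional
    'options': {
        'module_path': 'my_project.nodes.enrich',  # dotted path handed to __import__
        'module_name': 'enrich',                   # passed via the fromlist argument
        'class_name': 'EnrichProcessor',           # resolved with getattr and instantiated
    },
}
```

The resolved class is instantiated as `custom_class(spark, tnode_config)` and must expose a `process()` method, since `_process()` simply delegates to `self.custom_processor.process()`.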
datapipelab/app/node/processor/bigquery_api_node.py
ADDED
@@ -0,0 +1,33 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class BigQueryAPIProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+        from google.cloud import bigquery
+        from google.oauth2 import service_account
+        super().__init__(spark=spark)
+        self.sql_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+        self.credentials_path = tnode_config['options']['credentials_path']
+        self.return_as_spark_df = tnode_config['options']['return_as_spark_df']
+        self.project_name = tnode_config['options']['project_name']
+
+    def __sql_query(self, sql_query):
+        credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+        client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+        # run the job
+        query_job = client.query(sql_query)
+
+        results = query_job.result()
+        rows = [dict(row) for row in results]
+        if self.return_as_spark_df:
+            self.node = self.spark.createDataFrame(rows)
+        else:
+            self.node = None
+            logger.info(rows)
+
+    def _process(self):
+        self.__sql_query(self.sql_query)
+        self._createOrReplaceTempView()
+        return self.node
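The constructor above documents the options this node expects. A hedged example, with key names taken from the code and values invented:

```python
# Hypothetical configuration for BigQueryAPIProcessorNode; values are placeholders.
tnode_config = {
    'name': 'bq_daily_summary',
    'options': {
        'query': 'SELECT col_a, col_b FROM my_dataset.my_table LIMIT 100',
        'credentials_path': '/path/to/service-account.json',
        'project_name': 'my-gcp-project',
        'return_as_spark_df': True,   # False leaves self.node as None and logs the raw rows
    },
}
```

Note that `bigquery` and `service_account` are imported locally inside `__init__`, yet `__sql_query()` references them as if they were module-level names, so as published the query path would raise a NameError unless those names are made available some other way.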
datapipelab/app/node/processor/bigquery_spark_node.py
ADDED
@@ -0,0 +1,30 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class BigQuerySparkProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+        super().__init__(spark=spark)
+        self.sql_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+        self.credentials_path = tnode_config['options']['materialization_dataset']  # materializationDataset
+        self.return_as_spark_df = tnode_config['options']['parent_project']  # parentProject
+
+    def __sql_query(self, sql_query):
+        credentials = service_account.Credentials.from_service_account_file(self.credentials_path)
+        client = bigquery.Client(credentials=credentials, project=self.project_name)
+
+        # run the job
+        query_job = client.query(sql_query)
+
+        results = query_job.result()
+        rows = [dict(row) for row in results]
+        if self.return_as_spark_df:
+            self.node = self.spark.createDataFrame(rows)
+        else:
+            self.node = None
+            logger.info(rows)
+
+    def _process(self):
+        self.__sql_query(self.sql_query)
+        self._createOrReplaceTempView()
+        return self.node
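This module reads as a copy of the API node adapted toward the Spark BigQuery connector: the constructor takes `materialization_dataset` and `parent_project` options (note they are stored into `self.credentials_path` and `self.return_as_spark_df`), while `__sql_query()` still refers to `service_account`, `bigquery`, and `self.project_name`, none of which this file imports or defines. A hedged sketch of the options it reads, with invented values:

```python
# Hypothetical configuration for BigQuerySparkProcessorNode; key names from the constructor above.
tnode_config = {
    'name': 'bq_spark_query',
    'options': {
        'query': 'SELECT * FROM my_dataset.my_view',
        'materialization_dataset': 'my_dataset',   # stored on self.credentials_path as written
        'parent_project': 'my-gcp-project',        # stored on self.return_as_spark_df as written
    },
}
```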
datapipelab/app/node/processor/shell_node.py
ADDED
@@ -0,0 +1,23 @@
+from datapipelab.app.node.tnode import TNode
+from datapipelab.logger import logger
+
+class ShellProcessorNode(TNode):
+    def __init__(self, spark, tnode_config):
+
+        super().__init__(spark=spark)
+        self.shell_query = tnode_config['options']['query']
+        self.node_name = tnode_config['name']
+
+    def __shell_query(self):
+        import subprocess
+        # run the job
+        result = subprocess.run(
+            f"{self.shell_query}",
+            shell=True, check=True, executable='/bin/bash'
+        )
+        logger.info(result)
+
+    def _process(self):
+        self.__shell_query()
+        self._createOrReplaceTempView()
+        return self.node
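The shell node simply runs `options['query']` through bash. A hedged example, with an invented command:

```python
# Hypothetical configuration for ShellProcessorNode; the command is a placeholder.
tnode_config = {
    'name': 'refresh_landing_dir',
    'options': {
        'query': 'hdfs dfs -ls /data/landing && echo done',
    },
}
```

Because `subprocess.run` is called with `check=True`, a non-zero exit code raises `CalledProcessError` and aborts the step; also note that `_process()` returns `self.node`, which nothing in this module assigns.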
datapipelab/app/node/sink/csv_node.py
CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode


 class CSVSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.output_path = tnode_config['options']['path']
         self.partition_by = tnode_config['options'].get('partition_by')
datapipelab/app/node/sink/delta_node.py
CHANGED
@@ -1,10 +1,10 @@
 from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
-from delta.tables import DeltaTable


 class DeltaSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from delta.tables import DeltaTable
         super().__init__(spark=spark)
         self.mode = tnode_config['options']['mode']  # Can be 'append', 'overwrite', or 'upsert'
         self.partition_by = tnode_config['options'].get('partition_by')
datapipelab/app/node/sink/hive_node.py
CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode


 class HiveSinkNode(TNode):
     def __init__(self, spark, tnode_config, df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.mode = tnode_config['mode']
         self.stream = tnode_config['stream']
datapipelab/app/node/sink/pandas_csv_node.py
CHANGED
@@ -1,9 +1,9 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode


 class PandasCSVSinkNode(TNode):
     def __init__(self, spark, tnode_config, t_df):
+        from pyspark.sql import DataFrame
         super().__init__(spark=spark)
         self.mode = tnode_config['options'].get('mode', 'w')
         # self.stream = tnode_config['stream']
datapipelab/app/node/sink/teams_notification_node.py
CHANGED
@@ -1,10 +1,11 @@
-from pyspark.sql import DataFrame
 from datapipelab.app.node.tnode import TNode
-
+


 class TeamsNotificationSinkNode(TNode):
     def __init__(self, spark, tnode_config, df=None):
+        from pyspark.sql import DataFrame
+        import json
         super().__init__(spark=spark)
         self.teams_msg_body = tnode_config['options']['teams_msg_body']
         self.teams_msg_title = tnode_config['options'].get('teams_msg_title', 'Notification')
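The five sink diffs above all make the same change: the `pyspark.sql` (and, for the Delta and Teams sinks, `delta.tables` and `json`) imports move from module level into `__init__`, so importing the sink modules no longer requires those packages to be installed; they are needed only when a node is actually constructed. A generic sketch of the pattern, illustrative and not part of the package:

```python
# Deferred-import pattern shared by the sink nodes: the heavy dependency is resolved
# only when the node is instantiated, keeping module import cheap.
class SomeSinkNode(TNode):  # TNode as in datapipelab.app.node.tnode
    def __init__(self, spark, tnode_config, t_df):
        from pyspark.sql import DataFrame  # import deferred to construction time
        super().__init__(spark=spark)
```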
datapipelab/pipeline_config.py
CHANGED
@@ -6,7 +6,7 @@ class PipelineConfig:
         print(config_file)
         self.params = params
         self.config_file = config_file
-        # If config file is instance of string, it is a path to the config file
+        # If the config file is an instance of string, it is a path to the config file
         self.config_file = config_file
         if isinstance(self.config_file, str):
             self.load_json_config_file()
@@ -27,6 +27,14 @@ class PipelineConfig:
             json_config_file = json_config_file.replace(f"{{{key}}}", value)
         # Convert to JSON file
         self.pipeline_settings = json5.loads(json_config_file)
+        if len(self.pipeline_settings) > 0 and self.pipeline_settings[0]['type'] == 'import':
+            self.import_json_config_file()
+
+    def import_json_config_file(self):
+        import_pipeline_settings = []
+        for import_component in self.pipeline_settings:
+            if import_component['type'] == '':  # Maybe someone wants to use import in the middle of his config?
+                pass

     def create_pipeline_nodes(self):
         for component in self.pipeline_settings:
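The new hook checks only the first component of the parsed settings: if its `type` is `'import'`, `import_json_config_file()` is invoked. As added here the method is a stub (it builds an empty `import_pipeline_settings` list and passes over components whose `type` is `''`), so the feature looks like scaffolding. A hedged sketch of a parsed config that would trigger the hook; the `path` key and the second entry are guesses, since the stub does not read any import details yet:

```python
# Hypothetical parsed pipeline_settings that would trigger import_json_config_file().
pipeline_settings = [
    {'type': 'import', 'path': 'common_nodes.json5'},                 # 'path' is hypothetical
    {'type': 'processor', 'name': 'step_1', 'options': {'query': 'SELECT 1'}},  # placeholder node
]
```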
datapipelab/pipeline_handler.py
CHANGED
@@ -6,6 +6,8 @@ from datapipelab.app.node.sink.delta_node import DeltaSinkNode
 from datapipelab.app.node.sink.csv_node import CSVSinkNode
 from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
 from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+from datapipelab.app.node.processor.bigquery_spark_node import BigQuerySparkProcessorNode
+from datapipelab.app.node.processor.bigquery_api_node import BigQueryAPIProcessorNode


 class PipelineHandler:
@@ -42,10 +44,12 @@ class PipelineHandler:
         processor_df = None
         if tnode_format == 'custom':
             processor_df = CustomNode(self.spark, tnode_config).run()
-
         if tnode_format == 'query':
             processor_df = SparkProcessorNode(self.spark, tnode_config).run()
-
+        if tnode_format == 'bigquery_api':
+            processor_df = BigQueryAPIProcessorNode(self.spark, tnode_config).run()
+        if tnode_format == 'bigquery_spark':
+            processor_df = BigQuerySparkProcessorNode(self.spark, tnode_config).run()
         return processor_df

     def write_sink_node(self, tnode_config, t_df):
@@ -70,5 +74,8 @@ class PipelineHandler:
         if tnode_type == "local":
             if tnode_format == "csv":
                 PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+        if tnode_type == 'custom':
+            from datapipelab.app.node import custom_node
+            processor_df = custom_node.CustomNode(self.spark, tnode_config).run()


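In the processor dispatch, `tnode_format` values `'bigquery_api'` and `'bigquery_spark'` now route to the two new BigQuery nodes alongside the existing `'custom'` and `'query'` branches. In `write_sink_node`, a sink whose `tnode_type` is `'custom'` lazily imports `datapipelab.app.node.custom_node`; note that this branch calls `CustomNode(self.spark, tnode_config)` without the `t_df` argument that the new `CustomNode.__init__` declares, so as published it would raise a TypeError if exercised.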
{datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/METADATA
CHANGED
@@ -1,11 +1,8 @@
 Metadata-Version: 2.4
 Name: datapipelab
-Version: 0.1.5
+Version: 0.1.7
 Summary: A data pipeline library with connectors, sources, processors, and sinks.
 Requires-Dist: json5
 Requires-Dist: loguru
-Requires-Dist: azure-storage-blob
-Requires-Dist: google-cloud-storage
-Requires-Dist: pandas
 Dynamic: requires-dist
 Dynamic: summary
{datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/RECORD
CHANGED
@@ -2,26 +2,30 @@ datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipelab/engine.py,sha256=3QRsedRYNov6xIDOZ1tukinFE-SKv39Fn3sNCnD3L6g,442
 datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
 datapipelab/pipeline.py,sha256=dw9D9KM_hztt9g_YzqoNgQBRyCYR92cRZwrU5duP_Pg,1464
-datapipelab/pipeline_config.py,sha256=
-datapipelab/pipeline_handler.py,sha256=
+datapipelab/pipeline_config.py,sha256=2bFAJepViE7rT7CaRANZU07aeQpOYcZ954ISujm9pXA,3816
+datapipelab/pipeline_handler.py,sha256=lQv6HwwdgZDQvICgABtWiuvZQ9jG9cJjy8s_7qLZr9s,3871
 datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/custom_node.py,sha256=VvjwkECTobRhO_fYKUrJCd117B5MoR9P6UKYZfRLhV4,1017
 datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
 datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/processor/bigquery_api_node.py,sha256=2kF6hgYOUi7te-aXXr-cTZfkvitBDEPJAHT8KG-i8fE,1293
+datapipelab/app/node/processor/bigquery_spark_node.py,sha256=t8JJqMWTZwBuJUKV7-l72ZLdLVoHHSFJHFNovAY-2nc,1179
 datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
+datapipelab/app/node/processor/shell_node.py,sha256=s3dKgfEqbpUIEiwORERgvp7FNDE5JkFHBo7EnJYBPnA,669
 datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
 datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datapipelab/app/node/sink/csv_node.py,sha256=
-datapipelab/app/node/sink/delta_node.py,sha256=
-datapipelab/app/node/sink/hive_node.py,sha256=
-datapipelab/app/node/sink/pandas_csv_node.py,sha256=
-datapipelab/app/node/sink/teams_notification_node.py,sha256=
+datapipelab/app/node/sink/csv_node.py,sha256=d2hyufP5_Nmql0pfD0KeC4rFu1wXTnBxVsoGl7sWbhM,1681
+datapipelab/app/node/sink/delta_node.py,sha256=iKEdiTjJ7SHJZMrbm0jR5tms5JZ5iCFfQklZbI-Yr2o,2044
+datapipelab/app/node/sink/hive_node.py,sha256=BKSSYb1AexQD1Jl6zP5ak_ibvd0wYRNzC_zhTtNTDRg,1106
+datapipelab/app/node/sink/pandas_csv_node.py,sha256=JsJFt2XRpwxGeJyt_PDUgqZafiQROf1Sk5TUhQPxh4c,870
+datapipelab/app/node/sink/teams_notification_node.py,sha256=6ZufdbhVvRXi3QTQafLo5uKl9kLyDnkYIE_VZFT0QNw,3581
 datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
 datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
 datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
-datapipelab-0.1.
-datapipelab-0.1.
-datapipelab-0.1.
-datapipelab-0.1.
+datapipelab-0.1.7.dist-info/METADATA,sha256=oiRy9y4GTRDU-Yiz3EQJc831TRgaTiMRXOaNimFllTQ,220
+datapipelab-0.1.7.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+datapipelab-0.1.7.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
+datapipelab-0.1.7.dist-info/RECORD,,
{datapipelab-0.1.5.dist-info → datapipelab-0.1.7.dist-info}/top_level.txt
File without changes