datapipelab-0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ include README.md
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: datapipelab
+ Version: 0.1
+ Summary: A data pipeline library with connectors, sources, processors, and sinks.
+ Requires-Dist: json5
+ Requires-Dist: loguru
+ Requires-Dist: azure-storage-blob
+ Requires-Dist: google-cloud-storage
+ Requires-Dist: pandas
+ Dynamic: requires-dist
+ Dynamic: summary
@@ -0,0 +1 @@
+ # DataPipeLab
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,21 @@
+ from datapipelab.app.node.tnode import TNode
+ from datapipelab.logger import logger
+
+ class CustomNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.tnode_config = tnode_config
+         self.spark = spark
+         module_name = tnode_config['options']['module_name']
+         module_path = tnode_config['options']['module_path']
+         class_name = tnode_config['options']['class_name']
+         self.custom_processor = self.import_module(module_name, module_path, class_name)
+
+     def import_module(self, module_name, module_path, class_name):
+         custom_module = __import__(module_path, fromlist=[module_name])
+         custom_class = getattr(custom_module, class_name)
+         return custom_class(self.spark, self.tnode_config)  # .create_instance(self.t_df)
+
+     def _process(self):
+         logger.info("Custom node process")
+         return self.custom_processor.process()
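For orientation, the sketch below shows what a user-defined processor and its node configuration could look like for CustomNode. It is a minimal, illustrative example: the option keys mirror those read in __init__ above, while the module, class, and node names are purely hypothetical and not part of this package.

# Hypothetical user module (e.g. my_jobs/word_count.py) -- not shipped with datapipelab.
class WordCountProcessor:
    def __init__(self, spark, tnode_config):
        self.spark = spark
        self.tnode_config = tnode_config

    def process(self):
        # CustomNode._process() returns whatever this method returns.
        return self.spark.sql("SELECT 'hello' AS word, 1 AS n")

# Matching configuration, using the keys CustomNode reads from tnode_config['options']:
custom_node_config = {
    'name': 'word_count',  # illustrative node name
    'options': {
        'module_path': 'my_jobs.word_count',  # dotted import path (hypothetical)
        'module_name': 'word_count',
        'class_name': 'WordCountProcessor',
    },
}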
@@ -0,0 +1,16 @@
+ from datapipelab.app.node.tnode import TNode
+
+
+ class SparkProcessorNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+
+     def __sql_query(self, sql_query):
+         self.node = self.spark.sql(sql_query)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
File without changes
@@ -0,0 +1,33 @@
+ from pyspark.sql import DataFrame
+ from datapipelab.app.node.tnode import TNode
+
+
+ class CSVSinkNode(TNode):
+     def __init__(self, spark, tnode_config, t_df):
+         super().__init__(spark=spark)
+         self.output_path = tnode_config['options']['path']
+         self.partition_by = tnode_config['options'].get('partition_by')
+         self.partition_count = tnode_config['options'].get('partition_count', 1)
+         self.overwrite = tnode_config['options'].get('overwrite', False)
+         self.header = tnode_config['options'].get('header', True)
+         self.df = t_df[tnode_config['options']['parents'][0]]
+         self.quote_all = tnode_config['options'].get('quote_all', False)
+         self.ignore_leading_white_space = tnode_config['options'].get('ignore_leading_white_space', True)
+         self.ignore_trailing_white_space = tnode_config['options'].get('ignore_trailing_white_space', True)
+
+     def __write_csv(self):
+         if self.partition_count:
+             if self.partition_by:
+                 self.df = self.df.repartition(int(self.partition_count), *self.partition_by)
+             else:
+                 self.df = self.df.repartition(int(self.partition_count))
+
+         write_mode = "overwrite" if self.overwrite else "errorifexists"
+
+         (self.df.write.mode(write_mode)
+             .option("quoteAll", self.quote_all).option("ignoreLeadingWhiteSpace", self.ignore_leading_white_space)
+             .option("ignoreTrailingWhiteSpace", self.ignore_trailing_white_space).option("header", self.header)
+             .csv(self.output_path))
+
+     def _process(self):
+         self.__write_csv()
@@ -0,0 +1,50 @@
+ from pyspark.sql import DataFrame
+ from datapipelab.app.node.tnode import TNode
+ from delta.tables import DeltaTable
+
+
+ class DeltaSinkNode(TNode):
+     def __init__(self, spark, tnode_config, t_df):
+         super().__init__(spark=spark)
+         self.mode = tnode_config['options']['mode']  # Can be 'append', 'overwrite', or 'upsert'
+         self.partition_by = tnode_config['options'].get('partition_by')
+         self.partition_count = tnode_config['options'].get('partition_count')
+         self.df = t_df[tnode_config['options']['parents'][0]]
+         self.delta_table_path = tnode_config['options']['path']  # Path to the Delta table
+         self.primary_key = tnode_config['options'].get('primary_key', None)
+
+     def __write_append(self):
+         if self.partition_count and self.partition_by:
+             self.df = self.df.repartition(int(self.partition_count), self.partition_by)
+         elif self.partition_count:
+             self.df = self.df.repartition(int(self.partition_count))
+         self.df.write.format("delta").mode("append").save(self.delta_table_path)
+
+     def __write_overwrite(self):
+         if self.partition_count and self.partition_by:
+             self.df = self.df.repartition(int(self.partition_count), self.partition_by)
+         elif self.partition_count:
+             self.df = self.df.repartition(int(self.partition_count))
+         self.df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(self.delta_table_path)
+
+     def __write_upsert(self):
+         delta_table = DeltaTable.forPath(self.spark, self.delta_table_path)
+         primary_key = self.primary_key
+         if primary_key is None:
+             raise ValueError("Primary key must be provided for upsert mode")
+         delta_table.alias("target").merge(
+             self.df.alias("source"),
+             " AND ".join([f"target.{key} = source.{key}" for key in primary_key])
+         ).whenMatchedUpdateAll(
+         ).whenNotMatchedInsertAll(
+         ).execute()
+
+     def _process(self):
+         if self.mode == 'append':
+             self.__write_append()
+         elif self.mode == 'overwrite':
+             self.__write_overwrite()
+         elif self.mode == 'upsert':
+             self.__write_upsert()
+         else:
+             raise ValueError(f"Unsupported mode: {self.mode}")
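As an illustration of how this sink might be configured, the sketch below uses only the option keys read in __init__ above; the path, key column, and node names are hypothetical.

# Hypothetical upsert configuration for DeltaSinkNode (names and paths are illustrative).
delta_sink_config = {
    'name': 'orders_delta_sink',
    'options': {
        'mode': 'upsert',                  # 'append', 'overwrite', or 'upsert'
        'path': '/mnt/lake/orders_delta',  # Delta table location
        'primary_key': ['order_id'],       # joined into the MERGE ON condition for upserts
        'parents': ['orders_src'],         # upstream node whose DataFrame gets written
    },
}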
@@ -0,0 +1,26 @@
+ from pyspark.sql import DataFrame
+ from datapipelab.app.node.tnode import TNode
+
+
+ class HiveSinkNode(TNode):
+     def __init__(self, spark, tnode_config, df):
+         super().__init__(spark=spark)
+         self.mode = tnode_config['mode']
+         self.stream = tnode_config['stream']
+         self.database_name = tnode_config['options']['database']
+         self.table_name = tnode_config['options']['table']
+         self.partition_by = tnode_config['options'].get('partition_by')
+         self.partition_count = tnode_config['options'].get('partition_count')
+         self.overwrite = tnode_config['options']['overwrite']
+         self.df = df
+
+     def __write_dynamic_partition(self):
+         if self.partition_count:
+             if self.partition_by:
+                 self.df = self.df.repartition(int(self.partition_count), self.partition_by)
+             else:
+                 self.df = self.df.repartition(int(self.partition_count))
+         self.df.write.insertInto(f'{self.database_name}.{self.table_name}', overwrite=self.overwrite)
+
+     def _process(self):
+         self.__write_dynamic_partition()
@@ -0,0 +1,23 @@
+ from pyspark.sql import DataFrame
+ from datapipelab.app.node.tnode import TNode
+
+
+ class PandasCSVSinkNode(TNode):
+     def __init__(self, spark, tnode_config, t_df):
+         super().__init__(spark=spark)
+         self.mode = tnode_config['options'].get('mode', 'w')
+         # self.stream = tnode_config['stream']
+         self.output_path = tnode_config['options']['path']
+         self.overwrite = tnode_config['options'].get('overwrite', False)
+         self.header = tnode_config['options'].get('header', True)
+         self.df = t_df[tnode_config['options']['parents'][0]]
+
+     def __write_csv(self):
+         import pandas as pd
+         pandas_df = self.df.toPandas()
+         write_mode = "w" if self.overwrite else "x"
+
+         pandas_df.to_csv(self.output_path, mode=write_mode, header=self.header, index=False)
+
+     def _process(self):
+         self.__write_csv()
@@ -0,0 +1,83 @@
+ from pyspark.sql import DataFrame
+ from datapipelab.app.node.tnode import TNode
+ import json
+
+
+ class TeamsNotificationSinkNode(TNode):
+     def __init__(self, spark, tnode_config, df=None):
+         super().__init__(spark=spark)
+         self.teams_msg_body = tnode_config['options']['teams_msg_body']
+         self.teams_msg_title = tnode_config['options'].get('teams_msg_title', 'Notification')
+         self.teams_users = tnode_config['options'].get('teams_users', None)
+         self.teams_channel_webhook_url = tnode_config['options']['teams_channel_webhook_url']
+         self.df = df
+
+     def __prepare_teams_notification_payload(self, teams_msg_body: list, teams_msg_title: str = "Notification",
+                                              teams_users: list = None):
+         if teams_users is not None:
+             teams_msg_body.extend([f'<at>{user}</at>' for user in teams_users])
+             print(teams_msg_body)
+             final_msg = ' \n'.join(teams_msg_body)
+             entities = []
+             for user_id in teams_users:
+                 mention = {
+                     "type": "mention",
+                     "text": f"<at>{user_id}</at>",
+                     "mentioned": {
+                         "id": f"{user_id}@cantire.com",
+                         "name": f"{user_id}"
+                     }
+                 }
+                 entities.append(mention)
+             payload = {
+                 "type": "message",
+                 "attachments": [
+                     {
+                         "contentType": "application/vnd.microsoft.card.adaptive",
+                         "content": {
+                             "type": "AdaptiveCard",
+                             "body": [
+                                 {
+                                     "type": "TextBlock",
+                                     "size": "Medium",
+                                     "weight": "Bolder",
+                                     "text": f"{teams_msg_title}"
+                                 },
+                                 {
+                                     "type": "TextBlock",
+                                     "text": f"{final_msg}",
+                                     "wrap": "true",
+                                     "maxLines": 0
+                                 }
+                             ],
+                             "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
+                             "version": "1.0",
+                             "msteams": {
+                                 "entities": entities
+                             }
+                         }
+                     }]
+             }
+         else:
+             final_msg = ' \n'.join(teams_msg_body)
+             payload = {
+                 "text": f"{final_msg}",
+                 "title": f"{teams_msg_title}"
+             }
+         return payload
+
+     def __send_teams_notification(self):
+         import requests
+         payload = self.__prepare_teams_notification_payload(self.teams_msg_body, self.teams_msg_title, self.teams_users)
+         headers = {"Content-Type": "application/json"}
+         try:
+             response = requests.post(self.teams_channel_webhook_url, headers=headers, data=json.dumps(payload))
+             if response.status_code == 200:
+                 print("Message sent successfully!")
+             else:
+                 print(f"Failed to send message: {response.status_code}, {response.text}")
+         except Exception as e:
+             print(f"An error occurred: {e}")
+
+     def _process(self):
+         self.__send_teams_notification()
File without changes
@@ -0,0 +1,16 @@
+ from datapipelab.app.node.tnode import TNode
+
+
+ class DeltaSourceNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.delta_table_path = tnode_config['options']['path']
+         self.node_name = tnode_config['name']
+
+     def __sql_query(self, delta_table_path):
+         self.node = self.spark.read.format("delta").load(delta_table_path)
+
+     def _process(self):
+         self.__sql_query(self.delta_table_path)
+         self._createOrReplaceTempView()
+         return self.node
@@ -0,0 +1,16 @@
+ from datapipelab.app.node.tnode import TNode
+
+
+ class HiveSourceNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+
+     def __sql_query(self, sql_query):
+         self.node = self.spark.sql(sql_query)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
@@ -0,0 +1,16 @@
+ from datapipelab.app.node.tnode import TNode
+
+
+ class SparkSourceNode(TNode):
+     def __init__(self, spark, tnode_config):
+         super().__init__(spark=spark)
+         self.sql_query = tnode_config['options']['query']
+         self.node_name = tnode_config['name']
+
+     def __sql_query(self, sql_query):
+         self.node = self.spark.sql(sql_query)
+
+     def _process(self):
+         self.__sql_query(self.sql_query)
+         self._createOrReplaceTempView()
+         return self.node
@@ -0,0 +1,15 @@
+ class TNode:
+     def __init__(self, spark, node_type='SparkDataFrame'):
+         self.node_type = node_type
+         self.node = None
+         self.spark = spark
+
+     def _process(self):
+         raise NotImplementedError("Subclasses must implement _process method")
+
+     # Source and Processor nodes; relies on self.node_name being set by the subclass
+     def _createOrReplaceTempView(self):
+         self.node.createOrReplaceTempView(self.node_name)
+
+     def run(self):
+         return self._process()
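Putting the pieces together, a minimal usage sketch could look like the following; it assumes an existing SparkSession named spark, and the table and node names are hypothetical. SparkSourceNode and run() are defined in the hunks above.

# Minimal usage sketch (illustrative names, not part of the package).
source_config = {
    'name': 'orders_src',                                    # becomes the temp view name
    'options': {'query': 'SELECT * FROM sales_db.orders'},   # hypothetical table
}
node = SparkSourceNode(spark, source_config)
orders_df = node.run()  # runs the query and registers the 'orders_src' temp view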
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: datapipelab
+ Version: 0.1
+ Summary: A data pipeline library with connectors, sources, processors, and sinks.
+ Requires-Dist: json5
+ Requires-Dist: loguru
+ Requires-Dist: azure-storage-blob
+ Requires-Dist: google-cloud-storage
+ Requires-Dist: pandas
+ Dynamic: requires-dist
+ Dynamic: summary
@@ -0,0 +1,25 @@
+ MANIFEST.in
+ README.md
+ setup.py
+ app/__init__.py
+ app/connector_node/__init__.py
+ app/node/__init__.py
+ app/node/tnode.py
+ app/node/processor/__init__.py
+ app/node/processor/custom_node.py
+ app/node/processor/spark_node.py
+ app/node/sink/__init__.py
+ app/node/sink/csv_node.py
+ app/node/sink/delta_node.py
+ app/node/sink/hive_node.py
+ app/node/sink/pandas_csv_node.py
+ app/node/sink/teams_notification_node.py
+ app/node/source/__init__.py
+ app/node/source/delta_node.py
+ app/node/source/hive_node.py
+ app/node/source/spark_node.py
+ datapipelab.egg-info/PKG-INFO
+ datapipelab.egg-info/SOURCES.txt
+ datapipelab.egg-info/dependency_links.txt
+ datapipelab.egg-info/requires.txt
+ datapipelab.egg-info/top_level.txt
@@ -0,0 +1,5 @@
+ json5
+ loguru
+ azure-storage-blob
+ google-cloud-storage
+ pandas
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,16 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name='datapipelab',
+     version='0.1',
+     description='A data pipeline library with connectors, sources, processors, and sinks.',
+     packages=find_packages(),
+     include_package_data=True,
+     install_requires=[
+         'json5',
+         'loguru',
+         'azure-storage-blob',
+         'google-cloud-storage',
+         'pandas'
+     ],
+ )