datapipelab 0.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipelab/app/node/source/__init__.py +0 -0
- datapipelab/engine.py +14 -0
- datapipelab/logger.py +35 -0
- datapipelab/pipeline.py +36 -0
- datapipelab/pipeline_config.py +71 -0
- datapipelab/pipeline_handler.py +74 -0
- {datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/METADATA +1 -1
- datapipelab-0.1.2.dist-info/RECORD +27 -0
- datapipelab-0.1.2.dist-info/top_level.txt +1 -0
- datapipelab-0.1.dist-info/RECORD +0 -21
- datapipelab-0.1.dist-info/top_level.txt +0 -1
- {app → datapipelab}/__init__.py +0 -0
- {app/connector_node → datapipelab/app}/__init__.py +0 -0
- {app/node → datapipelab/app/connector_node}/__init__.py +0 -0
- {app/node/processor → datapipelab/app/node}/__init__.py +0 -0
- {app/node/sink → datapipelab/app/node/processor}/__init__.py +0 -0
- {app → datapipelab/app}/node/processor/custom_node.py +0 -0
- {app → datapipelab/app}/node/processor/spark_node.py +0 -0
- {app/node/source → datapipelab/app/node/sink}/__init__.py +0 -0
- {app → datapipelab/app}/node/sink/csv_node.py +0 -0
- {app → datapipelab/app}/node/sink/delta_node.py +0 -0
- {app → datapipelab/app}/node/sink/hive_node.py +0 -0
- {app → datapipelab/app}/node/sink/pandas_csv_node.py +0 -0
- {app → datapipelab/app}/node/sink/teams_notification_node.py +0 -0
- {app → datapipelab/app}/node/source/delta_node.py +0 -0
- {app → datapipelab/app}/node/source/hive_node.py +0 -0
- {app → datapipelab/app}/node/source/spark_node.py +0 -0
- {app → datapipelab/app}/node/tnode.py +0 -0
- {datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/WHEEL +0 -0
datapipelab/app/node/source/__init__.py
ADDED
File without changes
datapipelab/engine.py
ADDED
@@ -0,0 +1,14 @@
+from datapipelab.pipeline import Pipeline
+from datapipelab.logger import logger
+
+
+class Engine:
+    def __init__(self, engine_config_path, spark, params):
+        self.engine_config_path = engine_config_path
+        self.params = params
+        self.pipeline = None
+        self.spark = spark
+
+    def running_travelers(self):
+        self.pipeline = Pipeline(self.engine_config_path, self.spark, self.params)
+        self.pipeline.run()
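
Usage sketch (not part of the diff): based on the constructor and running_travelers() above, a driver script might look like the following; the config path, params, and Spark session setup are hypothetical placeholders.

from pyspark.sql import SparkSession
from datapipelab.engine import Engine

# Hypothetical driver: assumes a JSON5 pipeline config on disk
# (see pipeline_config.py below) and an available Spark environment.
spark = SparkSession.builder.appName("datapipelab-demo").getOrCreate()
params = {"run_date": "2024-01-01"}  # substituted into {run_date} placeholders in the config
engine = Engine("/path/to/pipeline.json5", spark, params)
engine.running_travelers()  # builds the Pipeline and runs sources -> processors -> sinks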
datapipelab/logger.py
ADDED
@@ -0,0 +1,35 @@
+import sys
+from loguru import logger
+from pathlib import Path
+
+# Create logs directory
+log_dir = Path(__file__).resolve().parent / "logs"
+log_dir.mkdir(exist_ok=True)
+
+# Log file path
+log_file = log_dir / "app.log"
+
+# Remove the default loguru handler
+logger.remove()
+
+# Add combined rotation: size and time
+logger.add(
+    str(log_file),
+    rotation="10 MB",      # Rotate if file exceeds 10MB
+    retention=100,         # Keep last 100 files
+    compression="zip",     # Compress rotated logs as .zip
+    enqueue=True,
+    backtrace=True,
+    diagnose=True,
+    level="INFO",
+    serialize=True
+)
+
+# Add console handler
+logger.add(
+    sink=sys.stdout,       # Output to console
+    level="INFO",          # Set log level
+    backtrace=True,
+    diagnose=True,
+    colorize=True          # Enable colored output
+)
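
Because the handlers are registered at import time, the rest of the package simply re-imports the shared logger object (as engine.py above and pipeline.py below do). A minimal usage sketch, assuming nothing beyond loguru's standard API:

from datapipelab.logger import logger

logger.info("pipeline started")  # written to logs/app.log (serialized JSON, rotated) and to stdout
try:
    1 / 0
except ZeroDivisionError:
    logger.exception("processor failed")  # backtrace/diagnose produce an annotated traceback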
datapipelab/pipeline.py
ADDED
@@ -0,0 +1,36 @@
+from datapipelab.pipeline_config import PipelineConfig
+from datapipelab.pipeline_handler import PipelineHandler
+from datapipelab.logger import logger
+
+class Pipeline:
+    def __init__(self, pipeline_config_path, spark, config_params=None):
+        self.pipeline_config = None
+        self.pipeline_config_path = pipeline_config_path
+        self.params = config_params
+        self.__load_config()
+        self.spark = spark
+
+    def __load_config(self):
+        self.pipeline_config = PipelineConfig(self.pipeline_config_path, self.params)
+        self.pipeline_config.create_pipeline_nodes()
+
+    def __process(self):
+        logger.info('Fetch sources...')
+        print(self.pipeline_config.sources)
+        tnode = PipelineHandler(self.spark)
+        self.t_df = {}
+        for source in self.pipeline_config.sources:
+            self.t_df[source] = tnode.create_source_node(source, self.pipeline_config.sources[source])
+
+        logger.info('Running Processors...')
+        print(self.pipeline_config.processors)
+        for processor in self.pipeline_config.processors:
+            self.t_df[processor] = tnode.create_processor_node(processor, self.pipeline_config.processors[processor], self.t_df)
+
+        logger.info('Write into sinks...')
+        print(self.pipeline_config.sinks)
+        for sink in self.pipeline_config.sinks:
+            tnode.write_sink_node(self.pipeline_config.sinks[sink], self.t_df)
+
+    def run(self):
+        self.__process()
datapipelab/pipeline_config.py
ADDED
@@ -0,0 +1,71 @@
+import json5
+
+
+class PipelineConfig:
+    def __init__(self, config_file, params):
+        print(config_file)
+        self.params = params
+        self.config_file = config_file
+        self.load_json_config_file()
+        self.sources = {}
+        self.processors = {}
+        self.sinks = {}
+
+    def load_json_config_file(self):
+        with open(self.config_file, 'r') as f:
+            # Read the file as a text file
+            json_config_file = f.read()
+        # Replace the placeholders with the actual values
+        for key, value in self.params.items():
+            json_config_file = json_config_file.replace(f"{{{key}}}", value)
+        # Convert to JSON file
+        self.pipeline_settings = json5.loads(json_config_file)
+
+    def create_pipeline_nodes(self):
+        for component in self.pipeline_settings:
+            if component['type'] == 'source':
+                source = {
+                    'input_format': component['format'],
+                    'name': component['name'],
+                    'input_type': component['source'],
+                    'options': component['options']
+                }
+                self.sources[component['name']] = source
+            elif component['type'] == 'processor':
+                processor = {
+                    'format': component['format'],
+                    'name': component['name'],
+                    'options': component['options']
+                }
+                parents = component['options'].get('parents', [])
+                parent_sources = []
+                for parent in parents:
+                    parent_source = self.sources.get(parent)
+                    if not parent_source:
+                        parent_processor = self.processors.get(parent)
+                        if not parent_processor:
+                            raise ValueError(f"Parent '{parent}' not found")
+                        parent_source = parent_processor['source']
+                    parent_sources.append(parent_source)
+                processor['source'] = parent_sources
+                self.processors[component['name']] = processor
+            elif component['type'] == 'sink':
+                sink = {
+                    'output_format': component['format'],
+                    'name': component['name'],
+                    'output_type': component['sink'],
+                    'options': component['options']
+                }
+                parent = component['options']['parents'][0]
+                parent_source = self.sources.get(parent)
+                if not parent_source:
+                    parent_processor = self.processors.get(parent)
+                    if not parent_processor:
+                        raise ValueError(f"Parent '{parent}' not found")
+                    parent_source = parent_processor['source']
+                sink['source'] = parent_source
+                self.sinks[component['name']] = sink
+            else:
+                raise ValueError(f"Invalid component type '{component['type']}'")
+
+
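
The keys read by create_pipeline_nodes() imply the config schema: a JSON5 array of components, each with type, name, format and options, plus source for sources, sink for sinks, and options.parents for downstream nodes. A minimal sketch follows (not part of the diff); the component names, paths, query text, and option keys such as path and query are hypothetical, since the node implementations are not shown here.

import json5
import tempfile
from datapipelab.pipeline_config import PipelineConfig

# Hypothetical pipeline definition covering one source, one processor, one sink.
config_text = """
[
    { type: 'source',    name: 'orders_raw',   source: 'adls_path', format: 'delta',
      options: { path: '{input_path}' } },                        // 'path' key is an assumption
    { type: 'processor', name: 'orders_clean', format: 'query',
      options: { parents: ['orders_raw'], query: 'SELECT * FROM orders_raw' } },
    { type: 'sink',      name: 'orders_out',   sink: 'adls_path', format: 'delta',
      options: { parents: ['orders_clean'], path: '{output_path}' } }
]
"""

with tempfile.NamedTemporaryFile("w", suffix=".json5", delete=False) as f:
    f.write(config_text)
    config_path = f.name

# Placeholders like {input_path} are string-replaced with params before json5 parsing.
cfg = PipelineConfig(config_path, {"input_path": "/tmp/in", "output_path": "/tmp/out"})
cfg.create_pipeline_nodes()
print(cfg.sources, cfg.processors, cfg.sinks)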
datapipelab/pipeline_handler.py
ADDED
@@ -0,0 +1,74 @@
+from datapipelab.app.node.processor.custom_node import CustomNode
+from datapipelab.app.node.source.spark_node import SparkSourceNode
+from datapipelab.app.node.source.delta_node import DeltaSourceNode
+from datapipelab.app.node.processor.spark_node import SparkProcessorNode
+from datapipelab.app.node.sink.delta_node import DeltaSinkNode
+from datapipelab.app.node.sink.csv_node import CSVSinkNode
+from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
+from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+
+
+class PipelineHandler:
+    def __init__(self, spark=None):
+        self.spark = spark
+
+    def create_source_node(self, tnode_name, tnode_config):
+        input_type = tnode_config['input_type']
+        input_format = tnode_config['input_format']
+        print(tnode_name, input_type, input_format, tnode_config)
+
+        source_df = None
+        if input_type == 'SharedDrive':
+            if input_format == 'excel':
+                source_df = CTCSMBReaderSourceNode(tnode_config).run()
+        if input_type == "Oracle":
+            if input_format == "query":
+                source_df = OracleSourceNode(tnode_config).run()
+        if input_type == "SharePoint":
+            if input_format == "csv":
+                source_df = SharePointSourceNode(tnode_config).run()
+        if input_type == 'spark':
+            if input_format == 'spark':
+                source_df = SparkSourceNode(self.spark, tnode_config).run()
+        if input_type == 'adls_path':
+            if input_format == 'delta':
+                source_df = DeltaSourceNode(self.spark, tnode_config).run()
+
+        return source_df
+
+    def create_processor_node(self, tnode_name, tnode_config, t_df):
+        tnode_format = tnode_config['format']
+        print(tnode_name, tnode_format, tnode_config)
+        processor_df = None
+        if tnode_format == 'custom':
+            processor_df = CustomNode(self.spark, tnode_config).run()
+
+        if tnode_format == 'query':
+            processor_df = SparkProcessorNode(self.spark, tnode_config).run()
+
+        return processor_df
+
+    def write_sink_node(self, tnode_config, t_df):
+        tnode_type = tnode_config['output_type']
+        tnode_format = tnode_config['output_format']
+        tnode_name_df = tnode_config['options']['parents'][0]
+        print(tnode_type, tnode_format, tnode_name_df)
+
+        if tnode_type == 'SharePoint':
+            if tnode_format == 'csv':
+                sharepoint_sink = SharePointSinkNode(tnode_config, t_df).run()
+                print(sharepoint_sink)
+        # HiveSinkNode(self.spark, tnode_config, tnode_df[tnode_name_df]).run()
+        if tnode_type == "teams":
+            if tnode_format == "channel_notification":
+                TeamsNotificationSinkNode(self.spark, tnode_config, t_df).run()  # TODO: Spark can be set to None
+        if tnode_type == "adls_path":
+            if tnode_format == "delta":
+                DeltaSinkNode(self.spark, tnode_config, t_df).run()
+            if tnode_format == "csv":
+                CSVSinkNode(self.spark, tnode_config, t_df).run()
+        if tnode_type == "local":
+            if tnode_format == "csv":
+                PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+
+
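
The handler dispatches on the dicts that PipelineConfig builds; below is a sketch of the adls_path/delta source and sink cases with hypothetical option values (the option keys consumed by DeltaSourceNode and DeltaSinkNode are not visible in this diff).

from datapipelab.pipeline_handler import PipelineHandler

handler = PipelineHandler(spark)  # assumes an existing SparkSession

source_cfg = {
    'input_type': 'adls_path', 'input_format': 'delta', 'name': 'orders_raw',
    'options': {'path': 'abfss://container@account.dfs.core.windows.net/orders'},  # hypothetical
}
t_df = {'orders_raw': handler.create_source_node('orders_raw', source_cfg)}  # -> DeltaSourceNode

sink_cfg = {
    'output_type': 'adls_path', 'output_format': 'delta', 'name': 'orders_out',
    'options': {'parents': ['orders_raw'], 'path': '/mnt/out/orders'},  # hypothetical
}
handler.write_sink_node(sink_cfg, t_df)  # -> DeltaSinkNode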
datapipelab-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,27 @@
+datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/engine.py,sha256=dYm39Yb0Eqt76pwdc-ilzZNxehMKdiuidE557YexHaU,437
+datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
+datapipelab/pipeline.py,sha256=kX_Wy-l6DZ7ega3OKk-8ZV14aQlOCNMcxX6AZ6bIHSU,1464
+datapipelab/pipeline_config.py,sha256=MyEV2jz54_ZcEAgwmMR91KP8XG8W5raGoTRWctSeIrI,3038
+datapipelab/pipeline_handler.py,sha256=1t5wwsaVUMXXmsEa-Qt-6jtMIyAZmX1hgo2I_UgbtiM,3265
+datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
+datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
+datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
+datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
+datapipelab/app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
+datapipelab/app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
+datapipelab/app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
+datapipelab/app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
+datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
+datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
+datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
+datapipelab-0.1.2.dist-info/METADATA,sha256=p8_ybC58-fobA9s_WGLjfux77Ju4aroL2UNMHMx4kHA,312
+datapipelab-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+datapipelab-0.1.2.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
+datapipelab-0.1.2.dist-info/RECORD,,
datapipelab-0.1.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+datapipelab
datapipelab-0.1.dist-info/RECORD
DELETED
@@ -1,21 +0,0 @@
-app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
-app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
-app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
-app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
-app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
-app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
-app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
-app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
-app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
-app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
-app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
-datapipelab-0.1.dist-info/METADATA,sha256=IYnShuv2Fa5EVRfJSdJhT7kZzT4Nedr9UIEafmrP2YM,310
-datapipelab-0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-datapipelab-0.1.dist-info/top_level.txt,sha256=io9g7LCbfmTG1SFKgEOGXmCFB9uMP2H5lerm0HiHWQE,4
-datapipelab-0.1.dist-info/RECORD,,
datapipelab-0.1.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-app
{app → datapipelab}/__init__.py
RENAMED
File without changes

The remaining renamed files listed above (the {app → datapipelab/app} package modules and the {datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/WHEEL) likewise contain no content changes.