datapipelab-0.1-py3-none-any.whl → datapipelab-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. datapipelab/app/node/source/__init__.py +0 -0
  2. datapipelab/engine.py +14 -0
  3. datapipelab/logger.py +35 -0
  4. datapipelab/pipeline.py +36 -0
  5. datapipelab/pipeline_config.py +71 -0
  6. datapipelab/pipeline_handler.py +74 -0
  7. {datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/METADATA +1 -1
  8. datapipelab-0.1.2.dist-info/RECORD +27 -0
  9. datapipelab-0.1.2.dist-info/top_level.txt +1 -0
  10. datapipelab-0.1.dist-info/RECORD +0 -21
  11. datapipelab-0.1.dist-info/top_level.txt +0 -1
  12. {app → datapipelab}/__init__.py +0 -0
  13. {app/connector_node → datapipelab/app}/__init__.py +0 -0
  14. {app/node → datapipelab/app/connector_node}/__init__.py +0 -0
  15. {app/node/processor → datapipelab/app/node}/__init__.py +0 -0
  16. {app/node/sink → datapipelab/app/node/processor}/__init__.py +0 -0
  17. {app → datapipelab/app}/node/processor/custom_node.py +0 -0
  18. {app → datapipelab/app}/node/processor/spark_node.py +0 -0
  19. {app/node/source → datapipelab/app/node/sink}/__init__.py +0 -0
  20. {app → datapipelab/app}/node/sink/csv_node.py +0 -0
  21. {app → datapipelab/app}/node/sink/delta_node.py +0 -0
  22. {app → datapipelab/app}/node/sink/hive_node.py +0 -0
  23. {app → datapipelab/app}/node/sink/pandas_csv_node.py +0 -0
  24. {app → datapipelab/app}/node/sink/teams_notification_node.py +0 -0
  25. {app → datapipelab/app}/node/source/delta_node.py +0 -0
  26. {app → datapipelab/app}/node/source/hive_node.py +0 -0
  27. {app → datapipelab/app}/node/source/spark_node.py +0 -0
  28. {app → datapipelab/app}/node/tnode.py +0 -0
  29. {datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/WHEEL +0 -0
datapipelab/engine.py ADDED
@@ -0,0 +1,14 @@
+ from datapipelab.pipeline import Pipeline
+ from datapipelab.logger import logger
+
+
+ class Engine:
+     def __init__(self, engine_config_path, spark, params):
+         self.engine_config_path = engine_config_path
+         self.params = params
+         self.pipeline = None
+         self.spark = spark
+
+     def running_travelers(self):
+         self.pipeline = Pipeline(self.engine_config_path, self.spark, self.params)
+         self.pipeline.run()
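For orientation, a minimal driver sketch (the config path and params dict are illustrative placeholders; `running_travelers()` is the entry point this version exposes):

    from pyspark.sql import SparkSession
    from datapipelab.engine import Engine

    spark = SparkSession.builder.getOrCreate()
    # "env" is a hypothetical placeholder key referenced inside the JSON5 config
    engine = Engine("/path/to/pipeline.json5", spark, params={"env": "dev"})
    engine.running_travelers()  # builds the Pipeline and runs it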
datapipelab/logger.py ADDED
@@ -0,0 +1,35 @@
+ import sys
+ from loguru import logger
+ from pathlib import Path
+
+ # Create logs directory
+ log_dir = Path(__file__).resolve().parent / "logs"
+ log_dir.mkdir(exist_ok=True)
+
+ # Log file path
+ log_file = log_dir / "app.log"
+
+ # Remove the default loguru handler
+ logger.remove()
+
+ # Add combined rotation: size and time
+ logger.add(
+     str(log_file),
+     rotation="10 MB",    # Rotate if file exceeds 10MB
+     retention=100,       # Keep last 100 files
+     compression="zip",   # Compress rotated logs as .zip
+     enqueue=True,
+     backtrace=True,
+     diagnose=True,
+     level="INFO",
+     serialize=True
+ )
+
+ # Add console handler
+ logger.add(
+     sink=sys.stdout,     # Output to console
+     level="INFO",        # Set log level
+     backtrace=True,
+     diagnose=True,
+     colorize=True        # Enable colored output
+ )
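Importing this module applies the configuration once at import time, so every module in the package shares the same handlers. A minimal usage sketch (message text is illustrative):

    from datapipelab.logger import logger

    logger.info("pipeline started")  # reaches both sys.stdout and logs/app.log
    try:
        1 / 0
    except ZeroDivisionError:
        logger.exception("step failed")  # backtrace/diagnose capture the full stack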
datapipelab/pipeline.py ADDED
@@ -0,0 +1,36 @@
+ from datapipelab.pipeline_config import PipelineConfig
+ from datapipelab.pipeline_handler import PipelineHandler
+ from datapipelab.logger import logger
+
+ class Pipeline:
+     def __init__(self, pipeline_config_path, spark, config_params=None):
+         self.pipeline_config = None
+         self.pipeline_config_path = pipeline_config_path
+         self.params = config_params
+         self.__load_config()
+         self.spark = spark
+
+     def __load_config(self):
+         self.pipeline_config = PipelineConfig(self.pipeline_config_path, self.params)
+         self.pipeline_config.create_pipeline_nodes()
+
+     def __process(self):
+         logger.info('Fetch sources...')
+         print(self.pipeline_config.sources)
+         tnode = PipelineHandler(self.spark)
+         self.t_df = {}
+         for source in self.pipeline_config.sources:
+             self.t_df[source] = tnode.create_source_node(source, self.pipeline_config.sources[source])
+
+         logger.info('Running Processors...')
+         print(self.pipeline_config.processors)
+         for processor in self.pipeline_config.processors:
+             self.t_df[processor] = tnode.create_processor_node(processor, self.pipeline_config.processors[processor], self.t_df)
+
+         logger.info('Write into sinks...')
+         print(self.pipeline_config.sinks)
+         for sink in self.pipeline_config.sinks:
+             tnode.write_sink_node(self.pipeline_config.sinks[sink], self.t_df)
+
+     def run(self):
+         self.__process()
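Pipeline can also be driven directly rather than through Engine; a sketch under the same illustrative assumptions (an existing SparkSession and a JSON5 config path):

    from datapipelab.pipeline import Pipeline

    pipeline = Pipeline("/path/to/pipeline.json5", spark, config_params={"env": "dev"})
    pipeline.run()  # fetches sources, runs processors, then writes sinks

Note that the config_params=None default would fail downstream: PipelineConfig.load_json_config_file iterates params.items() unconditionally, so a dict must be supplied.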
datapipelab/pipeline_config.py ADDED
@@ -0,0 +1,71 @@
+ import json5
+
+
+ class PipelineConfig:
+     def __init__(self, config_file, params):
+         print(config_file)
+         self.params = params
+         self.config_file = config_file
+         self.load_json_config_file()
+         self.sources = {}
+         self.processors = {}
+         self.sinks = {}
+
+     def load_json_config_file(self):
+         with open(self.config_file, 'r') as f:
+             # Read the file as a text file
+             json_config_file = f.read()
+             # Replace the placeholders with the actual values
+             for key, value in self.params.items():
+                 json_config_file = json_config_file.replace(f"{{{key}}}", value)
+             # Convert to JSON file
+             self.pipeline_settings = json5.loads(json_config_file)
+
+     def create_pipeline_nodes(self):
+         for component in self.pipeline_settings:
+             if component['type'] == 'source':
+                 source = {
+                     'input_format': component['format'],
+                     'name': component['name'],
+                     'input_type': component['source'],
+                     'options': component['options']
+                 }
+                 self.sources[component['name']] = source
+             elif component['type'] == 'processor':
+                 processor = {
+                     'format': component['format'],
+                     'name': component['name'],
+                     'options': component['options']
+                 }
+                 parents = component['options'].get('parents', [])
+                 parent_sources = []
+                 for parent in parents:
+                     parent_source = self.sources.get(parent)
+                     if not parent_source:
+                         parent_processor = self.processors.get(parent)
+                         if not parent_processor:
+                             raise ValueError(f"Parent '{parent}' not found")
+                         parent_source = parent_processor['source']
+                     parent_sources.append(parent_source)
+                 processor['source'] = parent_sources
+                 self.processors[component['name']] = processor
+             elif component['type'] == 'sink':
+                 sink = {
+                     'output_format': component['format'],
+                     'name': component['name'],
+                     'output_type': component['sink'],
+                     'options': component['options']
+                 }
+                 parent = component['options']['parents'][0]
+                 parent_source = self.sources.get(parent)
+                 if not parent_source:
+                     parent_processor = self.processors.get(parent)
+                     if not parent_processor:
+                         raise ValueError(f"Parent '{parent}' not found")
+                     parent_source = parent_processor['source']
+                 sink['source'] = parent_source
+                 self.sinks[component['name']] = sink
+             else:
+                 raise ValueError(f"Invalid component type '{component['type']}'")
+
+
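The parser expects a JSON5 array of components, each tagged 'source', 'processor', or 'sink', with dependencies wired through options.parents. A hypothetical end-to-end sketch (the component names and option keys such as query and path are invented for illustration; this diff does not show what the node classes actually read):

    import tempfile
    from datapipelab.pipeline_config import PipelineConfig

    cfg = """[
      { type: 'source', name: 'orders', source: 'spark', format: 'spark',
        options: { query: 'SELECT * FROM {db}.orders' } },  // option keys are hypothetical
      { type: 'processor', name: 'daily_totals', format: 'query',
        options: { parents: ['orders'], query: 'SELECT 1' } },
      { type: 'sink', name: 'save_totals', sink: 'adls_path', format: 'delta',
        options: { parents: ['daily_totals'], path: '/tmp/out' } },
    ]"""

    with tempfile.NamedTemporaryFile('w', suffix='.json5', delete=False) as f:
        f.write(cfg)

    # {db} is substituted from the params dict before json5.loads(), as above
    pc = PipelineConfig(f.name, {'db': 'sales'})
    pc.create_pipeline_nodes()
    print(pc.sources.keys(), pc.processors.keys(), pc.sinks.keys())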
datapipelab/pipeline_handler.py ADDED
@@ -0,0 +1,74 @@
+ from datapipelab.app.node.processor.custom_node import CustomNode
+ from datapipelab.app.node.source.spark_node import SparkSourceNode
+ from datapipelab.app.node.source.delta_node import DeltaSourceNode
+ from datapipelab.app.node.processor.spark_node import SparkProcessorNode
+ from datapipelab.app.node.sink.delta_node import DeltaSinkNode
+ from datapipelab.app.node.sink.csv_node import CSVSinkNode
+ from datapipelab.app.node.sink.pandas_csv_node import PandasCSVSinkNode
+ from datapipelab.app.node.sink.teams_notification_node import TeamsNotificationSinkNode
+
+
+ class PipelineHandler:
+     def __init__(self, spark=None):
+         self.spark = spark
+
+     def create_source_node(self, tnode_name, tnode_config):
+         input_type = tnode_config['input_type']
+         input_format = tnode_config['input_format']
+         print(tnode_name, input_type, input_format, tnode_config)
+
+         source_df = None
+         if input_type == 'SharedDrive':
+             if input_format == 'excel':
+                 source_df = CTCSMBReaderSourceNode(tnode_config).run()
+         if input_type == "Oracle":
+             if input_format == "query":
+                 source_df = OracleSourceNode(tnode_config).run()
+         if input_type == "SharePoint":
+             if input_format == "csv":
+                 source_df = SharePointSourceNode(tnode_config).run()
+         if input_type == 'spark':
+             if input_format == 'spark':
+                 source_df = SparkSourceNode(self.spark, tnode_config).run()
+         if input_type == 'adls_path':
+             if input_format == 'delta':
+                 source_df = DeltaSourceNode(self.spark, tnode_config).run()
+
+         return source_df
+
+     def create_processor_node(self, tnode_name, tnode_config, t_df):
+         tnode_format = tnode_config['format']
+         print(tnode_name, tnode_format, tnode_config)
+         processor_df = None
+         if tnode_format == 'custom':
+             processor_df = CustomNode(self.spark, tnode_config).run()
+
+         if tnode_format == 'query':
+             processor_df = SparkProcessorNode(self.spark, tnode_config).run()
+
+         return processor_df
+
+     def write_sink_node(self, tnode_config, t_df):
+         tnode_type = tnode_config['output_type']
+         tnode_format = tnode_config['output_format']
+         tnode_name_df = tnode_config['options']['parents'][0]
+         print(tnode_type, tnode_format, tnode_name_df)
+
+         if tnode_type == 'SharePoint':
+             if tnode_format == 'csv':
+                 sharepoint_sink = SharePointSinkNode(tnode_config, t_df).run()
+                 print(sharepoint_sink)
+         # HiveSinkNode(self.spark, tnode_config, tnode_df[tnode_name_df]).run()
+         if tnode_type == "teams":
+             if tnode_format == "channel_notification":
+                 TeamsNotificationSinkNode(self.spark, tnode_config, t_df).run()  # TODO: Spark can be set to None
+         if tnode_type == "adls_path":
+             if tnode_format == "delta":
+                 DeltaSinkNode(self.spark, tnode_config, t_df).run()
+             if tnode_format == "csv":
+                 CSVSinkNode(self.spark, tnode_config, t_df).run()
+         if tnode_type == "local":
+             if tnode_format == "csv":
+                 PandasCSVSinkNode(self.spark, tnode_config, t_df).run()
+
+
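Note that the 'SharedDrive', 'Oracle', and 'SharePoint' branches reference node classes (CTCSMBReaderSourceNode, OracleSourceNode, SharePointSourceNode, SharePointSinkNode) that are neither imported nor present in this wheel's RECORD, so those paths would raise NameError if reached; only the spark, adls_path, teams, and local branches are backed by bundled modules. A sketch of dispatching through the handler directly (the options key is hypothetical; the diff does not show what DeltaSourceNode reads from its config):

    from pyspark.sql import SparkSession
    from datapipelab.pipeline_handler import PipelineHandler

    spark = SparkSession.builder.getOrCreate()
    handler = PipelineHandler(spark)
    source_cfg = {
        'name': 'orders',
        'input_type': 'adls_path',   # routes to DeltaSourceNode
        'input_format': 'delta',
        'options': {'path': '/mnt/delta/orders'},  # hypothetical option key
    }
    orders_df = handler.create_source_node('orders', source_cfg)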
{datapipelab-0.1.dist-info → datapipelab-0.1.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datapipelab
- Version: 0.1
+ Version: 0.1.2
  Summary: A data pipeline library with connectors, sources, processors, and sinks.
  Requires-Dist: json5
  Requires-Dist: loguru
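The only metadata change is the version bump. Assuming the package is published under this name on a public index, the new build would be picked up with:

    pip install datapipelab==0.1.2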
datapipelab-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,27 @@
+ datapipelab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/engine.py,sha256=dYm39Yb0Eqt76pwdc-ilzZNxehMKdiuidE557YexHaU,437
+ datapipelab/logger.py,sha256=Ugv0A4TfD3JWCWXNWu0lURcnfAEyuVrK3IrvVVgcHBo,864
+ datapipelab/pipeline.py,sha256=kX_Wy-l6DZ7ega3OKk-8ZV14aQlOCNMcxX6AZ6bIHSU,1464
+ datapipelab/pipeline_config.py,sha256=MyEV2jz54_ZcEAgwmMR91KP8XG8W5raGoTRWctSeIrI,3038
+ datapipelab/pipeline_handler.py,sha256=1t5wwsaVUMXXmsEa-Qt-6jtMIyAZmX1hgo2I_UgbtiM,3265
+ datapipelab/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
+ datapipelab/app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
+ datapipelab/app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
+ datapipelab/app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
+ datapipelab/app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
+ datapipelab/app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
+ datapipelab/app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
+ datapipelab/app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
+ datapipelab/app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datapipelab/app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
+ datapipelab/app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
+ datapipelab/app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
+ datapipelab-0.1.2.dist-info/METADATA,sha256=p8_ybC58-fobA9s_WGLjfux77Ju4aroL2UNMHMx4kHA,312
+ datapipelab-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ datapipelab-0.1.2.dist-info/top_level.txt,sha256=HgeBjHvXorKzvNqU5BNPutoI771HtiqVit9_-0Zyrb4,12
+ datapipelab-0.1.2.dist-info/RECORD,,
datapipelab-0.1.2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ datapipelab
datapipelab-0.1.dist-info/RECORD DELETED
@@ -1,21 +0,0 @@
- app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/connector_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/node/tnode.py,sha256=npHG4fFZty5JZ3F_okO9xml-BRhu4DkrZuNE6oaLbvw,446
- app/node/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/node/processor/custom_node.py,sha256=1nqbJEhNiMP1rmN9ufpUuKO1IkuI2BEM5auW4JceGMA,933
- app/node/processor/spark_node.py,sha256=jzqdffIHUCgOfMFcoqjXdl8wFag-3gafxfNCdssKnwc,483
- app/node/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/node/sink/csv_node.py,sha256=YaDporq4L358dJCkheCjpGmDBsYtvai6dnGiS-uf_Mc,1673
- app/node/sink/delta_node.py,sha256=h3gnBsuYoXUlnhCouav-iwdQ4UVZPcvG4_5r9gxY8JM,2036
- app/node/sink/hive_node.py,sha256=E3pP_U7LzCFdZ0LVkqR0LDlU8HZtfbKgOo72yJsLLT0,1098
- app/node/sink/pandas_csv_node.py,sha256=bNF-Sb5pKMV1qAePzyDhwYLB075Rg7tjxfQ6BHcq1Wk,862
- app/node/sink/teams_notification_node.py,sha256=ZDE-F4nfmrK7UnXoSI6mHuhUHWlgE8rUCdPFW5ZXr7E,3564
- app/node/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- app/node/source/delta_node.py,sha256=gg7SfuKBAAfjk6OX2jNrot9XX61HoBe3us3D8O-dscE,529
- app/node/source/hive_node.py,sha256=h_AMCnnmon7uLRIGsaHAPWEReD3VaWZXnz9r0TpLGNM,478
- app/node/source/spark_node.py,sha256=S_x2atRFPDnXmhCUtcmaLc4BDFd2H4uQq6wnEJb7Uug,480
- datapipelab-0.1.dist-info/METADATA,sha256=IYnShuv2Fa5EVRfJSdJhT7kZzT4Nedr9UIEafmrP2YM,310
- datapipelab-0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- datapipelab-0.1.dist-info/top_level.txt,sha256=io9g7LCbfmTG1SFKgEOGXmCFB9uMP2H5lerm0HiHWQE,4
- datapipelab-0.1.dist-info/RECORD,,
datapipelab-0.1.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
- app