pipelinehub 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pipelinehub-0.1.0/LICENSE +21 -0
- pipelinehub-0.1.0/PKG-INFO +192 -0
- pipelinehub-0.1.0/README.md +159 -0
- pipelinehub-0.1.0/pipelinehub/__init__.py +29 -0
- pipelinehub-0.1.0/pipelinehub/pipeline.py +151 -0
- pipelinehub-0.1.0/pipelinehub/utils.py +121 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/PKG-INFO +192 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/SOURCES.txt +17 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/dependency_links.txt +1 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/not-zip-safe +1 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/requires.txt +6 -0
- pipelinehub-0.1.0/pipelinehub.egg-info/top_level.txt +1 -0
- pipelinehub-0.1.0/pyproject.toml +54 -0
- pipelinehub-0.1.0/setup.cfg +4 -0
- pipelinehub-0.1.0/setup.py +49 -0
- pipelinehub-0.1.0/tests/test_pipeline.py +58 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rahul Paul
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pipelinehub
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A flexible data pipeline library for custom data processing workflows
|
|
5
|
+
Home-page: https://github.com/rahulxj100/pipelinehub
|
|
6
|
+
Author: Rahul Paul
|
|
7
|
+
Author-email: Rahul Paul <paul.rahulxj100@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/rahulxj100/pipelinehub
|
|
10
|
+
Project-URL: Bug Reports, https://github.com/rahulxj100/pipelinehub/issues
|
|
11
|
+
Project-URL: Source, https://github.com/rahulxj100/pipelinehub
|
|
12
|
+
Keywords: pipeline,data,processing,workflow,etl
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Requires-Python: >=3.7
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
30
|
+
Requires-Dist: black; extra == "dev"
|
|
31
|
+
Requires-Dist: flake8; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# PipelineHub
|
|
35
|
+
|
|
36
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
A flexible Python library for creating custom data processing workflows with ease.
|
|
40
|
+
|
|
41
|
+
## ✨ Features
|
|
42
|
+
|
|
43
|
+
- 🔧 **Flexible**: Add any callable function as a processing step
|
|
44
|
+
- **Chainable**: Fluent method chaining for clean, readable code
|
|
45
|
+
- **Debuggable**: Verbose mode shows data flow between steps
|
|
46
|
+
- 🧪 **Testable**: Clear error handling with step identification
|
|
47
|
+
- 📦 **Lightweight**: Zero external dependencies
|
|
48
|
+
- 🎯 **Type-friendly**: Full type hints for better IDE support
|
|
49
|
+
- **Performance**: Minimal overhead for maximum speed
|
|
50
|
+
- **Reusable**: Create pipelines once, use with different datasets
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
```bash
|
|
54
|
+
pip install pipelinehub
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
```python
|
|
59
|
+
from pipelinehub import DataPipeline, normalize_data, square_numbers
|
|
60
|
+
|
|
61
|
+
# Create a pipeline with multiple steps
|
|
62
|
+
pipeline = DataPipeline()
|
|
63
|
+
pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
|
|
64
|
+
pipeline.add_step(square_numbers, "square")
|
|
65
|
+
pipeline.add_step(normalize_data, "normalize")
|
|
66
|
+
|
|
67
|
+
# Execute with sample data
|
|
68
|
+
data = [-2, -1, 0, 1, 2, 3, 4, 5]
|
|
69
|
+
result = pipeline.execute(data, verbose=True)
|
|
70
|
+
|
|
71
|
+
print(result)
|
|
72
|
+
```
|
|
73
|
+
## Method Chaining
|
|
74
|
+
Create pipelines fluently with method chaining:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from pipelinehub import DataPipeline, add_constant
|
|
78
|
+
|
|
79
|
+
# Chain operations together
|
|
80
|
+
result = (DataPipeline()
|
|
81
|
+
.add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
|
|
82
|
+
.add_step(add_constant(10), "add_10")
|
|
83
|
+
.add_step(lambda x: sorted(x, reverse=True), "sort_desc")
|
|
84
|
+
.execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
|
|
85
|
+
|
|
86
|
+
print(result)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Comprehensive Examples
|
|
90
|
+
|
|
91
|
+
### Data Cleaning Pipeline
|
|
92
|
+
```python
|
|
93
|
+
from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
|
|
94
|
+
|
|
95
|
+
# Create a data cleaning pipeline
|
|
96
|
+
cleaning_pipeline = (DataPipeline()
|
|
97
|
+
.add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
|
|
98
|
+
.add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
|
|
99
|
+
.add_step(normalize_data, "normalize")
|
|
100
|
+
.add_step(calculate_stats, "final_stats"))
|
|
101
|
+
|
|
102
|
+
# Process messy data
|
|
103
|
+
messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
|
|
104
|
+
stats = cleaning_pipeline.execute(messy_data, verbose=True)
|
|
105
|
+
print(stats)
|
|
106
|
+
```
|
|
107
|
+
### Text Processing Pipeline
|
|
108
|
+
```python
|
|
109
|
+
import re
|
|
110
|
+
from pipelinehub import DataPipeline
|
|
111
|
+
|
|
112
|
+
def clean_text(text):
|
|
113
|
+
"""Remove special characters and extra whitespace."""
|
|
114
|
+
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
|
|
115
|
+
return ' '.join(text.split())
|
|
116
|
+
|
|
117
|
+
def extract_keywords(words, min_length=4):
|
|
118
|
+
"""Extract words with at least min_length characters."""
|
|
119
|
+
return [word for word in words if len(word) >= min_length]
|
|
120
|
+
|
|
121
|
+
# Build text processing pipeline
|
|
122
|
+
text_pipeline = (DataPipeline()
|
|
123
|
+
.add_step(str.lower, "lowercase")
|
|
124
|
+
.add_step(clean_text, "clean")
|
|
125
|
+
.add_step(str.split, "tokenize")
|
|
126
|
+
.add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
|
|
127
|
+
.add_step(lambda words: sorted(set(words)), "unique_and_sort"))
|
|
128
|
+
|
|
129
|
+
# Process text
|
|
130
|
+
text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
|
|
131
|
+
keywords = text_pipeline.execute(text, verbose=True)
|
|
132
|
+
print(keywords)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Pipeline Management
|
|
136
|
+
```python
|
|
137
|
+
pipeline = DataPipeline()
|
|
138
|
+
pipeline.add_step(lambda x: [i*2 for i in x], "double")
|
|
139
|
+
pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
|
|
140
|
+
|
|
141
|
+
# Inspect pipeline
|
|
142
|
+
print(len(pipeline)) # 2
|
|
143
|
+
print(pipeline.get_steps()) # ['double', 'add_one']
|
|
144
|
+
print(pipeline) # DataPipeline(2 steps: double, add_one)
|
|
145
|
+
|
|
146
|
+
# Remove steps
|
|
147
|
+
pipeline.remove_step(0) # Remove first step
|
|
148
|
+
print(pipeline.get_steps()) # ['add_one']
|
|
149
|
+
|
|
150
|
+
# Clear all steps
|
|
151
|
+
pipeline.clear_steps()
|
|
152
|
+
print(len(pipeline)) # 0
|
|
153
|
+
```
|
|
154
|
+
## Performance Tips
|
|
155
|
+
|
|
156
|
+
- Use built-in functions when possible - they're optimized
|
|
157
|
+
- Avoid creating large intermediate data structures
|
|
158
|
+
- Consider using generators for large datasets:
|
|
159
|
+
```python
|
|
160
|
+
def generator_step(data):
|
|
161
|
+
"""Use generator for memory efficiency."""
|
|
162
|
+
for item in data:
|
|
163
|
+
if item > 0:
|
|
164
|
+
yield item * 2
|
|
165
|
+
|
|
166
|
+
pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 🤝 Contributing
|
|
170
|
+
Contributions are welcome! Here's how to get started:
|
|
171
|
+
|
|
172
|
+
- Fork the repository
|
|
173
|
+
- Create a feature branch: git checkout -b feature/amazing-feature
|
|
174
|
+
- Make your changes and add tests
|
|
175
|
+
- Run tests: pytest tests/
|
|
176
|
+
- Commit your changes: git commit -m 'Add amazing feature'
|
|
177
|
+
- Push to branch: git push origin feature/amazing-feature
|
|
178
|
+
- Open a Pull Request
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
182
|
+
## 🙋‍♂️ Support
|
|
183
|
+
|
|
184
|
+
Discussions: [GitHub Discussions](https://github.com/rahulxj100/pipelinehub/discussions)
|
|
185
|
+
|
|
186
|
+
## Acknowledgments
|
|
187
|
+
|
|
188
|
+
- Inspired by functional programming and Unix pipes philosophy
|
|
189
|
+
- Built with ❤️ for the Python community
|
|
190
|
+
- Thanks to all contributors and users!
|
|
191
|
+
|
|
192
|
+
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# PipelineHub
|
|
2
|
+
|
|
3
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
A flexible Python library for creating custom data processing workflows with ease.
|
|
7
|
+
|
|
8
|
+
## ✨ Features
|
|
9
|
+
|
|
10
|
+
- 🔧 **Flexible**: Add any callable function as a processing step
|
|
11
|
+
- **Chainable**: Fluent method chaining for clean, readable code
|
|
12
|
+
- **Debuggable**: Verbose mode shows data flow between steps
|
|
13
|
+
- 🧪 **Testable**: Clear error handling with step identification
|
|
14
|
+
- 📦 **Lightweight**: Zero external dependencies
|
|
15
|
+
- 🎯 **Type-friendly**: Full type hints for better IDE support
|
|
16
|
+
- **Performance**: Minimal overhead for maximum speed
|
|
17
|
+
- **Reusable**: Create pipelines once, use with different datasets
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
```bash
|
|
21
|
+
pip install pipelinehub
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
```python
|
|
26
|
+
from pipelinehub import DataPipeline, normalize_data, square_numbers
|
|
27
|
+
|
|
28
|
+
# Create a pipeline with multiple steps
|
|
29
|
+
pipeline = DataPipeline()
|
|
30
|
+
pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
|
|
31
|
+
pipeline.add_step(square_numbers, "square")
|
|
32
|
+
pipeline.add_step(normalize_data, "normalize")
|
|
33
|
+
|
|
34
|
+
# Execute with sample data
|
|
35
|
+
data = [-2, -1, 0, 1, 2, 3, 4, 5]
|
|
36
|
+
result = pipeline.execute(data, verbose=True)
|
|
37
|
+
|
|
38
|
+
print(result)
|
|
39
|
+
```
|
|
40
|
+
## Method Chaining
|
|
41
|
+
Create pipelines fluently with method chaining:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from pipelinehub import DataPipeline, add_constant
|
|
45
|
+
|
|
46
|
+
# Chain operations together
|
|
47
|
+
result = (DataPipeline()
|
|
48
|
+
.add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
|
|
49
|
+
.add_step(add_constant(10), "add_10")
|
|
50
|
+
.add_step(lambda x: sorted(x, reverse=True), "sort_desc")
|
|
51
|
+
.execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
|
|
52
|
+
|
|
53
|
+
print(result)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Comprehensive Examples
|
|
57
|
+
|
|
58
|
+
### Data Cleaning Pipeline
|
|
59
|
+
```python
|
|
60
|
+
from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
|
|
61
|
+
|
|
62
|
+
# Create a data cleaning pipeline
|
|
63
|
+
cleaning_pipeline = (DataPipeline()
|
|
64
|
+
.add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
|
|
65
|
+
.add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
|
|
66
|
+
.add_step(normalize_data, "normalize")
|
|
67
|
+
.add_step(calculate_stats, "final_stats"))
|
|
68
|
+
|
|
69
|
+
# Process messy data
|
|
70
|
+
messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
|
|
71
|
+
stats = cleaning_pipeline.execute(messy_data, verbose=True)
|
|
72
|
+
print(stats)
|
|
73
|
+
```
|
|
74
|
+
### Text Processing Pipeline
|
|
75
|
+
```python
|
|
76
|
+
import re
|
|
77
|
+
from pipelinehub import DataPipeline
|
|
78
|
+
|
|
79
|
+
def clean_text(text):
|
|
80
|
+
"""Remove special characters and extra whitespace."""
|
|
81
|
+
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
|
|
82
|
+
return ' '.join(text.split())
|
|
83
|
+
|
|
84
|
+
def extract_keywords(words, min_length=4):
|
|
85
|
+
"""Extract words with at least min_length characters."""
|
|
86
|
+
return [word for word in words if len(word) >= min_length]
|
|
87
|
+
|
|
88
|
+
# Build text processing pipeline
|
|
89
|
+
text_pipeline = (DataPipeline()
|
|
90
|
+
.add_step(str.lower, "lowercase")
|
|
91
|
+
.add_step(clean_text, "clean")
|
|
92
|
+
.add_step(str.split, "tokenize")
|
|
93
|
+
.add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
|
|
94
|
+
.add_step(lambda words: sorted(set(words)), "unique_and_sort"))
|
|
95
|
+
|
|
96
|
+
# Process text
|
|
97
|
+
text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
|
|
98
|
+
keywords = text_pipeline.execute(text, verbose=True)
|
|
99
|
+
print(keywords)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Pipeline Management
|
|
103
|
+
```python
|
|
104
|
+
pipeline = DataPipeline()
|
|
105
|
+
pipeline.add_step(lambda x: [i*2 for i in x], "double")
|
|
106
|
+
pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
|
|
107
|
+
|
|
108
|
+
# Inspect pipeline
|
|
109
|
+
print(len(pipeline)) # 2
|
|
110
|
+
print(pipeline.get_steps()) # ['double', 'add_one']
|
|
111
|
+
print(pipeline) # DataPipeline(2 steps: double, add_one)
|
|
112
|
+
|
|
113
|
+
# Remove steps
|
|
114
|
+
pipeline.remove_step(0) # Remove first step
|
|
115
|
+
print(pipeline.get_steps()) # ['add_one']
|
|
116
|
+
|
|
117
|
+
# Clear all steps
|
|
118
|
+
pipeline.clear_steps()
|
|
119
|
+
print(len(pipeline)) # 0
|
|
120
|
+
```
|
|
121
|
+
## Performance Tips
|
|
122
|
+
|
|
123
|
+
- Use built-in functions when possible - they're optimized
|
|
124
|
+
- Avoid creating large intermediate data structures
|
|
125
|
+
- Consider using generators for large datasets:
|
|
126
|
+
```python
|
|
127
|
+
def generator_step(data):
|
|
128
|
+
"""Use generator for memory efficiency."""
|
|
129
|
+
for item in data:
|
|
130
|
+
if item > 0:
|
|
131
|
+
yield item * 2
|
|
132
|
+
|
|
133
|
+
pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## 🤝 Contributing
|
|
137
|
+
Contributions are welcome! Here's how to get started:
|
|
138
|
+
|
|
139
|
+
- Fork the repository
|
|
140
|
+
- Create a feature branch: git checkout -b feature/amazing-feature
|
|
141
|
+
- Make your changes and add tests
|
|
142
|
+
- Run tests: pytest tests/
|
|
143
|
+
- Commit your changes: git commit -m 'Add amazing feature'
|
|
144
|
+
- Push to branch: git push origin feature/amazing-feature
|
|
145
|
+
- Open a Pull Request
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
149
|
+
## 🙋‍♂️ Support
|
|
150
|
+
|
|
151
|
+
Discussions: [GitHub Discussions](https://github.com/rahulxj100/pipelinehub/discussions)
|
|
152
|
+
|
|
153
|
+
## Acknowledgments
|
|
154
|
+
|
|
155
|
+
- Inspired by functional programming and Unix pipes philosophy
|
|
156
|
+
- Built with ❤️ for the Python community
|
|
157
|
+
- Thanks to all contributors and users!
|
|
158
|
+
|
|
159
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PipelineHub - A library for creating custom data processing workflows.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .pipeline import DataPipeline
|
|
6
|
+
from .utils import (
|
|
7
|
+
filter_numbers,
|
|
8
|
+
square_numbers,
|
|
9
|
+
sum_data,
|
|
10
|
+
normalize_data,
|
|
11
|
+
add_constant,
|
|
12
|
+
calculate_stats,
|
|
13
|
+
outlier_removal,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Package metadata. __version__ must match the distribution version published
# in the package metadata (PKG-INFO declares "Version: 0.1.0").
__version__ = "0.1.0"
__author__ = "Rahul Paul"
__email__ = "paul.rahulxj100@gmail.com"
|
|
19
|
+
|
|
20
|
+
# Public API re-exported by the package: the pipeline class first, then the
# utility step functions.
__all__ = [
    # Core pipeline
    "DataPipeline",
    # Utility steps
    "filter_numbers",
    "square_numbers",
    "sum_data",
    "normalize_data",
    "add_constant",
    "calculate_stats",
    "outlier_removal",
]
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core DataPipeline class for creating flexible data processing workflows.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Callable, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataPipeline:
    """
    A flexible data pipeline built from an ordered list of processing steps.

    Each step is a callable that receives the current data and returns the
    transformed data, which is then passed to the next step.

    Attributes:
        data: Optional default input used by execute() when no data is passed.
        steps: The step callables, in execution order.
        step_names: Display names of the steps, parallel to ``steps``.
    """

    def __init__(self, data: Any = None):
        """
        Initialize a new DataPipeline.

        Args:
            data: Optional initial data for the pipeline
        """
        self.data = data
        self.steps: List[Callable] = []
        self.step_names: List[str] = []

    def add_step(self, func: Callable, name: Optional[str] = None) -> "DataPipeline":
        """
        Add a custom processing step to the end of the pipeline.

        Args:
            func: A function that takes data as input and returns transformed data
            name: Optional name for the step (for debugging/logging). When
                omitted or empty, ``func.__name__`` is used; if the callable
                has no ``__name__``, the name is ``step_N`` where N is the
                1-based position of the step.

        Returns:
            self (for method chaining)

        Raises:
            ValueError: If func is not callable
        """
        if not callable(func):
            raise ValueError("Step must be a callable function")

        self.steps.append(func)
        # Evaluated after the append, so len(self.steps) is this step's
        # 1-based position.
        step_name = name or getattr(func, '__name__', f"step_{len(self.steps)}")
        self.step_names.append(step_name)

        return self

    def set_data(self, data: Any) -> "DataPipeline":
        """
        Set the initial data for the pipeline.

        Args:
            data: The data to process

        Returns:
            self (for method chaining)
        """
        self.data = data
        return self

    def execute(self, data: Any = None, verbose: bool = False) -> Any:
        """
        Execute all steps in the pipeline, in the order they were added.

        Args:
            data: Optional data to process (overrides instance data). ``None``
                means "use the data stored on the instance".
            verbose: Print step-by-step execution info

        Returns:
            Transformed data after all steps

        Raises:
            ValueError: If no data is provided
            RuntimeError: If any step fails during execution. The exception
                raised by the step is attached as ``__cause__``.
        """
        current_data = data if data is not None else self.data

        if current_data is None:
            raise ValueError("No data provided. Use set_data() or pass data to execute()")

        if verbose:
            print(f"Starting pipeline with {len(self.steps)} steps")
            print(f"Initial data: {self._get_data_info(current_data)}")

        for i, (step, step_name) in enumerate(zip(self.steps, self.step_names)):
            if verbose:
                print(f"\nStep {i+1}: {step_name}")

            # Only the step call is guarded, so a failure in the verbose
            # reporting is never misreported as a failure of the step itself.
            try:
                current_data = step(current_data)
            except Exception as e:
                # Chain explicitly so the original traceback stays attached.
                raise RuntimeError(f"Error in step {i+1} ({step_name}): {str(e)}") from e

            if verbose:
                print(f" Output: {self._get_data_info(current_data)}")

        return current_data

    def clear_steps(self) -> "DataPipeline":
        """
        Remove all steps from the pipeline.

        Returns:
            self (for method chaining)
        """
        self.steps.clear()
        self.step_names.clear()
        return self

    def remove_step(self, index: int) -> "DataPipeline":
        """
        Remove a step by index.

        An index outside ``0 <= index < len(self)`` (including any negative
        index) is ignored and leaves the pipeline unchanged.

        Args:
            index: Zero-based index of the step to remove

        Returns:
            self (for method chaining)
        """
        if 0 <= index < len(self.steps):
            # Keep the two parallel lists in sync.
            self.steps.pop(index)
            self.step_names.pop(index)
        return self

    def get_steps(self) -> List[str]:
        """
        Get list of step names.

        Returns:
            A copy of the list of step names, so callers cannot mutate the
            pipeline through it
        """
        return self.step_names.copy()

    def _get_data_info(self, data: Any) -> str:
        """
        Get descriptive info about data for verbose output.

        Args:
            data: Any object

        Returns:
            ``"<type> with <n> elements"`` when ``len(data)`` works,
            otherwise just the type name
        """
        data_type = type(data).__name__
        try:
            size = len(data)
        except TypeError:
            # No usable length (e.g. an int, or an object that exposes
            # __len__ but cannot be sized, such as a class object).
            return f"{data_type}"
        return f"{data_type} with {size} elements"

    def __len__(self) -> int:
        """Return number of steps in the pipeline."""
        return len(self.steps)

    def __repr__(self) -> str:
        """String representation of the pipeline."""
        steps_str = ', '.join(self.step_names) if self.step_names else "no steps"
        return f"DataPipeline({len(self.steps)} steps: {steps_str})"
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for common data processing operations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Callable, Dict, List, Union
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def filter_numbers(data: List[Union[int, float]], min_val: Union[int, float] = 0) -> List[Union[int, float]]:
    """
    Keep only the values strictly greater than min_val.

    Args:
        data: List of numbers
        min_val: Exclusive lower bound; values equal to it are dropped

    Returns:
        New list with the values that passed the filter, in original order
    """
    kept = []
    for value in data:
        if value > min_val:
            kept.append(value)
    return kept
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def square_numbers(data: List[Union[int, float]]) -> List[Union[int, float]]:
    """
    Raise every number in the data to the power of two.

    Args:
        data: List of numbers

    Returns:
        New list with each value squared, in original order
    """
    squared = []
    for value in data:
        squared.append(value ** 2)
    return squared
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def sum_data(data: List[Union[int, float]]) -> Union[int, float]:
    """
    Add up all numbers in the data.

    Args:
        data: List of numbers

    Returns:
        The total of all values (0 for empty data)
    """
    total = sum(data)
    return total
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def normalize_data(data: List[Union[int, float]]) -> List[float]:
    """
    Normalize data to 0-1 range using min-max scaling.

    Args:
        data: List of numbers

    Returns:
        New list of normalized numbers. Empty input gives an empty list; if
        all values are equal, every element is 0.0.
    """
    if not data:
        # Return a fresh list so the result never aliases the caller's input,
        # matching the other return paths.
        return []
    min_val, max_val = min(data), max(data)
    if min_val == max_val:
        # Zero range: avoid division by zero.
        return [0.0] * len(data)
    # Computed once rather than once per element.
    span = max_val - min_val
    return [(x - min_val) / span for x in data]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def add_constant(constant: Union[int, float]) -> Callable:
    """
    Build a pipeline step that adds a fixed value to every element.

    Args:
        constant: Value to add to each element

    Returns:
        Function that takes a list of numbers and returns a new list with
        the constant added to each; its __name__ is "add_<constant>"
    """
    def add_const(data: List[Union[int, float]]) -> List[Union[int, float]]:
        shifted = []
        for value in data:
            shifted.append(value + constant)
        return shifted

    # Descriptive name so the pipeline can label the step.
    add_const.__name__ = f"add_{constant}"
    return add_const
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def calculate_stats(data: List[Union[int, float]]) -> Dict[str, Union[int, float]]:
    """
    Summarize the data with a few basic statistics.

    Args:
        data: List of numbers

    Returns:
        Dictionary with count, mean, min, max; empty dictionary for empty data
    """
    stats: Dict[str, Union[int, float]] = {}
    if data:
        stats['count'] = len(data)
        stats['mean'] = sum(data) / len(data)
        stats['min'] = min(data)
        stats['max'] = max(data)
    return stats
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def outlier_removal(data: List[Union[int, float]], threshold: float = 2.0) -> List[Union[int, float]]:
    """
    Remove outliers using simple threshold method.

    A value is kept when its distance from the mean is at most
    ``threshold`` times the population standard deviation.

    Args:
        data: List of numbers
        threshold: Standard deviation threshold

    Returns:
        New list with outliers removed. Data with fewer than two elements is
        returned as an unfiltered copy.
    """
    if len(data) < 2:
        # Copy so the result never aliases the caller's input, matching the
        # filtered path below.
        return list(data)

    mean = sum(data) / len(data)
    # Population variance (divides by N, not N - 1).
    variance = sum((x - mean) ** 2 for x in data) / len(data)
    std_dev = variance ** 0.5

    # Computed once rather than once per element.
    max_deviation = threshold * std_dev
    return [x for x in data if abs(x - mean) <= max_deviation]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pipelinehub
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A flexible data pipeline library for custom data processing workflows
|
|
5
|
+
Home-page: https://github.com/rahulxj100/pipelinehub
|
|
6
|
+
Author: Rahul Paul
|
|
7
|
+
Author-email: Rahul Paul <paul.rahulxj100@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/rahulxj100/pipelinehub
|
|
10
|
+
Project-URL: Bug Reports, https://github.com/rahulxj100/pipelinehub/issues
|
|
11
|
+
Project-URL: Source, https://github.com/rahulxj100/pipelinehub
|
|
12
|
+
Keywords: pipeline,data,processing,workflow,etl
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Requires-Python: >=3.7
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
30
|
+
Requires-Dist: black; extra == "dev"
|
|
31
|
+
Requires-Dist: flake8; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# PipelineHub
|
|
35
|
+
|
|
36
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
A flexible Python library for creating custom data processing workflows with ease.
|
|
40
|
+
|
|
41
|
+
## ✨ Features
|
|
42
|
+
|
|
43
|
+
- 🔧 **Flexible**: Add any callable function as a processing step
|
|
44
|
+
- **Chainable**: Fluent method chaining for clean, readable code
|
|
45
|
+
- **Debuggable**: Verbose mode shows data flow between steps
|
|
46
|
+
- 🧪 **Testable**: Clear error handling with step identification
|
|
47
|
+
- 📦 **Lightweight**: Zero external dependencies
|
|
48
|
+
- 🎯 **Type-friendly**: Full type hints for better IDE support
|
|
49
|
+
- **Performance**: Minimal overhead for maximum speed
|
|
50
|
+
- **Reusable**: Create pipelines once, use with different datasets
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
```bash
|
|
54
|
+
pip install pipelinehub
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
```python
|
|
59
|
+
from pipelinehub import DataPipeline, normalize_data, square_numbers
|
|
60
|
+
|
|
61
|
+
# Create a pipeline with multiple steps
|
|
62
|
+
pipeline = DataPipeline()
|
|
63
|
+
pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
|
|
64
|
+
pipeline.add_step(square_numbers, "square")
|
|
65
|
+
pipeline.add_step(normalize_data, "normalize")
|
|
66
|
+
|
|
67
|
+
# Execute with sample data
|
|
68
|
+
data = [-2, -1, 0, 1, 2, 3, 4, 5]
|
|
69
|
+
result = pipeline.execute(data, verbose=True)
|
|
70
|
+
|
|
71
|
+
print(result)
|
|
72
|
+
```
|
|
73
|
+
## Method Chaining
|
|
74
|
+
Create pipelines fluently with method chaining:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from pipelinehub import DataPipeline, add_constant
|
|
78
|
+
|
|
79
|
+
# Chain operations together
|
|
80
|
+
result = (DataPipeline()
|
|
81
|
+
.add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
|
|
82
|
+
.add_step(add_constant(10), "add_10")
|
|
83
|
+
.add_step(lambda x: sorted(x, reverse=True), "sort_desc")
|
|
84
|
+
.execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
|
|
85
|
+
|
|
86
|
+
print(result)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Comprehensive Examples
|
|
90
|
+
|
|
91
|
+
### Data Cleaning Pipeline
|
|
92
|
+
```python
|
|
93
|
+
from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
|
|
94
|
+
|
|
95
|
+
# Create a data cleaning pipeline
|
|
96
|
+
cleaning_pipeline = (DataPipeline()
|
|
97
|
+
.add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
|
|
98
|
+
.add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
|
|
99
|
+
.add_step(normalize_data, "normalize")
|
|
100
|
+
.add_step(calculate_stats, "final_stats"))
|
|
101
|
+
|
|
102
|
+
# Process messy data
|
|
103
|
+
messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
|
|
104
|
+
stats = cleaning_pipeline.execute(messy_data, verbose=True)
|
|
105
|
+
print(stats)
|
|
106
|
+
```
|
|
107
|
+
### Text Processing Pipeline
|
|
108
|
+
```python
|
|
109
|
+
import re
|
|
110
|
+
from pipelinehub import DataPipeline
|
|
111
|
+
|
|
112
|
+
def clean_text(text):
|
|
113
|
+
"""Remove special characters and extra whitespace."""
|
|
114
|
+
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
|
|
115
|
+
return ' '.join(text.split())
|
|
116
|
+
|
|
117
|
+
def extract_keywords(words, min_length=4):
|
|
118
|
+
"""Extract words that are at least min_length characters long."""
|
|
119
|
+
return [word for word in words if len(word) >= min_length]
|
|
120
|
+
|
|
121
|
+
# Build text processing pipeline
|
|
122
|
+
text_pipeline = (DataPipeline()
|
|
123
|
+
.add_step(str.lower, "lowercase")
|
|
124
|
+
.add_step(clean_text, "clean")
|
|
125
|
+
.add_step(str.split, "tokenize")
|
|
126
|
+
.add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
|
|
127
|
+
.add_step(lambda words: sorted(set(words)), "unique_and_sort"))
|
|
128
|
+
|
|
129
|
+
# Process text
|
|
130
|
+
text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
|
|
131
|
+
keywords = text_pipeline.execute(text, verbose=True)
|
|
132
|
+
print(keywords)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Pipeline Management
|
|
136
|
+
```python
|
|
137
|
+
pipeline = DataPipeline()
|
|
138
|
+
pipeline.add_step(lambda x: [i*2 for i in x], "double")
|
|
139
|
+
pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
|
|
140
|
+
|
|
141
|
+
# Inspect pipeline
|
|
142
|
+
print(len(pipeline)) # 2
|
|
143
|
+
print(pipeline.get_steps()) # ['double', 'add_one']
|
|
144
|
+
print(pipeline) # DataPipeline(2 steps: double, add_one)
|
|
145
|
+
|
|
146
|
+
# Remove steps
|
|
147
|
+
pipeline.remove_step(0) # Remove first step
|
|
148
|
+
print(pipeline.get_steps()) # ['add_one']
|
|
149
|
+
|
|
150
|
+
# Clear all steps
|
|
151
|
+
pipeline.clear_steps()
|
|
152
|
+
print(len(pipeline)) # 0
|
|
153
|
+
```
|
|
154
|
+
## Performance Tips
|
|
155
|
+
|
|
156
|
+
- Use built-in functions when possible - they're optimized
|
|
157
|
+
- Avoid creating large intermediate data structures
|
|
158
|
+
- Consider using generators for large datasets:
|
|
159
|
+
```python
|
|
160
|
+
def generator_step(data):
|
|
161
|
+
"""Use generator for memory efficiency."""
|
|
162
|
+
for item in data:
|
|
163
|
+
if item > 0:
|
|
164
|
+
yield item * 2
|
|
165
|
+
|
|
166
|
+
pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 🤝 Contributing
|
|
170
|
+
Contributions are welcome! Here's how to get started:
|
|
171
|
+
|
|
172
|
+
- Fork the repository
|
|
173
|
+
- Create a feature branch: `git checkout -b feature/amazing-feature`
|
|
174
|
+
- Make your changes and add tests
|
|
175
|
+
- Run tests: `pytest tests/`
|
|
176
|
+
- Commit your changes: `git commit -m 'Add amazing feature'`
|
|
177
|
+
- Push to branch: `git push origin feature/amazing-feature`
|
|
178
|
+
- Open a Pull Request
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
182
|
+
## 🙋‍♂️ Support
|
|
183
|
+
|
|
184
|
+
Discussions: GitHub Discussions
|
|
185
|
+
|
|
186
|
+
## Acknowledgments
|
|
187
|
+
|
|
188
|
+
- Inspired by functional programming and Unix pipes philosophy
|
|
189
|
+
- Built with ❤️ for the Python community
|
|
190
|
+
- Thanks to all contributors and users!
|
|
191
|
+
|
|
192
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
./pipelinehub/__init__.py
|
|
6
|
+
./pipelinehub/pipeline.py
|
|
7
|
+
./pipelinehub/utils.py
|
|
8
|
+
pipelinehub/__init__.py
|
|
9
|
+
pipelinehub/pipeline.py
|
|
10
|
+
pipelinehub/utils.py
|
|
11
|
+
pipelinehub.egg-info/PKG-INFO
|
|
12
|
+
pipelinehub.egg-info/SOURCES.txt
|
|
13
|
+
pipelinehub.egg-info/dependency_links.txt
|
|
14
|
+
pipelinehub.egg-info/not-zip-safe
|
|
15
|
+
pipelinehub.egg-info/requires.txt
|
|
16
|
+
pipelinehub.egg-info/top_level.txt
|
|
17
|
+
tests/test_pipeline.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pipelinehub
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pipelinehub"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Rahul Paul", email = "paul.rahulxj100@gmail.com"},
|
|
10
|
+
]
|
|
11
|
+
description = "A flexible data pipeline library for custom data processing workflows"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = {text = "MIT"}
|
|
14
|
+
requires-python = ">=3.7"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.7",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
28
|
+
]
|
|
29
|
+
keywords = ["pipeline", "data", "processing", "workflow", "etl"]
|
|
30
|
+
dependencies = []
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = ["pytest>=6.0", "black", "flake8", "mypy"]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
"Homepage" = "https://github.com/rahulxj100/pipelinehub"
|
|
37
|
+
"Bug Reports" = "https://github.com/rahulxj100/pipelinehub/issues"
|
|
38
|
+
"Source" = "https://github.com/rahulxj100/pipelinehub"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools]
|
|
41
|
+
packages = ["pipelinehub"]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.package-dir]
|
|
44
|
+
"" = "."
|
|
45
|
+
|
|
46
|
+
[tool.black]
|
|
47
|
+
line-length = 88
|
|
48
|
+
target-version = ['py37']
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
|
52
|
+
python_files = ["test_*.py"]
|
|
53
|
+
python_classes = ["Test*"]
|
|
54
|
+
python_functions = ["test_*"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
# Read README file
|
|
5
|
+
def read_readme():
|
|
6
|
+
readme_path = os.path.join(os.path.dirname(__file__), 'README.md')
|
|
7
|
+
try:
|
|
8
|
+
with open(readme_path, 'r', encoding='utf-8') as f:
|
|
9
|
+
return f.read()
|
|
10
|
+
except FileNotFoundError:
|
|
11
|
+
return "A flexible data pipeline library for custom data processing workflows"
|
|
12
|
+
|
|
13
|
+
setup(
|
|
14
|
+
name="pipelinehub",
|
|
15
|
+
version="0.1.0",
|
|
16
|
+
author="Rahul Paul",
|
|
17
|
+
author_email="paul.rahulxj100@gmail.com",
|
|
18
|
+
description="A flexible data pipeline library for custom data processing workflows",
|
|
19
|
+
long_description=read_readme(),
|
|
20
|
+
long_description_content_type="text/markdown",
|
|
21
|
+
url="https://github.com/rahulxj100/pipelinehub",
|
|
22
|
+
project_urls={
|
|
23
|
+
"Bug Reports": "https://github.com/rahulxj100/pipelinehub/issues",
|
|
24
|
+
"Source": "https://github.com/rahulxj100/pipelinehub",
|
|
25
|
+
},
|
|
26
|
+
packages=find_packages(),
|
|
27
|
+
classifiers=[
|
|
28
|
+
"Development Status :: 4 - Beta",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"License :: OSI Approved :: MIT License",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Programming Language :: Python :: 3.7",
|
|
33
|
+
"Programming Language :: Python :: 3.8",
|
|
34
|
+
"Programming Language :: Python :: 3.9",
|
|
35
|
+
"Programming Language :: Python :: 3.10",
|
|
36
|
+
"Programming Language :: Python :: 3.11",
|
|
37
|
+
"Programming Language :: Python :: 3.12",
|
|
38
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
39
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
40
|
+
],
|
|
41
|
+
keywords=["pipeline", "data", "processing", "workflow", "etl"],
|
|
42
|
+
python_requires=">=3.7",
|
|
43
|
+
install_requires=[],
|
|
44
|
+
extras_require={
|
|
45
|
+
"dev": ["pytest>=6.0", "black", "flake8", "mypy"],
|
|
46
|
+
},
|
|
47
|
+
include_package_data=True,
|
|
48
|
+
zip_safe=False,
|
|
49
|
+
)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pipelinehub import DataPipeline
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class TestDataPipeline:
|
|
6
|
+
|
|
7
|
+
def test_init(self):
|
|
8
|
+
pipeline = DataPipeline()
|
|
9
|
+
assert len(pipeline) == 0
|
|
10
|
+
assert pipeline.data is None
|
|
11
|
+
|
|
12
|
+
def test_init_with_data(self):
|
|
13
|
+
data = [1, 2, 3]
|
|
14
|
+
pipeline = DataPipeline(data)
|
|
15
|
+
assert pipeline.data == data
|
|
16
|
+
|
|
17
|
+
def test_add_step(self):
|
|
18
|
+
pipeline = DataPipeline()
|
|
19
|
+
pipeline.add_step(lambda x: x)
|
|
20
|
+
assert len(pipeline) == 1
|
|
21
|
+
|
|
22
|
+
def test_add_step_invalid(self):
|
|
23
|
+
pipeline = DataPipeline()
|
|
24
|
+
with pytest.raises(ValueError):
|
|
25
|
+
pipeline.add_step("not a function")
|
|
26
|
+
|
|
27
|
+
def test_execute_basic(self):
|
|
28
|
+
pipeline = DataPipeline()
|
|
29
|
+
pipeline.add_step(lambda x: [i * 2 for i in x])
|
|
30
|
+
result = pipeline.execute([1, 2, 3])
|
|
31
|
+
assert result == [2, 4, 6]
|
|
32
|
+
|
|
33
|
+
def test_execute_no_data(self):
|
|
34
|
+
pipeline = DataPipeline()
|
|
35
|
+
pipeline.add_step(lambda x: x)
|
|
36
|
+
with pytest.raises(ValueError):
|
|
37
|
+
pipeline.execute()
|
|
38
|
+
|
|
39
|
+
def test_method_chaining(self):
|
|
40
|
+
result = (DataPipeline()
|
|
41
|
+
.add_step(lambda x: [i * 2 for i in x])
|
|
42
|
+
.add_step(lambda x: [i + 1 for i in x])
|
|
43
|
+
.execute([1, 2, 3]))
|
|
44
|
+
assert result == [3, 5, 7]
|
|
45
|
+
|
|
46
|
+
def test_clear_steps(self):
|
|
47
|
+
pipeline = DataPipeline()
|
|
48
|
+
pipeline.add_step(lambda x: x)
|
|
49
|
+
pipeline.clear_steps()
|
|
50
|
+
assert len(pipeline) == 0
|
|
51
|
+
|
|
52
|
+
def test_remove_step(self):
|
|
53
|
+
pipeline = DataPipeline()
|
|
54
|
+
pipeline.add_step(lambda x: x, "step1")
|
|
55
|
+
pipeline.add_step(lambda x: x, "step2")
|
|
56
|
+
pipeline.remove_step(0)
|
|
57
|
+
assert len(pipeline) == 1
|
|
58
|
+
assert pipeline.get_steps() == ["step2"]
|