pipelinehub 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Rahul Paul
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,192 @@
1
+ Metadata-Version: 2.1
2
+ Name: pipelinehub
3
+ Version: 0.1.0
4
+ Summary: A flexible data pipeline library for custom data processing workflows
5
+ Home-page: https://github.com/rahulxj100/pipelinehub
6
+ Author: Rahul Paul
7
+ Author-email: Rahul Paul <paul.rahulxj100@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/rahulxj100/pipelinehub
10
+ Project-URL: Bug Reports, https://github.com/rahulxj100/pipelinehub/issues
11
+ Project-URL: Source, https://github.com/rahulxj100/pipelinehub
12
+ Keywords: pipeline,data,processing,workflow,etl
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Requires-Python: >=3.7
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=6.0; extra == "dev"
30
+ Requires-Dist: black; extra == "dev"
31
+ Requires-Dist: flake8; extra == "dev"
32
+ Requires-Dist: mypy; extra == "dev"
33
+
34
+ # PipelineHub
35
+
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
37
+
38
+
39
+ A flexible Python library for creating custom data processing workflows with ease.
40
+
41
+ ## ✨ Features
42
+
43
+ - 🔧 **Flexible**: Add any callable function as a processing step
44
+ - 🔗 **Chainable**: Fluent method chaining for clean, readable code
45
+ - 🐛 **Debuggable**: Verbose mode shows data flow between steps
46
+ - 🧪 **Testable**: Clear error handling with step identification
47
+ - 📦 **Lightweight**: Zero external dependencies
48
+ - 🎯 **Type-friendly**: Full type hints for better IDE support
49
+ - 🚀 **Performance**: Minimal overhead for maximum speed
50
+ - 🔄 **Reusable**: Create pipelines once, use with different datasets
51
+
52
+ ## Installation
53
+ ```bash
54
+ pip install pipelinehub
55
+ ```
56
+
57
+ ## 📖 Quick Start
58
+ ```python
59
+ from pipelinehub import DataPipeline, normalize_data, square_numbers
60
+
61
+ # Create a pipeline with multiple steps
62
+ pipeline = DataPipeline()
63
+ pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
64
+ pipeline.add_step(square_numbers, "square")
65
+ pipeline.add_step(normalize_data, "normalize")
66
+
67
+ # Execute with sample data
68
+ data = [-2, -1, 0, 1, 2, 3, 4, 5]
69
+ result = pipeline.execute(data, verbose=True)
70
+
71
+ print(result)
72
+ ```
73
+ ## 🔗 Method Chaining
74
+ Create pipelines fluently with method chaining:
75
+
76
+ ```python
77
+ from pipelinehub import DataPipeline, add_constant
78
+
79
+ # Chain operations together
80
+ result = (DataPipeline()
81
+ .add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
82
+ .add_step(add_constant(10), "add_10")
83
+ .add_step(lambda x: sorted(x, reverse=True), "sort_desc")
84
+ .execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
85
+
86
+ print(result)
87
+ ```
88
+
89
+ ## 📚 Comprehensive Examples
90
+
91
+ ### Data Cleaning Pipeline
92
+ ```python
93
+ from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
94
+
95
+ # Create a data cleaning pipeline
96
+ cleaning_pipeline = (DataPipeline()
97
+ .add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
98
+ .add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
99
+ .add_step(normalize_data, "normalize")
100
+ .add_step(calculate_stats, "final_stats"))
101
+
102
+ # Process messy data
103
+ messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
104
+ stats = cleaning_pipeline.execute(messy_data, verbose=True)
105
+ print(stats)
106
+ ```
107
+ ### Text Processing Pipeline
108
+ ```python
109
+ import re
110
+ from pipelinehub import DataPipeline
111
+
112
+ def clean_text(text):
113
+ """Remove special characters and extra whitespace."""
114
+ text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
115
+ return ' '.join(text.split())
116
+
117
+ def extract_keywords(words, min_length=4):
118
+ """Extract words with at least min_length characters."""
119
+ return [word for word in words if len(word) >= min_length]
120
+
121
+ # Build text processing pipeline
122
+ text_pipeline = (DataPipeline()
123
+ .add_step(str.lower, "lowercase")
124
+ .add_step(clean_text, "clean")
125
+ .add_step(str.split, "tokenize")
126
+ .add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
127
+ .add_step(lambda words: sorted(set(words)), "unique_and_sort"))
128
+
129
+ # Process text
130
+ text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
131
+ keywords = text_pipeline.execute(text, verbose=True)
132
+ print(keywords)
133
+ ```
134
+
135
+ ## Pipeline Management
136
+ ```python
137
+ pipeline = DataPipeline()
138
+ pipeline.add_step(lambda x: [i*2 for i in x], "double")
139
+ pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
140
+
141
+ # Inspect pipeline
142
+ print(len(pipeline)) # 2
143
+ print(pipeline.get_steps()) # ['double', 'add_one']
144
+ print(pipeline) # DataPipeline(2 steps: double, add_one)
145
+
146
+ # Remove steps
147
+ pipeline.remove_step(0) # Remove first step
148
+ print(pipeline.get_steps()) # ['add_one']
149
+
150
+ # Clear all steps
151
+ pipeline.clear_steps()
152
+ print(len(pipeline)) # 0
153
+ ```
154
+ ## 🚀 Performance Tips
155
+
156
+ - Use built-in functions when possible - they're optimized
157
+ - Avoid creating large intermediate data structures
158
+ - Consider using generators for large datasets:
159
+ ```python
160
+ def generator_step(data):
161
+ """Use generator for memory efficiency."""
162
+ for item in data:
163
+ if item > 0:
164
+ yield item * 2
165
+
166
+ pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
167
+ ```
168
+
169
+ ## 🤝 Contributing
170
+ Contributions are welcome! Here's how to get started:
171
+
172
+ - Fork the repository
173
+ - Create a feature branch: git checkout -b feature/amazing-feature
174
+ - Make your changes and add tests
175
+ - Run tests: pytest tests/
176
+ - Commit your changes: git commit -m 'Add amazing feature'
177
+ - Push to branch: git push origin feature/amazing-feature
178
+ - Open a Pull Request
179
+
180
+ ## 📄 License
181
+ This project is licensed under the MIT License - see the LICENSE file for details.
182
+ ## 🙋‍♂️ Support
183
+
184
+ Discussions: GitHub Discussions
185
+
186
+ ## 🎉 Acknowledgments
187
+
188
+ - Inspired by functional programming and Unix pipes philosophy
189
+ - Built with ❤️ for the Python community
190
+ - Thanks to all contributors and users!
191
+
192
+
@@ -0,0 +1,159 @@
1
+ # PipelineHub
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+
5
+
6
+ A flexible Python library for creating custom data processing workflows with ease.
7
+
8
+ ## ✨ Features
9
+
10
+ - 🔧 **Flexible**: Add any callable function as a processing step
11
+ - 🔗 **Chainable**: Fluent method chaining for clean, readable code
12
+ - 🐛 **Debuggable**: Verbose mode shows data flow between steps
13
+ - 🧪 **Testable**: Clear error handling with step identification
14
+ - 📦 **Lightweight**: Zero external dependencies
15
+ - 🎯 **Type-friendly**: Full type hints for better IDE support
16
+ - 🚀 **Performance**: Minimal overhead for maximum speed
17
+ - 🔄 **Reusable**: Create pipelines once, use with different datasets
18
+
19
+ ## Installation
20
+ ```bash
21
+ pip install pipelinehub
22
+ ```
23
+
24
+ ## 📖 Quick Start
25
+ ```python
26
+ from pipelinehub import DataPipeline, normalize_data, square_numbers
27
+
28
+ # Create a pipeline with multiple steps
29
+ pipeline = DataPipeline()
30
+ pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
31
+ pipeline.add_step(square_numbers, "square")
32
+ pipeline.add_step(normalize_data, "normalize")
33
+
34
+ # Execute with sample data
35
+ data = [-2, -1, 0, 1, 2, 3, 4, 5]
36
+ result = pipeline.execute(data, verbose=True)
37
+
38
+ print(result)
39
+ ```
40
+ ## 🔗 Method Chaining
41
+ Create pipelines fluently with method chaining:
42
+
43
+ ```python
44
+ from pipelinehub import DataPipeline, add_constant
45
+
46
+ # Chain operations together
47
+ result = (DataPipeline()
48
+ .add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
49
+ .add_step(add_constant(10), "add_10")
50
+ .add_step(lambda x: sorted(x, reverse=True), "sort_desc")
51
+ .execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
52
+
53
+ print(result)
54
+ ```
55
+
56
+ ## 📚 Comprehensive Examples
57
+
58
+ ### Data Cleaning Pipeline
59
+ ```python
60
+ from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
61
+
62
+ # Create a data cleaning pipeline
63
+ cleaning_pipeline = (DataPipeline()
64
+ .add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
65
+ .add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
66
+ .add_step(normalize_data, "normalize")
67
+ .add_step(calculate_stats, "final_stats"))
68
+
69
+ # Process messy data
70
+ messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
71
+ stats = cleaning_pipeline.execute(messy_data, verbose=True)
72
+ print(stats)
73
+ ```
74
+ ### Text Processing Pipeline
75
+ ```python
76
+ import re
77
+ from pipelinehub import DataPipeline
78
+
79
+ def clean_text(text):
80
+ """Remove special characters and extra whitespace."""
81
+ text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
82
+ return ' '.join(text.split())
83
+
84
+ def extract_keywords(words, min_length=4):
85
+ """Extract words with at least min_length characters."""
86
+ return [word for word in words if len(word) >= min_length]
87
+
88
+ # Build text processing pipeline
89
+ text_pipeline = (DataPipeline()
90
+ .add_step(str.lower, "lowercase")
91
+ .add_step(clean_text, "clean")
92
+ .add_step(str.split, "tokenize")
93
+ .add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
94
+ .add_step(lambda words: sorted(set(words)), "unique_and_sort"))
95
+
96
+ # Process text
97
+ text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
98
+ keywords = text_pipeline.execute(text, verbose=True)
99
+ print(keywords)
100
+ ```
101
+
102
+ ## Pipeline Management
103
+ ```python
104
+ pipeline = DataPipeline()
105
+ pipeline.add_step(lambda x: [i*2 for i in x], "double")
106
+ pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
107
+
108
+ # Inspect pipeline
109
+ print(len(pipeline)) # 2
110
+ print(pipeline.get_steps()) # ['double', 'add_one']
111
+ print(pipeline) # DataPipeline(2 steps: double, add_one)
112
+
113
+ # Remove steps
114
+ pipeline.remove_step(0) # Remove first step
115
+ print(pipeline.get_steps()) # ['add_one']
116
+
117
+ # Clear all steps
118
+ pipeline.clear_steps()
119
+ print(len(pipeline)) # 0
120
+ ```
121
+ ## 🚀 Performance Tips
122
+
123
+ - Use built-in functions when possible - they're optimized
124
+ - Avoid creating large intermediate data structures
125
+ - Consider using generators for large datasets:
126
+ ```python
127
+ def generator_step(data):
128
+ """Use generator for memory efficiency."""
129
+ for item in data:
130
+ if item > 0:
131
+ yield item * 2
132
+
133
+ pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
134
+ ```
135
+
136
+ ## 🤝 Contributing
137
+ Contributions are welcome! Here's how to get started:
138
+
139
+ - Fork the repository
140
+ - Create a feature branch: git checkout -b feature/amazing-feature
141
+ - Make your changes and add tests
142
+ - Run tests: pytest tests/
143
+ - Commit your changes: git commit -m 'Add amazing feature'
144
+ - Push to branch: git push origin feature/amazing-feature
145
+ - Open a Pull Request
146
+
147
+ ## 📄 License
148
+ This project is licensed under the MIT License - see the LICENSE file for details.
149
+ ## 🙋‍♂️ Support
150
+
151
+ Discussions: GitHub Discussions
152
+
153
+ ## 🎉 Acknowledgments
154
+
155
+ - Inspired by functional programming and Unix pipes philosophy
156
+ - Built with ❤️ for the Python community
157
+ - Thanks to all contributors and users!
158
+
159
+
@@ -0,0 +1,29 @@
1
+ """
2
+ PipelineHub - A library for creating custom data processing workflows.
3
+ """
4
+
5
+ from .pipeline import DataPipeline
6
+ from .utils import (
7
+ filter_numbers,
8
+ square_numbers,
9
+ sum_data,
10
+ normalize_data,
11
+ add_constant,
12
+ calculate_stats,
13
+ outlier_removal,
14
+ )
15
+
16
# Must match the release version declared in pyproject.toml and setup.py.
__version__ = "0.1.0"
17
+ __author__ = "Rahul Paul"
18
+ __email__ = "paul.rahulxj100@gmail.com"
19
+
20
+ __all__ = [
21
+ "DataPipeline",
22
+ "filter_numbers",
23
+ "square_numbers",
24
+ "sum_data",
25
+ "normalize_data",
26
+ "add_constant",
27
+ "calculate_stats",
28
+ "outlier_removal",
29
+ ]
@@ -0,0 +1,151 @@
1
+ """
2
+ Core DataPipeline class for creating flexible data processing workflows.
3
+ """
4
+
5
+ from typing import Any, Callable, List, Optional, Union
6
+
7
+
8
class DataPipeline:
    """
    A flexible data pipeline that allows adding custom processing steps.

    Steps are callables stored in insertion order; ``execute`` feeds the
    return value of each step into the next one. Step names are kept in a
    list parallel to ``steps`` and are used in verbose output, error
    messages and ``repr``.
    """

    def __init__(self, data: Any = None):
        """
        Initialize a new DataPipeline.

        Args:
            data: Optional initial data for the pipeline
        """
        self.data = data
        self.steps: List[Callable] = []
        self.step_names: List[str] = []

    def add_step(self, func: Callable, name: Optional[str] = None) -> "DataPipeline":
        """
        Add a custom processing step to the end of the pipeline.

        Args:
            func: A function that takes data as input and returns transformed data
            name: Optional name for the step (for debugging/logging). When
                omitted or empty, ``func.__name__`` is used, falling back to
                ``step_<position>`` for callables without a ``__name__``.

        Returns:
            self (for method chaining)

        Raises:
            ValueError: If func is not callable
        """
        if not callable(func):
            raise ValueError("Step must be a callable function")

        self.steps.append(func)
        # len(self.steps) already counts the new step, so the fallback name
        # is 1-based.
        step_name = name or getattr(func, '__name__', f"step_{len(self.steps)}")
        self.step_names.append(step_name)

        return self

    def set_data(self, data: Any) -> "DataPipeline":
        """
        Set the initial data for the pipeline.

        Args:
            data: The data to process

        Returns:
            self (for method chaining)
        """
        self.data = data
        return self

    def execute(self, data: Any = None, verbose: bool = False) -> Any:
        """
        Execute all steps in the pipeline, in the order they were added.

        Args:
            data: Optional data to process (overrides instance data)
            verbose: Print step-by-step execution info

        Returns:
            Transformed data after all steps

        Raises:
            ValueError: If no data is provided
            RuntimeError: If any step fails during execution; the original
                exception is attached as ``__cause__``
        """
        # Compare against None explicitly so falsy inputs such as [] or 0
        # are still treated as real data.
        current_data = data if data is not None else self.data

        if current_data is None:
            raise ValueError("No data provided. Use set_data() or pass data to execute()")

        if verbose:
            print(f"Starting pipeline with {len(self.steps)} steps")
            data_info = self._get_data_info(current_data)
            print(f"Initial data: {data_info}")

        for i, (step, step_name) in enumerate(zip(self.steps, self.step_names), start=1):
            try:
                if verbose:
                    print(f"\nStep {i}: {step_name}")

                current_data = step(current_data)

                if verbose:
                    data_info = self._get_data_info(current_data)
                    print(f" Output: {data_info}")

            except Exception as e:
                # Chain explicitly so the failing step's traceback is kept
                # as the direct cause of the wrapper error.
                raise RuntimeError(f"Error in step {i} ({step_name}): {e}") from e

        return current_data

    def clear_steps(self) -> "DataPipeline":
        """
        Remove all steps from the pipeline.

        Returns:
            self (for method chaining)
        """
        self.steps.clear()
        self.step_names.clear()
        return self

    def remove_step(self, index: int) -> "DataPipeline":
        """
        Remove a step by index.

        An index outside ``0 <= index < len(pipeline)`` (including negative
        indices) is ignored and the pipeline is left unchanged.

        Args:
            index: Zero-based index of the step to remove

        Returns:
            self (for method chaining)
        """
        if 0 <= index < len(self.steps):
            # Pop from both lists so steps and names stay aligned.
            self.steps.pop(index)
            self.step_names.pop(index)
        return self

    def get_steps(self) -> List[str]:
        """
        Get list of step names.

        Returns:
            A copy of the step-name list, so callers cannot mutate the
            pipeline through it
        """
        return self.step_names.copy()

    def _get_data_info(self, data: Any) -> str:
        """
        Get descriptive info about data for verbose output.

        Returns ``"<type> with <n> elements"`` when ``len(data)`` works and
        just ``"<type>"`` otherwise.
        """
        data_type = type(data).__name__
        try:
            size = len(data)
        except TypeError:
            # No usable length: either there is no __len__, or it exists but
            # len() rejects the object (e.g. a class object passed as data).
            return f"{data_type}"
        return f"{data_type} with {size} elements"

    def __len__(self) -> int:
        """Return number of steps in the pipeline."""
        return len(self.steps)

    def __repr__(self) -> str:
        """String representation of the pipeline."""
        steps_str = ', '.join(self.step_names) if self.step_names else "no steps"
        return f"DataPipeline({len(self.steps)} steps: {steps_str})"
@@ -0,0 +1,121 @@
1
+ """
2
+ Utility functions for common data processing operations.
3
+ """
4
+
5
+ from typing import Any, Callable, Dict, List, Union
6
+
7
+
8
def filter_numbers(data: List[Union[int, float]], min_val: Union[int, float] = 0) -> List[Union[int, float]]:
    """
    Keep only the values that are strictly greater than a threshold.

    Args:
        data: Numbers to filter
        min_val: Exclusive lower bound; values equal to it are dropped

    Returns:
        New list with the surviving values in their original order
    """
    kept = []
    for value in data:
        if value > min_val:
            kept.append(value)
    return kept
20
+
21
+
22
def square_numbers(data: List[Union[int, float]]) -> List[Union[int, float]]:
    """
    Raise every number in the data to the power of two.

    Args:
        data: Numbers to square

    Returns:
        New list holding the square of each input value, in order
    """
    squared = []
    for value in data:
        squared.append(value ** 2)
    return squared
33
+
34
+
35
def sum_data(data: List[Union[int, float]]) -> Union[int, float]:
    """
    Add up all numbers in the data.

    Args:
        data: Numbers to add together

    Returns:
        The total as computed by the built-in ``sum`` (0 for empty input)
    """
    total = sum(data)
    return total
46
+
47
+
48
def normalize_data(data: List[Union[int, float]]) -> List[float]:
    """
    Normalize data to 0-1 range (min-max scaling).

    The result is always a new list; the input is never returned or
    modified.

    Args:
        data: List of numbers

    Returns:
        Normalized list of numbers. Empty input yields an empty list, and
        input whose values are all equal yields a list of zeros.
    """
    if not data:
        # Return a fresh list rather than the caller's own object so that
        # mutating the result can never affect the input.
        return []
    min_val, max_val = min(data), max(data)
    if min_val == max_val:
        # Zero range: avoid dividing by zero and map every value to 0.0.
        return [0.0] * len(data)
    span = max_val - min_val
    return [(x - min_val) / span for x in data]
64
+
65
+
66
def add_constant(constant: Union[int, float]) -> Callable:
    """
    Build a pipeline step that shifts every value by a fixed amount.

    Args:
        constant: Amount added to each element

    Returns:
        A function taking a list of numbers and returning a new list with
        ``constant`` added to each; its ``__name__`` is ``add_<constant>``
    """
    def add_const(data: List[Union[int, float]]) -> List[Union[int, float]]:
        shifted = []
        for value in data:
            shifted.append(value + constant)
        return shifted

    # DataPipeline.add_step falls back to __name__ when no step name is given,
    # so give the closure a descriptive one.
    add_const.__name__ = "add_{}".format(constant)
    return add_const
81
+
82
+
83
def calculate_stats(data: List[Union[int, float]]) -> Dict[str, Union[int, float]]:
    """
    Summarize the data with a few basic statistics.

    Args:
        data: Numbers to summarize

    Returns:
        Dictionary with the keys ``count``, ``mean``, ``min`` and ``max``,
        or an empty dictionary when there is no data
    """
    stats: Dict[str, Union[int, float]] = {}
    if data:
        count = len(data)
        stats['count'] = count
        stats['mean'] = sum(data) / count
        stats['min'] = min(data)
        stats['max'] = max(data)
    return stats
101
+
102
+
103
def outlier_removal(data: List[Union[int, float]], threshold: float = 2.0) -> List[Union[int, float]]:
    """
    Remove outliers using simple threshold method.

    A value is kept when its distance from the mean is at most
    ``threshold`` population standard deviations. The result is always a
    new list; the input is never returned or modified.

    Args:
        data: List of numbers
        threshold: Standard deviation threshold

    Returns:
        Data with outliers removed. Inputs with fewer than two values are
        returned unchanged (as a copy).
    """
    if len(data) < 2:
        # Too few points to estimate a spread. Copy so the caller gets a new
        # list here just like on the normal path below.
        return list(data)

    count = len(data)
    mean = sum(data) / count
    # Population variance (divides by N, not N - 1).
    variance = sum((x - mean) ** 2 for x in data) / count
    std_dev = variance ** 0.5
    limit = threshold * std_dev

    return [x for x in data if abs(x - mean) <= limit]
@@ -0,0 +1,192 @@
1
+ Metadata-Version: 2.1
2
+ Name: pipelinehub
3
+ Version: 0.1.0
4
+ Summary: A flexible data pipeline library for custom data processing workflows
5
+ Home-page: https://github.com/rahulxj100/pipelinehub
6
+ Author: Rahul Paul
7
+ Author-email: Rahul Paul <paul.rahulxj100@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/rahulxj100/pipelinehub
10
+ Project-URL: Bug Reports, https://github.com/rahulxj100/pipelinehub/issues
11
+ Project-URL: Source, https://github.com/rahulxj100/pipelinehub
12
+ Keywords: pipeline,data,processing,workflow,etl
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Requires-Python: >=3.7
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=6.0; extra == "dev"
30
+ Requires-Dist: black; extra == "dev"
31
+ Requires-Dist: flake8; extra == "dev"
32
+ Requires-Dist: mypy; extra == "dev"
33
+
34
+ # PipelineHub
35
+
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
37
+
38
+
39
+ A flexible Python library for creating custom data processing workflows with ease.
40
+
41
+ ## ✨ Features
42
+
43
+ - 🔧 **Flexible**: Add any callable function as a processing step
44
+ - 🔗 **Chainable**: Fluent method chaining for clean, readable code
45
+ - 🐛 **Debuggable**: Verbose mode shows data flow between steps
46
+ - 🧪 **Testable**: Clear error handling with step identification
47
+ - 📦 **Lightweight**: Zero external dependencies
48
+ - 🎯 **Type-friendly**: Full type hints for better IDE support
49
+ - 🚀 **Performance**: Minimal overhead for maximum speed
50
+ - 🔄 **Reusable**: Create pipelines once, use with different datasets
51
+
52
+ ## Installation
53
+ ```bash
54
+ pip install pipelinehub
55
+ ```
56
+
57
+ ## 📖 Quick Start
58
+ ```python
59
+ from pipelinehub import DataPipeline, normalize_data, square_numbers
60
+
61
+ # Create a pipeline with multiple steps
62
+ pipeline = DataPipeline()
63
+ pipeline.add_step(lambda x: [i for i in x if i > 0], "filter_positive")
64
+ pipeline.add_step(square_numbers, "square")
65
+ pipeline.add_step(normalize_data, "normalize")
66
+
67
+ # Execute with sample data
68
+ data = [-2, -1, 0, 1, 2, 3, 4, 5]
69
+ result = pipeline.execute(data, verbose=True)
70
+
71
+ print(result)
72
+ ```
73
+ ## 🔗 Method Chaining
74
+ Create pipelines fluently with method chaining:
75
+
76
+ ```python
77
+ from pipelinehub import DataPipeline, add_constant
78
+
79
+ # Chain operations together
80
+ result = (DataPipeline()
81
+ .add_step(lambda x: [i for i in x if i % 2 == 0], "filter_even")
82
+ .add_step(add_constant(10), "add_10")
83
+ .add_step(lambda x: sorted(x, reverse=True), "sort_desc")
84
+ .execute([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
85
+
86
+ print(result)
87
+ ```
88
+
89
+ ## 📚 Comprehensive Examples
90
+
91
+ ### Data Cleaning Pipeline
92
+ ```python
93
+ from pipelinehub import DataPipeline, outlier_removal, normalize_data, calculate_stats
94
+
95
+ # Create a data cleaning pipeline
96
+ cleaning_pipeline = (DataPipeline()
97
+ .add_step(lambda x: [float(i) for i in x if i is not None], "convert_and_filter")
98
+ .add_step(lambda x: outlier_removal(x, threshold=2.5), "remove_outliers")
99
+ .add_step(normalize_data, "normalize")
100
+ .add_step(calculate_stats, "final_stats"))
101
+
102
+ # Process messy data
103
+ messy_data = [1, 2, 3, None, 100, 4, 5, 6, 7, 8, 9]
104
+ stats = cleaning_pipeline.execute(messy_data, verbose=True)
105
+ print(stats)
106
+ ```
107
+ ### Text Processing Pipeline
108
+ ```python
109
+ import re
110
+ from pipelinehub import DataPipeline
111
+
112
+ def clean_text(text):
113
+ """Remove special characters and extra whitespace."""
114
+ text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
115
+ return ' '.join(text.split())
116
+
117
+ def extract_keywords(words, min_length=4):
118
+ """Extract words with at least min_length characters."""
119
+ return [word for word in words if len(word) >= min_length]
120
+
121
+ # Build text processing pipeline
122
+ text_pipeline = (DataPipeline()
123
+ .add_step(str.lower, "lowercase")
124
+ .add_step(clean_text, "clean")
125
+ .add_step(str.split, "tokenize")
126
+ .add_step(lambda words: extract_keywords(words, min_length=4), "extract_keywords")
127
+ .add_step(lambda words: sorted(set(words)), "unique_and_sort"))
128
+
129
+ # Process text
130
+ text = "Hello World! This is a Sample Text for Processing... With special chars!!!"
131
+ keywords = text_pipeline.execute(text, verbose=True)
132
+ print(keywords)
133
+ ```
134
+
135
+ ## Pipeline Management
136
+ ```python
137
+ pipeline = DataPipeline()
138
+ pipeline.add_step(lambda x: [i*2 for i in x], "double")
139
+ pipeline.add_step(lambda x: [i+1 for i in x], "add_one")
140
+
141
+ # Inspect pipeline
142
+ print(len(pipeline)) # 2
143
+ print(pipeline.get_steps()) # ['double', 'add_one']
144
+ print(pipeline) # DataPipeline(2 steps: double, add_one)
145
+
146
+ # Remove steps
147
+ pipeline.remove_step(0) # Remove first step
148
+ print(pipeline.get_steps()) # ['add_one']
149
+
150
+ # Clear all steps
151
+ pipeline.clear_steps()
152
+ print(len(pipeline)) # 0
153
+ ```
154
+ ## 🚀 Performance Tips
155
+
156
+ - Use built-in functions when possible - they're optimized
157
+ - Avoid creating large intermediate data structures
158
+ - Consider using generators for large datasets:
159
+ ```python
160
+ def generator_step(data):
161
+ """Use generator for memory efficiency."""
162
+ for item in data:
163
+ if item > 0:
164
+ yield item * 2
165
+
166
+ pipeline = DataPipeline().add_step(lambda x: list(generator_step(x)), "process")
167
+ ```
168
+
169
+ ## 🤝 Contributing
170
+ Contributions are welcome! Here's how to get started:
171
+
172
+ - Fork the repository
173
+ - Create a feature branch: git checkout -b feature/amazing-feature
174
+ - Make your changes and add tests
175
+ - Run tests: pytest tests/
176
+ - Commit your changes: git commit -m 'Add amazing feature'
177
+ - Push to branch: git push origin feature/amazing-feature
178
+ - Open a Pull Request
179
+
180
+ ## 📄 License
181
+ This project is licensed under the MIT License - see the LICENSE file for details.
182
+ ## 🙋‍♂️ Support
183
+
184
+ Discussions: GitHub Discussions
185
+
186
+ ## 🎉 Acknowledgments
187
+
188
+ - Inspired by functional programming and Unix pipes philosophy
189
+ - Built with ❤️ for the Python community
190
+ - Thanks to all contributors and users!
191
+
192
+
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ ./pipelinehub/__init__.py
6
+ ./pipelinehub/pipeline.py
7
+ ./pipelinehub/utils.py
8
+ pipelinehub/__init__.py
9
+ pipelinehub/pipeline.py
10
+ pipelinehub/utils.py
11
+ pipelinehub.egg-info/PKG-INFO
12
+ pipelinehub.egg-info/SOURCES.txt
13
+ pipelinehub.egg-info/dependency_links.txt
14
+ pipelinehub.egg-info/not-zip-safe
15
+ pipelinehub.egg-info/requires.txt
16
+ pipelinehub.egg-info/top_level.txt
17
+ tests/test_pipeline.py
@@ -0,0 +1,6 @@
1
+
2
+ [dev]
3
+ pytest>=6.0
4
+ black
5
+ flake8
6
+ mypy
@@ -0,0 +1 @@
1
+ pipelinehub
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pipelinehub"
7
+ version = "0.1.0"
8
+ authors = [
9
+ {name = "Rahul Paul", email = "paul.rahulxj100@gmail.com"},
10
+ ]
11
+ description = "A flexible data pipeline library for custom data processing workflows"
12
+ readme = "README.md"
13
+ license = {text = "MIT"}
14
+ requires-python = ">=3.7"
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.7",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ "Topic :: Scientific/Engineering :: Information Analysis",
28
+ ]
29
+ keywords = ["pipeline", "data", "processing", "workflow", "etl"]
30
+ dependencies = []
31
+
32
+ [project.optional-dependencies]
33
+ dev = ["pytest>=6.0", "black", "flake8", "mypy"]
34
+
35
+ [project.urls]
36
+ "Homepage" = "https://github.com/rahulxj100/pipelinehub"
37
+ "Bug Reports" = "https://github.com/rahulxj100/pipelinehub/issues"
38
+ "Source" = "https://github.com/rahulxj100/pipelinehub"
39
+
40
+ [tool.setuptools]
41
+ packages = ["pipelinehub"]
42
+
43
+ [tool.setuptools.package-dir]
44
+ "" = "."
45
+
46
+ [tool.black]
47
+ line-length = 88
48
+ target-version = ['py37']
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests"]
52
+ python_files = ["test_*.py"]
53
+ python_classes = ["Test*"]
54
+ python_functions = ["test_*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,49 @@
1
+ from setuptools import setup, find_packages
2
+ import os
3
+
4
+ # Read README file
5
def read_readme():
    """
    Return the text of the README.md that sits next to this file.

    Falls back to a one-line project summary when the file does not exist,
    so the long description is never empty.
    """
    base_dir = os.path.dirname(__file__)
    target = os.path.join(base_dir, 'README.md')
    try:
        with open(target, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        contents = "A flexible data pipeline library for custom data processing workflows"
    return contents
12
+
13
# Package metadata for setuptools. The name, version, classifiers, keywords,
# Python requirement and "dev" extra repeat what pyproject.toml declares, so
# the two files have to be edited together.
setup(
    name="pipelinehub",
    version="0.1.0",
    author="Rahul Paul",
    author_email="paul.rahulxj100@gmail.com",
    description="A flexible data pipeline library for custom data processing workflows",
    # Full README text (or the short fallback summary from read_readme()).
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/rahulxj100/pipelinehub",
    project_urls={
        "Bug Reports": "https://github.com/rahulxj100/pipelinehub/issues",
        "Source": "https://github.com/rahulxj100/pipelinehub",
    },
    packages=find_packages(),
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Scientific/Engineering :: Information Analysis",
    ],
    keywords=["pipeline", "data", "processing", "workflow", "etl"],
    python_requires=">=3.7",
    # No runtime dependencies; tooling is only pulled in by the "dev" extra.
    install_requires=[],
    extras_require={
        "dev": ["pytest>=6.0", "black", "flake8", "mypy"],
    },
    include_package_data=True,
    zip_safe=False,
)
@@ -0,0 +1,58 @@
1
+ import pytest
2
+ from pipelinehub import DataPipeline
3
+
4
+
5
class TestDataPipeline:
    """Tests for DataPipeline construction, step management and execution."""

    def test_init(self):
        """A new pipeline has no stored data and no steps."""
        fresh = DataPipeline()
        assert fresh.data is None
        assert len(fresh) == 0

    def test_init_with_data(self):
        """Data passed to the constructor is stored on the instance."""
        payload = [1, 2, 3]
        assert DataPipeline(payload).data == payload

    def test_add_step(self):
        """Adding one callable makes the pipeline one step long."""
        subject = DataPipeline()
        subject.add_step(lambda x: x)
        assert len(subject) == 1

    def test_add_step_invalid(self):
        """A non-callable step is rejected with ValueError."""
        subject = DataPipeline()
        with pytest.raises(ValueError):
            subject.add_step("not a function")

    def test_execute_basic(self):
        """A single step is applied to the data given to execute()."""
        subject = DataPipeline()
        subject.add_step(lambda x: [i * 2 for i in x])
        outcome = subject.execute([1, 2, 3])
        assert outcome == [2, 4, 6]

    def test_execute_no_data(self):
        """Executing without any data raises ValueError."""
        subject = DataPipeline()
        subject.add_step(lambda x: x)
        with pytest.raises(ValueError):
            subject.execute()

    def test_method_chaining(self):
        """add_step() returns the pipeline so calls can be chained in order."""
        chained = DataPipeline()
        chained = chained.add_step(lambda x: [i * 2 for i in x])
        chained = chained.add_step(lambda x: [i + 1 for i in x])
        outcome = chained.execute([1, 2, 3])
        assert outcome == [3, 5, 7]

    def test_clear_steps(self):
        """clear_steps() empties the pipeline."""
        subject = DataPipeline()
        subject.add_step(lambda x: x)
        subject.clear_steps()
        assert len(subject) == 0

    def test_remove_step(self):
        """remove_step() drops the step at the given index and its name."""
        subject = DataPipeline()
        subject.add_step(lambda x: x, "step1")
        subject.add_step(lambda x: x, "step2")
        subject.remove_step(0)
        assert len(subject) == 1
        assert subject.get_steps() == ["step2"]