mlfastflow 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Xileven
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.2
2
+ Name: mlfastflow
3
+ Version: 0.0.1
4
+ Summary: Packages for fast dataflow and workflow processing
5
+ Home-page: https://github.com/xileven/mlfastflow
6
+ Author: Xileven
7
+ Author-email: Xileven <hi@bringyouhome.com>
8
+ License: MIT
9
+ Project-URL: Documentation, https://github.com/xileven/mlfastflow
10
+ Project-URL: Source, https://github.com/xileven/mlfastflow
11
+ Project-URL: Tracker, https://github.com/xileven/mlfastflow/issues
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Dynamic: author
24
+ Dynamic: home-page
25
+ Dynamic: requires-python
26
+
27
+ # MLFastFlow
28
+
29
+ A Python package for fast dataflow and workflow processing.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install mlfastflow
35
+ ```
36
+
37
+ ## Features
38
+
39
+ - Easy-to-use data sourcing with the Sourcing class
40
+ - Flexible vector search capabilities
41
+ - Optimized for data processing workflows
42
+
43
+ ## Quick Start
44
+
45
+ ```python
46
+ from mlfastflow import Sourcing
47
+
48
+ # Create a sourcing instance
49
+ sourcing = Sourcing(
50
+ query_df=your_query_dataframe,
51
+ db_df=your_database_dataframe,
52
+ columns_for_sourcing=["column1", "column2"],
53
+ label="your_label"
54
+ )
55
+
56
+ # Process your data
57
+ sourced_db_df_without_label, sourced_db_df_with_label = (
58
+ sourcing.sourcing()
59
+ )
60
+ ```
61
+
62
+ ## License
63
+
64
+ MIT
65
+
66
+ ## Author
67
+
68
+ Xileven
@@ -0,0 +1,42 @@
1
+ # MLFastFlow
2
+
3
+ A Python package for fast dataflow and workflow processing.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install mlfastflow
9
+ ```
10
+
11
+ ## Features
12
+
13
+ - Easy-to-use data sourcing with the Sourcing class
14
+ - Flexible vector search capabilities
15
+ - Optimized for data processing workflows
16
+
17
+ ## Quick Start
18
+
19
+ ```python
20
+ from mlfastflow import Sourcing
21
+
22
+ # Create a sourcing instance
23
+ sourcing = Sourcing(
24
+ query_df=your_query_dataframe,
25
+ db_df=your_database_dataframe,
26
+ columns_for_sourcing=["column1", "column2"],
27
+ label="your_label"
28
+ )
29
+
30
+ # Process your data
31
+ sourced_db_df_without_label, sourced_db_df_with_label = (
32
+ sourcing.sourcing()
33
+ )
34
+ ```
35
+
36
+ ## License
37
+
38
+ MIT
39
+
40
+ ## Author
41
+
42
+ Xileven
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,20 @@
1
"""Example showing the placeholder Flow class.

This is a simplified example as the Flow class is currently a placeholder.
"""

# The distribution ships the ``mlfastflow`` package; there is no
# ``pyfastflow`` module, so importing from it fails at start-up.
from mlfastflow.core import Flow


def main():
    """Create a placeholder Flow and print its string representation."""
    # Flow accepts arbitrary arguments and ignores them; none are needed here.
    flow = Flow()

    # Display information about the flow
    print(f"Created: {flow}")
    print("Note: This is currently a placeholder implementation.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,37 @@
1
"""Simple example using the Sourcing class."""

import pandas as pd

# The distribution ships the ``mlfastflow`` package; there is no
# ``pyfastflow`` module, so importing from it fails at start-up.
from mlfastflow import Sourcing


def main():
    """Build two small DataFrames and initialize a Sourcing instance with them."""
    # Create sample dataframes
    query_df = pd.DataFrame({
        'id': [1, 2, 3],
        'text': ['sample query 1', 'sample query 2', 'sample query 3'],
    })

    db_df = pd.DataFrame({
        'id': [101, 102, 103, 104, 105],
        'text': ['database item 1', 'database item 2', 'database item 3',
                 'database item 4', 'database item 5'],
    })

    # Constructing the instance already runs its pre-processing step; the
    # nearest-neighbour search itself is not started by this example.
    sourcing = Sourcing(
        query_df=query_df,
        db_df=db_df,
        columns_for_sourcing=['text'],
        label='id',
    )

    # Process data
    print("Initialized Sourcing instance.")
    print(f"Working with {len(query_df)} query items and {len(db_df)} database items.")

    # Note: This is a simplified example and may need adjustment based on
    # the exact implementation of the Sourcing class


if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
+ """MLFastFlow - packages for fast dataflow and workflow processing."""
2
+
3
+ __version__ = "0.0.1"
4
+
5
+ # Import core components
6
+ from mlfastflow.core import Flow
7
+
8
+ # Import sourcing functionality
9
+ from mlfastflow.sourcing import Sourcing
10
+
11
+ # Make these classes available at the package level
12
+ __all__ = ['Flow', 'Sourcing']
@@ -0,0 +1,20 @@
1
+ """Core functionality for the mlfastflow package."""
2
+
3
class Flow:
    """Stand-in for the future Flow implementation.

    The class currently offers nothing beyond a fixed name and a readable
    string form; it may be expanded in later versions.
    """

    def __init__(self, *args, **kwargs):
        """Create the placeholder; every positional and keyword argument is discarded."""
        self.name = "placeholder_flow"

    def __str__(self):
        """Return the display form, e.g. ``Flow(name='placeholder_flow')``."""
        return "Flow(name='{}')".format(self.name)

    def __repr__(self):
        """Reuse the ``__str__`` output as the debugging representation."""
        return self.__str__()
@@ -0,0 +1,208 @@
1
+ """Module for data sourcing functionality in the mlfastflow package."""
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import faiss
6
+ from getpass import getpass
7
+ import datetime
8
+
9
+
10
class Sourcing:
    """Pick database rows that are nearest neighbours of positive query rows.

    Query rows whose ``label`` column equals 1 are used as search vectors.
    For each of them the ``k`` closest database rows (L2 distance over
    ``columns_for_sourcing``) are looked up with a flat faiss index, and the
    union of all hits is returned by :meth:`sourcing`.

    Attributes set by the constructor:
        query_df / db_df: pre-processed frames (selected columns, NaN filled).
        query_df_raw / db_df_raw: untouched copies of the inputs.
        D / I: distance and index arrays returned by the last faiss search.
        indices: de-duplicated positional indices of the selected db rows.
        sourced_db_df / sourced_db_df_with_label: results of :meth:`sourcing`.
        sourcing_rate / credentials: stored only; no method in this class
            reads them.
    """

    def __init__(self,
                 query_df: pd.DataFrame,
                 db_df: pd.DataFrame,
                 columns_for_sourcing: list[str],
                 label: str,
                 fillna_method: str = 'zero',
                 k: int = 2000,
                 sourcing_rate: float = 0.2,
                 credentials: str = None
                 ):
        """Store the configuration and immediately pre-process both frames.

        Args:
            query_df: Query rows; filtered to ``label == 1`` when the label
                column is present.
            db_df: Database rows to search in.
            columns_for_sourcing: Feature columns present in both frames.
            label: Name of the label column.
            fillna_method: One of 'zero', 'mean', 'median', 'mode', 'max',
                'min'; anything else falls back to 'zero'.
            k: Number of neighbours requested per query row.
            sourcing_rate: Stored on the instance, currently unused.
            credentials: Stored on the instance, currently unused.
        """
        self.query_df = query_df  # assume it has label
        self.db_df = db_df

        # keep a copy for validation
        self.query_df_raw = query_df.copy()
        self.db_df_raw = db_df.copy()
        # Plain None: a trailing comma here would store the tuple (None,).
        self.sourced_db_df_with_label = None

        self.columns_for_sourcing = columns_for_sourcing
        self.label = label

        self.fillna_method = fillna_method
        self.k = k
        self.sourcing_rate = sourcing_rate

        self.D = None
        self.I = None

        self.indices = []  # Initialize as empty list
        self.sourced_db_df = pd.DataFrame()  # Initialize as empty DataFrame
        self.credentials = credentials

        start = datetime.datetime.now()
        self.pre_process()
        end = datetime.datetime.now()
        print(f"Preprocessing took {end-start}")

    def pre_process(self):
        """Prepare the query and database frames for similarity matching.

        Steps:
            1. Keep only query rows where the label column equals 1 (skipped
               when the query frame has no label column).
            2. Reduce both frames to ``columns_for_sourcing``.
            3. Fill missing values in both frames via :meth:`fillna`.

        The results replace ``self.query_df`` and ``self.db_df``; the raw
        copies are left untouched. Returns nothing.
        """
        print("Pre-processing started")
        if self.label in self.query_df.columns:
            # Filter query DataFrame to only include rows where the label column equals 1
            self.query_df = self.query_df[self.query_df[self.label] == 1]
        # Note: If label column doesn't exist, assume query_df is already filtered

        # Select columns for sourcing from both DataFrames
        self.query_df = self.query_df[self.columns_for_sourcing]
        print("Query DataFrame filtered and columns selected")

        self.db_df = self.db_df[self.columns_for_sourcing]
        print("Database DataFrame filtered and columns selected")

        # Fill missing values in both DataFrames
        self.query_df = self.fillna(self.query_df)
        print("Missing values in query DataFrame filled")

        self.db_df = self.fillna(self.db_df)
        print("Missing values in database DataFrame filled")

        print("Pre-processing completed")

    def set_fillna_method(self, method):
        """Change the fill strategy used by later :meth:`fillna` calls.

        Only the attribute is updated; frames that were already pre-processed
        are not filled again.
        """
        self.fillna_method = method

    def fillna(self, df):
        """Return ``df`` with missing values replaced per ``fillna_method``.

        For the statistic-based methods each column is filled with its own
        statistic; columns whose statistic is itself NaN (all values missing)
        are filled with 0. An unknown method, or any error while computing
        the statistic, falls back to filling with 0.

        Args:
            df: Frame to fill. It is not modified in place.

        Returns:
            A new DataFrame without missing values.
        """
        # One per-column statistic per supported method; 'zero' needs none.
        statistic_for = {
            'mean': lambda frame: frame.mean(),
            'median': lambda frame: frame.median(),
            'mode': lambda frame: frame.mode().iloc[0],
            'max': lambda frame: frame.max(),
            'min': lambda frame: frame.min(),
        }
        try:
            if self.fillna_method == 'zero':
                return df.fillna(0)

            compute = statistic_for.get(self.fillna_method)
            if compute is None:
                print("Invalid fillna_method. Using 'zero' method instead.")
                return df.fillna(0)

            fill_values = compute(df)
            if fill_values.isna().any():
                print("Warning: Some columns have all NA values, using 0 for those columns")
            # The trailing fillna(0) only touches columns without a usable statistic.
            return df.fillna(fill_values).fillna(0)
        except Exception as e:
            # Deliberate best-effort: e.g. non-numeric columns cannot be averaged.
            print(f"Error in fillna: {str(e)}. Using 'zero' method instead.")
            return df.fillna(0)

    def _get_credentials(self):
        """Prompt for a credential on the terminal and return what was typed.

        Not called by any other method of this class.
        """
        credential = getpass("Please enter your credentials: ")
        return credential

    def indexing(self):
        """Run the nearest-neighbour search and collect the matching db rows.

        Builds a flat L2 faiss index over ``self.db_df`` and searches it with
        every row of ``self.query_df``, asking for at most ``k`` neighbours
        (capped at the database size). Distances and indices are kept in
        ``self.D`` / ``self.I``.

        Returns:
            The sorted list of unique positional db indices on success, or
            ``False`` if anything went wrong (the error is printed and
            ``self.indices`` is left empty).
        """
        start = datetime.datetime.now()
        # Forget results of an earlier run so a failure cannot leave stale
        # indices behind for sourcing() to pick up.
        self.indices = []
        try:
            # faiss works on C-contiguous float32 matrices; convert explicitly
            # instead of handing it pandas objects.
            db_vectors = np.ascontiguousarray(self.db_df.to_numpy(dtype=np.float32))
            query_vectors = np.ascontiguousarray(self.query_df.to_numpy(dtype=np.float32))

            index = faiss.IndexFlatL2(query_vectors.shape[1])
            index.add(db_vectors)

            # Ensure k doesn't exceed database size
            effective_k = min(self.k, len(self.db_df))
            self.D, self.I = index.search(query_vectors, effective_k)

            # Union of all hits; negative labels are skipped because faiss
            # uses -1 to mark "no neighbour found".
            self.indices = sorted({int(hit) for hit in self.I.ravel() if hit >= 0})

            end = datetime.datetime.now()
            print(f"Indexing took {end-start}")
            return self.indices

        except Exception as e:
            print(f"Error running sourcing: {str(e)}")
            return False

    def sourcing(self):
        """Run the search and return the selected database rows.

        Returns:
            A tuple ``(sourced_db_df, sourced_db_df_with_label)``: the
            selected rows taken from the pre-processed frame (sourcing columns
            only) and the same rows taken from the raw database frame. Both
            are empty if indexing failed.
        """
        self.indexing()
        # Both frames keep the row order of the input, so the same positional
        # indices address the same rows.
        self.sourced_db_df = self.db_df.iloc[self.indices]
        self.sourced_db_df_with_label = self.db_df_raw.iloc[self.indices]
        return self.sourced_db_df, self.sourced_db_df_with_label

    def validate(self):
        """Print label and row counts before and after sourcing.

        Requires the label column in the raw database frame and a previous
        successful :meth:`sourcing` run; otherwise a message is printed and
        nothing else happens. The counts are also stored in
        ``raw_label_number`` and ``sourced_label_number``.
        """
        if self.label not in self.db_df_raw.columns:
            print("Label is needed in database for validation.")
            return

        try:
            if self.indices is None or not self.indices:
                print("No results to validate. Run the sourcing process first.")
                return

            # Positive labels in the full database
            self.raw_label_number = self.db_df_raw[self.db_df_raw[self.label] == 1].shape[0]

            # Positive labels among the sourced rows
            sourced = self.sourced_db_df_with_label
            self.sourced_label_number = sourced[sourced[self.label] == 1].shape[0]

            print("Label before sourcing: " + str(self.raw_label_number))
            print("Label after sourcing: " + str(self.sourced_label_number))

            print("Number of rows before sourcing: " + str(self.db_df_raw.shape[0]))
            print("Number of rows after sourcing: " + str(self.sourced_db_df.shape[0]))

        except Exception as e:
            print(f"Error validating: {str(e)}")
@@ -0,0 +1,20 @@
1
+ """Utility functions for the mlfastflow package."""
2
+
3
def validate_flow(flow):
    """Check that a flow exposes a usable list of nodes.

    Args:
        flow: Object expected to carry a ``nodes`` attribute.

    Returns:
        bool: True when ``flow.nodes`` is a list and every entry has a
        callable ``process`` attribute; False otherwise.
    """
    nodes = getattr(flow, 'nodes', None)
    if not isinstance(nodes, list):
        return False

    # A missing ``process`` resolves to None, which is not callable.
    return all(callable(getattr(node, 'process', None)) for node in nodes)
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.2
2
+ Name: mlfastflow
3
+ Version: 0.0.1
4
+ Summary: Packages for fast dataflow and workflow processing
5
+ Home-page: https://github.com/xileven/mlfastflow
6
+ Author: Xileven
7
+ Author-email: Xileven <hi@bringyouhome.com>
8
+ License: MIT
9
+ Project-URL: Documentation, https://github.com/xileven/mlfastflow
10
+ Project-URL: Source, https://github.com/xileven/mlfastflow
11
+ Project-URL: Tracker, https://github.com/xileven/mlfastflow/issues
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Dynamic: author
24
+ Dynamic: home-page
25
+ Dynamic: requires-python
26
+
27
+ # MLFastFlow
28
+
29
+ A Python package for fast dataflow and workflow processing.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install mlfastflow
35
+ ```
36
+
37
+ ## Features
38
+
39
+ - Easy-to-use data sourcing with the Sourcing class
40
+ - Flexible vector search capabilities
41
+ - Optimized for data processing workflows
42
+
43
+ ## Quick Start
44
+
45
+ ```python
46
+ from mlfastflow import Sourcing
47
+
48
+ # Create a sourcing instance
49
+ sourcing = Sourcing(
50
+ query_df=your_query_dataframe,
51
+ db_df=your_database_dataframe,
52
+ columns_for_sourcing=["column1", "column2"],
53
+ label="your_label"
54
+ )
55
+
56
+ # Process your data
57
+ sourced_db_df_without_label, sourced_db_df_with_label = (
58
+ sourcing.sourcing()
59
+ )
60
+ ```
61
+
62
+ ## License
63
+
64
+ MIT
65
+
66
+ ## Author
67
+
68
+ Xileven
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ examples/__init__.py
6
+ examples/simple_flow.py
7
+ examples/simple_sourcing.py
8
+ mlfastflow/__init__.py
9
+ mlfastflow/core.py
10
+ mlfastflow/sourcing.py
11
+ mlfastflow/utils.py
12
+ mlfastflow.egg-info/PKG-INFO
13
+ mlfastflow.egg-info/SOURCES.txt
14
+ mlfastflow.egg-info/dependency_links.txt
15
+ mlfastflow.egg-info/top_level.txt
16
+ tests/__init__.py
17
+ tests/test_core.py
@@ -0,0 +1,3 @@
1
+ examples
2
+ mlfastflow
3
+ tests
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
[project]
name = "mlfastflow"
# The version number is supplied by setup.py.
dynamic = ["version"]
description = "Packages for fast dataflow and workflow processing"
readme = "README.md"
authors = [
    {name = "Xileven", email = "hi@bringyouhome.com"}
]
license = {text = "MIT"}
# mlfastflow/sourcing.py annotates parameters with builtin generics
# (list[str]), which needs Python 3.9+; this also matches python_requires
# in setup.py.
requires-python = ">=3.9"
# Runtime dependencies are declared here because the generated PKG-INFO
# contains no Requires-Dist entries, i.e. install_requires from setup.py
# is not taking effect while this [project] table is present.
dependencies = [
    "pandas",
    "numpy",
    "faiss-cpu",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
25
+
26
+ [project.urls]
27
+ Documentation = "https://github.com/xileven/mlfastflow"
28
+ Source = "https://github.com/xileven/mlfastflow"
29
+ Tracker = "https://github.com/xileven/mlfastflow/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,38 @@
1
from pathlib import Path

from setuptools import setup, find_packages

# Read the README next to this file: the handle is closed right away and the
# text is decoded as UTF-8 regardless of the build machine's locale.
README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

setup(
    name="mlfastflow",
    version="0.0.1",
    author="Xileven",
    author_email="hi@bringyouhome.com",
    description="packages for fast dataflow and workflow processing",
    long_description=README_TEXT,
    long_description_content_type="text/markdown",
    url="https://github.com/xileven/mlfastflow",
    # Ship only the library itself; without the exclusions the generic
    # top-level packages "examples" and "tests" get installed as well.
    packages=find_packages(
        exclude=["examples", "examples.*", "tests", "tests.*"],
    ),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    python_requires=">=3.9",
    install_requires=[
        "pandas",
        "numpy",
        "faiss-cpu",
    ],
    package_data={
        '': ['README.md'],
    },
    project_urls={
        'Documentation': 'https://github.com/xileven/mlfastflow',
        'Source': 'https://github.com/xileven/mlfastflow',
        'Tracker': 'https://github.com/xileven/mlfastflow/issues',
    },
)
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,27 @@
1
"""Tests for the placeholder functionality of the mlfastflow package."""

import unittest

# The distribution ships the ``mlfastflow`` package; there is no
# ``pyfastflow`` module, so importing from it makes the whole test module fail.
from mlfastflow.core import Flow


class TestFlow(unittest.TestCase):
    """Test cases for the placeholder Flow class."""

    def test_flow_initialization(self):
        """A Flow can be created and always carries the placeholder name."""
        flow = Flow()
        self.assertEqual(flow.name, "placeholder_flow")

        # Constructor arguments are ignored, including an explicit name.
        flow_with_args = Flow(name="test_flow", some_arg=123)
        self.assertEqual(flow_with_args.name, "placeholder_flow")

    def test_string_representation(self):
        """str() and repr() of a Flow produce the same text."""
        flow = Flow()
        self.assertEqual(str(flow), "Flow(name='placeholder_flow')")
        self.assertEqual(repr(flow), "Flow(name='placeholder_flow')")


if __name__ == "__main__":
    unittest.main()