PyPI - mlfastflow - Versions diffs - 0.0.1__tar.gz - Mend

mlfastflow 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

mlfastflow-0.0.1/LICENSE +21 -0
mlfastflow-0.0.1/PKG-INFO +68 -0
mlfastflow-0.0.1/README.md +42 -0
mlfastflow-0.0.1/examples/__init__.py +1 -0
mlfastflow-0.0.1/examples/simple_flow.py +20 -0
mlfastflow-0.0.1/examples/simple_sourcing.py +37 -0
mlfastflow-0.0.1/mlfastflow/__init__.py +12 -0
mlfastflow-0.0.1/mlfastflow/core.py +20 -0
mlfastflow-0.0.1/mlfastflow/sourcing.py +208 -0
mlfastflow-0.0.1/mlfastflow/utils.py +20 -0
mlfastflow-0.0.1/mlfastflow.egg-info/PKG-INFO +68 -0
mlfastflow-0.0.1/mlfastflow.egg-info/SOURCES.txt +17 -0
mlfastflow-0.0.1/mlfastflow.egg-info/dependency_links.txt +1 -0
mlfastflow-0.0.1/mlfastflow.egg-info/top_level.txt +3 -0
mlfastflow-0.0.1/pyproject.toml +29 -0
mlfastflow-0.0.1/setup.cfg +4 -0
mlfastflow-0.0.1/setup.py +38 -0
mlfastflow-0.0.1/tests/__init__.py +1 -0
mlfastflow-0.0.1/tests/test_core.py +27 -0

mlfastflow-0.0.1/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Xileven
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

mlfastflow-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,68 @@
+Metadata-Version: 2.2
+Name: mlfastflow
+Version: 0.0.1
+Summary: Packages for fast dataflow and workflow processing
+Home-page: https://github.com/xileven/mlfastflow
+Author: Xileven
+Author-email: Xileven <hi@bringyouhome.com>
+License: MIT
+Project-URL: Documentation, https://github.com/xileven/mlfastflow
+Project-URL: Source, https://github.com/xileven/mlfastflow
+Project-URL: Tracker, https://github.com/xileven/mlfastflow/issues
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+# MLFastFlow
+A Python package for fast dataflow and workflow processing.
+## Installation
+```bash
+pip install mlfastflow
+```
+## Features
+- Easy-to-use data sourcing with the Sourcing class
+- Flexible vector search capabilities
+- Optimized for data processing workflows
+## Quick Start
+```python
+from mlfastflow import Sourcing
+# Create a sourcing instance
+sourcing = Sourcing(
+    query_df=your_query_dataframe,
+    db_df=your_database_dataframe,
+    columns_for_sourcing=["column1", "column2"],
+    label="your_label"
+)
+# Process your data
+sourced_db_df_without_label, sourced_db_df_with_label = (
+    sourcing.sourcing()
+)
+```
+## License
+MIT
+## Author
+Xileven

mlfastflow-0.0.1/README.md ADDED Viewed

@@ -0,0 +1,42 @@
+# MLFastFlow
+A Python package for fast dataflow and workflow processing.
+## Installation
+```bash
+pip install mlfastflow
+```
+## Features
+- Easy-to-use data sourcing with the Sourcing class
+- Flexible vector search capabilities
+- Optimized for data processing workflows
+## Quick Start
+```python
+from mlfastflow import Sourcing
+# Create a sourcing instance
+sourcing = Sourcing(
+    query_df=your_query_dataframe,
+    db_df=your_database_dataframe,
+    columns_for_sourcing=["column1", "column2"],
+    label="your_label"
+)
+# Process your data
+sourced_db_df_without_label, sourced_db_df_with_label = (
+    sourcing.sourcing()
+)
+```
+## License
+MIT
+## Author
+Xileven

mlfastflow-0.0.1/examples/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlfastflow-0.0.1/examples/simple_flow.py ADDED Viewed

@@ -0,0 +1,20 @@
"""Example showing the placeholder Flow class.

``Flow`` is currently a stub, so this script only demonstrates that the class
can be imported from the ``mlfastflow`` package and instantiated.
"""

# The distribution ships its code as ``mlfastflow``; no ``pyfastflow`` package
# exists, so importing from that name fails with ModuleNotFoundError.
from mlfastflow.core import Flow


def main():
    """Create a placeholder ``Flow`` and print its string form."""
    flow = Flow()

    print(f"Created: {flow}")
    print("Note: This is currently a placeholder implementation.")


if __name__ == "__main__":
    main()

mlfastflow-0.0.1/examples/simple_sourcing.py ADDED Viewed

@@ -0,0 +1,37 @@
"""Simple example using the Sourcing class."""

import pandas as pd

# The distribution ships its code as ``mlfastflow``; no ``pyfastflow`` package
# exists, so importing from that name fails with ModuleNotFoundError.
from mlfastflow import Sourcing


def main():
    """Build two small numeric frames and initialise a ``Sourcing`` instance."""
    # Sourcing keeps only the query rows whose label column equals 1 and later
    # feeds the sourcing columns to a vector index, so the label must be a
    # 0/1 flag and the sourcing columns must be numeric.
    query_df = pd.DataFrame({
        'feature_a': [0.10, 0.20, 0.90],
        'feature_b': [1.00, 1.10, 5.00],
        'label': [1, 1, 0],
    })

    db_df = pd.DataFrame({
        'feature_a': [0.11, 0.85, 0.19, 0.50, 0.95],
        'feature_b': [1.02, 4.90, 1.08, 3.00, 5.10],
        'label': [1, 0, 1, 0, 0],
    })

    # Pre-processing (label filter, column selection, NA filling) runs inside
    # the constructor.
    sourcing = Sourcing(
        query_df=query_df,
        db_df=db_df,
        columns_for_sourcing=['feature_a', 'feature_b'],
        label='label',
    )

    print("Initialized Sourcing instance.")
    print(f"Working with {len(query_df)} query items and {len(db_df)} database items.")


if __name__ == "__main__":
    main()

mlfastflow-0.0.1/mlfastflow/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""MLFastFlow - packages for fast dataflow and workflow processing."""
+__version__ = "0.0.1"
+# Import core components
+from mlfastflow.core import Flow
+# Import sourcing functionality
+from mlfastflow.sourcing import Sourcing
+# Make these classes available at the package level
+__all__ = ['Flow', 'Sourcing']

mlfastflow-0.0.1/mlfastflow/core.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Core functionality for the mlfastflow package."""
class Flow:
    """Stub standing in for future flow functionality.

    Instances carry a fixed ``name`` attribute and nothing else.
    """

    def __init__(self, *args, **kwargs):
        """Create the placeholder; every positional and keyword argument is discarded."""
        self.name = "placeholder_flow"

    def __str__(self):
        """Return ``Flow(name='<name>')``."""
        return "Flow(name='{}')".format(self.name)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)

mlfastflow-0.0.1/mlfastflow/sourcing.py ADDED Viewed

@@ -0,0 +1,208 @@
+"""Module for data sourcing functionality in the mlfastflow package."""
+import pandas as pd
+import numpy as np
+import faiss
+from getpass import getpass
+import datetime
class Sourcing:
    """Pick database rows that are nearest neighbours of labelled query rows.

    On construction the query frame is reduced to the rows whose ``label``
    column equals 1 (when that column is present), both frames are narrowed to
    ``columns_for_sourcing`` and missing values are filled.  ``sourcing()``
    then runs an L2 nearest-neighbour search with faiss and returns the
    database rows that appear among the ``k`` nearest neighbours of any query
    row.

    Attributes:
        query_df, db_df: Pre-processed frames used for the search.
        query_df_raw, db_df_raw: Untouched copies of the constructor inputs.
        D, I: Distance and index matrices of the last successful search.
        indices: Unique database row positions selected by the last search.
        sourced_db_df: Selected rows of the pre-processed database frame.
        sourced_db_df_with_label: Selected rows of the raw database frame,
            ``None`` until ``sourcing()`` has run.
        raw_label_number, sourced_label_number: Counts filled in by
            ``validate()``, ``None`` before that.
    """

    def __init__(self,
                 query_df: pd.DataFrame,
                 db_df: pd.DataFrame,
                 columns_for_sourcing: list[str],
                 label: str,
                 fillna_method: str = 'zero',
                 k: int = 2000,
                 sourcing_rate: float = 0.2,
                 credentials: str = None):
        """Store the configuration and pre-process both frames.

        Args:
            query_df: Query rows; filtered to ``label == 1`` if it has the
                label column.
            db_df: Candidate rows to source from.
            columns_for_sourcing: Columns used as the search vector.
            label: Name of the label column.
            fillna_method: One of ``'zero'``, ``'mean'``, ``'median'``,
                ``'mode'``, ``'max'``, ``'min'``; anything else falls back to
                zero filling.
            k: Number of neighbours requested per query row.
            sourcing_rate: Stored on the instance; not used by any method of
                this class.
            credentials: Stored on the instance; not used by any method of
                this class.
        """
        self.query_df = query_df
        self.db_df = db_df

        # Raw copies are kept so validate() can still see the label column
        # after pre_process() has dropped it.
        self.query_df_raw = query_df.copy()
        self.db_df_raw = db_df.copy()

        # Was "None," (a one-element tuple) because of a stray trailing comma.
        self.sourced_db_df_with_label = None

        self.columns_for_sourcing = columns_for_sourcing
        self.label = label
        self.fillna_method = fillna_method
        self.k = k
        self.sourcing_rate = sourcing_rate
        self.credentials = credentials

        self.D = None
        self.I = None
        self.indices = []
        self.sourced_db_df = pd.DataFrame()
        self.raw_label_number = None
        self.sourced_label_number = None

        start = datetime.datetime.now()
        self.pre_process()
        end = datetime.datetime.now()
        print(f"Preprocessing took {end-start}")

    def pre_process(self):
        """Prepare the query and database frames for similarity matching.

        Steps:
            1. Keep only query rows where the label column equals 1 (skipped
               when the query frame has no label column).
            2. Reduce both frames to ``columns_for_sourcing``.
            3. Fill missing values in both frames via ``fillna``.

        The results are stored back on ``self.query_df`` / ``self.db_df``;
        nothing is returned.
        """
        print("Pre-processing started")

        if self.label in self.query_df.columns:
            self.query_df = self.query_df[self.query_df[self.label] == 1]

        self.query_df = self.query_df[self.columns_for_sourcing]
        print("Query DataFrame filtered and columns selected")

        self.db_df = self.db_df[self.columns_for_sourcing]
        print("Database DataFrame filtered and columns selected")

        self.query_df = self.fillna(self.query_df)
        print("Missing values in query DataFrame filled")

        self.db_df = self.fillna(self.db_df)
        print("Missing values in database DataFrame filled")

        print("Pre-processing completed")

    def set_fillna_method(self, method):
        """Change the strategy used by subsequent ``fillna`` calls."""
        self.fillna_method = method

    def fillna(self, df):
        """Return ``df`` with missing values filled per ``self.fillna_method``.

        For the statistic-based methods each column is filled with its own
        mean/median/mode/max/min; columns for which that statistic is itself
        NA (for example all-NA columns) are filled with 0.  An unknown method,
        or any error while computing the statistic, falls back to filling
        with 0.

        Args:
            df: Frame to fill; it is not modified.

        Returns:
            A new DataFrame without missing values.
        """
        method = self.fillna_method
        try:
            if method == 'zero':
                return df.fillna(0)

            if method == 'mode':
                # mode() can return several rows; the first holds the
                # smallest most-frequent value per column.
                fill_values = df.mode().iloc[0]
            elif method in ('mean', 'median', 'max', 'min'):
                fill_values = getattr(df, method)()
            else:
                print("Invalid fillna_method. Using 'zero' method instead.")
                return df.fillna(0)

            if fill_values.isna().any():
                print("Warning: Some columns have all NA values, using 0 for those columns")
                return df.fillna(fill_values).fillna(0)
            return df.fillna(fill_values)
        except Exception as e:
            # Deliberate best effort: a failing statistic (e.g. on non-numeric
            # data) must not abort pre-processing.
            print(f"Error in fillna: {str(e)}. Using 'zero' method instead.")
            return df.fillna(0)

    def _get_credentials(self):
        """Prompt on the terminal (without echo) and return the entered text."""
        credential = getpass("Please enter your credentials: ")
        return credential

    def indexing(self):
        """Search the database for the nearest neighbours of every query row.

        Builds a flat L2 faiss index over ``self.db_df``, searches it with
        ``self.query_df`` and stores the raw result in ``self.D`` / ``self.I``.

        Returns:
            The sorted list of unique database row positions that were
            returned for at least one query row (empty when either frame has
            no rows), or ``False`` if the search raised an error.
        """
        start = datetime.datetime.now()

        # Forget any previous run so that a failure cannot leave stale
        # neighbours behind for sourcing() / validate().
        self.D = None
        self.I = None
        self.indices = []

        try:
            if len(self.query_df) == 0 or len(self.db_df) == 0:
                print("Nothing to index: query or database DataFrame is empty.")
                return self.indices

            # faiss works on C-contiguous float32 matrices, not DataFrames.
            db_vectors = np.ascontiguousarray(self.db_df.to_numpy(), dtype=np.float32)
            query_vectors = np.ascontiguousarray(self.query_df.to_numpy(), dtype=np.float32)

            index = faiss.IndexFlatL2(db_vectors.shape[1])
            index.add(db_vectors)

            # k must not exceed the number of indexed rows.
            effective_k = min(self.k, len(self.db_df))
            self.D, self.I = index.search(query_vectors, effective_k)

            # np.unique gives a deterministic, sorted result; faiss reports a
            # missing neighbour as -1, which is not a valid row position.
            unique_positions = np.unique(self.I)
            self.indices = unique_positions[unique_positions >= 0].tolist()

            end = datetime.datetime.now()
            print(f"Indexing took {end-start}")

            return self.indices
        except Exception as e:
            print(f"Error running sourcing: {str(e)}")
            return False

    def sourcing(self):
        """Run the search and return the selected database rows.

        Returns:
            A tuple ``(sourced_db_df, sourced_db_df_with_label)``: the selected
            rows of the pre-processed database frame and the same rows of the
            raw database frame.  Both are empty when the search found nothing
            or failed.
        """
        self.indexing()
        # db_df and db_df_raw have the same row order, so the positions
        # returned by the search address the same rows in both.
        self.sourced_db_df = self.db_df.iloc[self.indices]
        self.sourced_db_df_with_label = self.db_df_raw.iloc[self.indices]
        return self.sourced_db_df, self.sourced_db_df_with_label

    def validate(self):
        """Print label and row counts before and after sourcing.

        Requires the label column in the raw database frame and a previous
        ``sourcing()`` call; otherwise a message is printed and nothing else
        happens.  The label counts are stored on ``raw_label_number`` and
        ``sourced_label_number``.
        """
        if self.label not in self.db_df_raw.columns:
            print("Label is needed in database for validation.")
            return

        if not self.indices or self.sourced_db_df_with_label is None:
            print("No results to validate. Run the sourcing process first.")
            return

        try:
            self.raw_label_number = int((self.db_df_raw[self.label] == 1).sum())
            self.sourced_label_number = int(
                (self.sourced_db_df_with_label[self.label] == 1).sum()
            )

            print("Label before sourcing: " + str(self.raw_label_number))
            print("Label after sourcing: " + str(self.sourced_label_number))

            print("Number of rows before sourcing: " + str(self.db_df_raw.shape[0]))
            print("Number of rows after sourcing: " + str(self.sourced_db_df.shape[0]))
        except Exception as e:
            print(f"Error validating: {str(e)}")

mlfastflow-0.0.1/mlfastflow/utils.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Utility functions for the mlfastflow package."""
def validate_flow(flow):
    """Check that ``flow`` exposes a list of processable nodes.

    Args:
        flow: Object expected to carry a ``nodes`` attribute.

    Returns:
        bool: True when ``flow.nodes`` is a list and every element has a
        callable ``process`` attribute; False otherwise.
    """
    nodes = getattr(flow, 'nodes', None)
    if not isinstance(nodes, list):
        return False

    # A missing ``process`` is treated like a non-callable one.
    return all(callable(getattr(node, 'process', None)) for node in nodes)

mlfastflow-0.0.1/mlfastflow.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,68 @@
+Metadata-Version: 2.2
+Name: mlfastflow
+Version: 0.0.1
+Summary: Packages for fast dataflow and workflow processing
+Home-page: https://github.com/xileven/mlfastflow
+Author: Xileven
+Author-email: Xileven <hi@bringyouhome.com>
+License: MIT
+Project-URL: Documentation, https://github.com/xileven/mlfastflow
+Project-URL: Source, https://github.com/xileven/mlfastflow
+Project-URL: Tracker, https://github.com/xileven/mlfastflow/issues
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+# MLFastFlow
+A Python package for fast dataflow and workflow processing.
+## Installation
+```bash
+pip install mlfastflow
+```
+## Features
+- Easy-to-use data sourcing with the Sourcing class
+- Flexible vector search capabilities
+- Optimized for data processing workflows
+## Quick Start
+```python
+from mlfastflow import Sourcing
+# Create a sourcing instance
+sourcing = Sourcing(
+    query_df=your_query_dataframe,
+    db_df=your_database_dataframe,
+    columns_for_sourcing=["column1", "column2"],
+    label="your_label"
+)
+# Process your data
+sourced_db_df_without_label, sourced_db_df_with_label = (
+    sourcing.sourcing()
+)
+```
+## License
+MIT
+## Author
+Xileven

mlfastflow-0.0.1/mlfastflow.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,17 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+examples/__init__.py
+examples/simple_flow.py
+examples/simple_sourcing.py
+mlfastflow/__init__.py
+mlfastflow/core.py
+mlfastflow/sourcing.py
+mlfastflow/utils.py
+mlfastflow.egg-info/PKG-INFO
+mlfastflow.egg-info/SOURCES.txt
+mlfastflow.egg-info/dependency_links.txt
+mlfastflow.egg-info/top_level.txt
+tests/__init__.py
+tests/test_core.py

mlfastflow-0.0.1/mlfastflow.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlfastflow-0.0.1/mlfastflow.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1,3 @@
+examples
+mlfastflow
+tests

mlfastflow-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,29 @@
[build-system]
# The [project] table below is only understood by setuptools 61 and newer.
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "mlfastflow"
# The version is supplied by setup.py.
dynamic = ["version"]
description = "Packages for fast dataflow and workflow processing"
readme = "README.md"
authors = [
    {name = "Xileven", email = "hi@bringyouhome.com"}
]
license = {text = "MIT"}
# Kept in step with python_requires in setup.py; mlfastflow/sourcing.py uses
# built-in generic annotations (list[str]), which need Python 3.9.
requires-python = ">=3.9"
# Runtime requirements have to be declared here: once a [project] table
# exists, setuptools ignores install_requires from setup.py unless
# "dependencies" is listed as dynamic, and the built metadata ends up with no
# Requires-Dist entries.
dependencies = [
    "pandas",
    "numpy",
    "faiss-cpu",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]

[project.urls]
Documentation = "https://github.com/xileven/mlfastflow"
Source = "https://github.com/xileven/mlfastflow"
Tracker = "https://github.com/xileven/mlfastflow/issues"

mlfastflow-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

mlfastflow-0.0.1/setup.py ADDED Viewed

@@ -0,0 +1,38 @@
"""Packaging script for mlfastflow."""

from setuptools import setup, find_packages

# Read the long description with an explicit encoding and close the file
# afterwards; a bare open().read() depends on the locale and leaks the handle.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="mlfastflow",
    version="0.0.1",
    author="Xileven",
    author_email="hi@bringyouhome.com",
    description="packages for fast dataflow and workflow processing",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/xileven/mlfastflow",
    # Restrict discovery to the library itself. A bare find_packages() also
    # picks up the ``examples`` and ``tests`` directories and installs them
    # as top-level packages.
    packages=find_packages(include=["mlfastflow", "mlfastflow.*"]),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    python_requires=">=3.9",
    install_requires=[
        "pandas",
        "numpy",
        "faiss-cpu",
    ],
    package_data={
        '': ['README.md'],
    },
    project_urls={
        'Documentation': 'https://github.com/xileven/mlfastflow',
        'Source': 'https://github.com/xileven/mlfastflow',
        'Tracker': 'https://github.com/xileven/mlfastflow/issues',
    },
)

mlfastflow-0.0.1/tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlfastflow-0.0.1/tests/test_core.py ADDED Viewed

@@ -0,0 +1,27 @@
"""Tests for the placeholder functionality of the mlfastflow package."""

import unittest

# The distribution ships its code as ``mlfastflow``; no ``pyfastflow`` package
# exists, so importing from that name fails before any test can run.
from mlfastflow.core import Flow


class TestFlow(unittest.TestCase):
    """Test cases for the placeholder Flow class."""

    def test_flow_initialization(self):
        """A Flow can be created and always gets the placeholder name."""
        flow = Flow()
        self.assertEqual(flow.name, "placeholder_flow")

        # Constructor arguments are accepted but ignored.
        flow_with_args = Flow(name="test_flow", some_arg=123)
        self.assertEqual(flow_with_args.name, "placeholder_flow")

    def test_string_representation(self):
        """str() and repr() both render the placeholder name."""
        flow = Flow()
        self.assertEqual(str(flow), "Flow(name='placeholder_flow')")
        self.assertEqual(repr(flow), "Flow(name='placeholder_flow')")


if __name__ == "__main__":
    unittest.main()