PyPI - oxenai - Versions diffs - 0.42.4__cp312-cp312-macosx_10_13_x86_64.whl - Mend

oxenai 0.42.4__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

oxen/__init__.py +55 -0
oxen/auth.py +40 -0
oxen/clone.py +58 -0
oxen/config.py +16 -0
oxen/data_frame.py +462 -0
oxen/datasets.py +106 -0
oxen/df_utils.py +54 -0
oxen/diff/__init__.py +0 -0
oxen/diff/change_type.py +12 -0
oxen/diff/diff.py +143 -0
oxen/diff/line_diff.py +41 -0
oxen/diff/tabular_diff.py +22 -0
oxen/diff/text_diff.py +48 -0
oxen/features.py +58 -0
oxen/fs.py +57 -0
oxen/init.py +19 -0
oxen/oxen.cpython-312-darwin.so +0 -0
oxen/oxen_fs.py +351 -0
oxen/providers/__init__.py +0 -0
oxen/providers/dataset_path_provider.py +26 -0
oxen/providers/mock_provider.py +73 -0
oxen/providers/oxen_data_frame_provider.py +61 -0
oxen/remote_repo.py +656 -0
oxen/repo.py +239 -0
oxen/streaming_dataset.py +242 -0
oxen/user.py +40 -0
oxen/util/__init__.py +0 -0
oxen/workspace.py +210 -0
oxenai-0.42.4.dist-info/METADATA +92 -0
oxenai-0.42.4.dist-info/RECORD +32 -0
oxenai-0.42.4.dist-info/WHEEL +4 -0
oxenai-0.42.4.dist-info/entry_points.txt +2 -0

oxen/oxen_fs.py ADDED Viewed

@@ -0,0 +1,351 @@
+from __future__ import annotations
+import logging
+import os
+import tempfile
+from typing import Optional
+import fsspec
+from fsspec.utils import infer_storage_options
+from .remote_repo import RemoteRepo
+from .oxen import PyEntry
+logger = logging.getLogger(__name__)
+class OxenFS(fsspec.AbstractFileSystem):
+    """
+    OxenFS is a filesystem interface for Oxen repositories that implements the
+    [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) protocol. This
+    allows you to interact with Oxen repositories using familiar filesystem
+    operations and integrate with other compatible libraries like Pandas.
+    ## Basic Usage
+    ### Creating a Filesystem Instance
+    ```python
+    import oxen
+    # For Oxen Hub repositories
+    fs = oxen.OxenFS("ox", "Flowers")
+    # For local oxen-server
+    fs = oxen.OxenFS("ox", "test-repo", host="localhost:3000", scheme="http")
+    ```
+    ### Reading Files
+    ```python
+    with fs.open("data/train.csv") as f:
+        content = f.read()
+    ```
+    ### Writing Files
+    You must have write access to the repository to write files. See:
+    https://docs.oxen.ai/getting-started/python#private-repositories
+    OxenFS will automatically commit the file to the repository when the
+    context is exited (or the file is closed some other way). New
+    directories are automatically created as needed.
+    ```python
+    # Write with custom commit message
+    with fs.open("data/test.txt", mode="wb", commit_message="Added test.txt") as f:
+        f.write("Hello, world!")
+    # You can also set/update the commit message inside the context
+    with fs.open("data/test.txt", mode="wb") as f:
+        f.commit_message = "Updated test.txt"
+        f.write("Hello, world again!")
+    ```
+    #### Writing file objects
+    If you're integrating Oxen in a situation where you already have a file object,
+    you can save it to your repo by using `shutil.copyfileobj` like this:
+    ```python
+    import shutil
+    file_object_from_somewhere = open("data.csv")
+    with fs.open("train/data.csv", mode="wb") as output_file:
+        output_file.commit_message = "Copy from a file object"
+        shutil.copyfileobj(file_object_from_somewhere, output_file)
+    ```
+    ## Integration with Third Party Libraries (Pandas, etc.)
+    OxenFS works seamlessly with Pandas and other fsspec-compatible libraries using
+    the URL format: `oxen://namespace:repo@revision/path/to/file`
+    ### Reading Data
+    These will work with the Pandas `read_*` and `to_*` functions
+    (`csv`, `parquet`, `json`, etc.).
+    ```python
+    import pandas as pd
+    # Read parquet directly from Oxen repository
+    df = pd.read_parquet("oxen://openai:gsm8k@main/gsm8k_test.parquet")
+    ```
+    ### Writing Data
+    ```python
+    # Write DataFrame directly to Oxen repository
+    df.to_csv("oxen://ox:my-repo@main/data/test.csv", index=False)
+    ```
+    ## Notes
+    - Only binary read ("rb") and write ("wb") modes are currently supported
+        - But writing will automatically encode strings to bytes
+    - Does not yet support streaming files. All operations use temporary local files.
+    """
+    def __init__(
+        self,
+        namespace: str,
+        repo: str,
+        host: str = "hub.oxen.ai",
+        revision: str = "main",
+        scheme: str = "https",
+        **kwargs,
+    ):
+        """
+        Initialize the OxenFS instance.
+        Args:
+            namespace: `str`
+                The namespace of the repository.
+            repo: `str`
+                The name of the repository.
+            host: `str`
+                The host to connect to. Defaults to 'hub.oxen.ai'
+            revision: `str`
+                The branch name or commit id to checkout. Defaults to 'main'
+            scheme: `str`
+                The scheme to use for the remote url. Default: 'https'
+            **kwargs:
+                Passed through unchanged to the parent class constructor.
+        Raises:
+            ValueError: If the remote repo reports that it does not exist.
+        """
+        super().__init__(**kwargs)
+        self.namespace = namespace
+        self.repo_name = repo
+        self.revision = revision
+        self.scheme = scheme
+        self.host = host
+        self.repo = RemoteRepo(f"{namespace}/{repo}", host, revision, scheme)
+        if not self.repo.exists():
+            raise ValueError(f"Repo {namespace}/{repo} not found on host {host}")
+        logger.debug(f"Initialized OxenFS for {namespace}/{repo}@{revision} on {host}")
+    def __repr__(self):
+        """Return a constructor-like summary of this filesystem's settings."""
+        return (
+            f"OxenFS(namespace='{self.namespace}', repo='{self.repo_name}', "
+            f"revision='{self.revision}', host='{self.host}', scheme='{self.scheme}')"
+        )
+    def exists(self, path: str) -> bool:
+        """Return True if the repo returns metadata for `path`."""
+        return self.repo.metadata(path) is not None
+    def isfile(self, path: str) -> bool:
+        """Return True if `path` has metadata and is not a directory."""
+        metadata = self.repo.metadata(path)
+        return metadata is not None and not metadata.is_dir
+    def isdir(self, path: str) -> bool:
+        """Return True if `path` has metadata and is a directory."""
+        metadata = self.repo.metadata(path)
+        return metadata is not None and metadata.is_dir
+    def ls(self, path: str = "", detail: bool = False):
+        """
+        List the contents of a directory.
+        Args:
+            path: `str`
+                The path to list the contents of.
+            detail: `bool`
+                If True, return a list of dictionaries with detailed metadata.
+                Otherwise, return a list of strings with the filenames.
+        Returns:
+            One entry per child when `path` is a directory, a single entry
+            when `path` is a file, and an empty list when the repo returns
+            no metadata for `path`.
+        """
+        logger.debug(f"OxenFS.ls: '{path}'")
+        metadata = self.repo.metadata(path)
+        if not metadata:
+            return []
+        if not metadata.is_dir:
+            return [self._metadata_entry_to_ls_entry(metadata, detail)]
+        return [
+            self._metadata_entry_to_ls_entry(entry, detail)
+            for entry in self.repo.ls(path)
+        ]
+    @staticmethod
+    def _metadata_entry_to_ls_entry(entry: PyEntry, detail: bool = False):
+        """
+        Convert a repo entry into the form returned by `ls`.
+        Returns the entry's path when `detail` is False, otherwise a dict
+        with the keys "name", "type", "size" and "hash".
+        """
+        if not detail:
+            return entry.path
+        return {
+            "name": entry.path,
+            "type": "directory" if entry.is_dir else "file",
+            "size": entry.size,
+            "hash": entry.hash,
+        }
+    def _open(self, path: str, mode: str = "rb", **kwargs):
+        """
+        Open a file in the OxenFS backend.
+        This is normally called through `OxenFS.open()` or `fsspec.open()`.
+        Raises:
+            ValueError: If `mode` is anything other than "rb" or "wb".
+        """
+        if mode == "rb":
+            return self._open_read(path, **kwargs)
+        elif mode == "wb":
+            return self._open_write(path, **kwargs)
+        else:
+            raise ValueError(
+                "Unsupported file mode. Only rb and wb modes are supported"
+            )
+    def _open_read(self, path: str, **kwargs):
+        """
+        Download `path` to a local temporary file and return it opened for
+        binary reading. Extra keyword arguments are accepted but unused.
+        Raises:
+            FileNotFoundError: If the repo returns no metadata for `path`.
+            ValueError: If `path` is a directory.
+        """
+        logger.debug(f"Opening file {path} for reading")
+        metadata = self.repo.metadata(path)
+        if metadata is None:
+            # Same "missing" convention as exists()/isfile()/isdir(); without
+            # this check the next line fails with an opaque AttributeError.
+            raise FileNotFoundError(f"File {path} not found")
+        if metadata.is_dir:
+            raise ValueError("Cannot open directories")
+        # The context manager closes and deletes the temp file as soon as the
+        # reader below has been opened, instead of leaving that to garbage
+        # collection. The reader is opened by name first, so this relies on the
+        # platform letting an open handle outlive its deleted name (POSIX).
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            dst_path = tmp_file.name
+            self.repo.download(path, dst_path)
+            logger.debug(f"Downloaded file {path} to temp file {dst_path}")
+            return open(dst_path, "rb")
+    def _open_write(
+        self,
+        path: str,
+        commit_message: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Return an `OxenFSFileWriter` that uploads to `path` when closed.
+        Args:
+            path: `str`
+                Destination path in the repo; normalized with `os.path.normpath`.
+            commit_message: `Optional[str]`
+                Commit message handed to the writer.
+        Raises:
+            ValueError: If the file name is empty, or the parent directory of
+                `path` already exists as a file.
+        """
+        path = os.path.normpath(path)
+        logger.debug(f"Opening file {path} for writing")
+        target_dir = os.path.dirname(path)
+        file_name = os.path.basename(path).strip()
+        if file_name == "" or file_name == ".":
+            raise ValueError("File name cannot be empty")
+        try:
+            metadata = self.repo.metadata(target_dir)
+            if metadata and not metadata.is_dir:
+                raise ValueError("target_dir cannot be an existing file")
+        except ValueError as e:
+            # This handler also sees the ValueError raised just above; only a
+            # "not found" error is swallowed, because a missing directory
+            # will be created on the server.
+            if "not found" not in str(e):
+                raise
+        return OxenFSFileWriter(self.repo, file_name, target_dir, commit_message)
+    @classmethod
+    def _strip_protocol(cls, path):
+        """
+        Reduce an `oxen://namespace:repo@revision/path` URL to its in-repo path.
+        Paths without a username component are handled by the parent class.
+        """
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return super()._strip_protocol(path)
+        return opts["path"].lstrip("/")
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """
+        Extract constructor kwargs from an `oxen://namespace:repo@revision/path`
+        URL: the username is the namespace, the password is the repo name and
+        the host, when present, is the revision. Returns an empty dict when
+        the URL has no username component.
+        """
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return {}
+        # NOTE(review): a URL with a namespace but no ":repo" part presumably
+        # has no "password" key and would raise KeyError here; confirm.
+        out = {"namespace": opts["username"], "repo": opts["password"]}
+        if opts.get("host"):
+            out["revision"] = opts["host"]
+        return out
+class OxenFSFileWriter:
+    """
+    A file writer for the OxenFS backend.
+    Written data is buffered in a local temporary file and uploaded to the
+    remote repo as a commit when the writer is closed.
+    This is normally called through `OxenFS.open()` or `fsspec.open()`.
+    """
+    def __init__(
+        self,
+        repo: RemoteRepo,
+        path: str,
+        target_dir: str = "",
+        commit_message: Optional[str] = None,
+    ):
+        """
+        Args:
+            repo: `RemoteRepo`
+                The remote repo the file is uploaded to.
+            path: `str`
+                The file name passed to the upload as `file_name`.
+            target_dir: `str`
+                The directory passed to the upload as `dst_dir`.
+            commit_message: `Optional[str]`
+                Commit message; "Auto-commit from OxenFS" is used when this
+                is None or empty.
+        """
+        self.repo = repo
+        self.path = path
+        self.commit_message = commit_message or "Auto-commit from OxenFS"
+        self.target_dir = target_dir
+        self._tmp_file = tempfile.NamedTemporaryFile()
+        self.closed = False
+        logger.debug(f"Initialized OxenFSFileWriter for {path} in '{target_dir}'")
+    def __enter__(self) -> OxenFSFileWriter:
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Log any in-flight exception, then close (and so commit) the file."""
+        if exc_type is not None:
+            logger.error(
+                f"Error writing to {self.repo} {self.path}: {exc_type} {exc_value} {traceback}"
+            )
+        self.close()
+        # Don't suppress exceptions
+        return False
+    def write(self, data: str | bytes):
+        """
+        Write string or binary data to the file.
+        Strings are encoded as UTF-8. Returns the value returned by the
+        underlying temporary file's `write`.
+        """
+        if isinstance(data, str):
+            data = data.encode("utf-8")
+        return self._tmp_file.write(data)
+    def flush(self):
+        """
+        Flush the file to disk.
+        """
+        self._tmp_file.flush()
+    def tell(self):
+        """
+        Return the current position of the file.
+        """
+        return self._tmp_file.tell()
+    def seek(self, offset: int, whence: int = os.SEEK_SET):
+        """
+        Seek to a specific position in the file.
+        Returns the value returned by the underlying temporary file's `seek`.
+        """
+        return self._tmp_file.seek(offset, whence)
+    def commit(self, commit_message: Optional[str] = None):
+        """
+        Commit the file to the remote repo.
+        Args:
+            commit_message: `Optional[str]`
+                Overrides `self.commit_message` for this upload when given.
+        """
+        logger.debug(f"Committing file {self.path} to dir '{self.target_dir}'")
+        self.repo.upload(
+            self._tmp_file.name,
+            commit_message=commit_message or self.commit_message,
+            file_name=self.path,
+            dst_dir=self.target_dir,
+        )
+        logger.info(f"Committed file {self.path} to dir '{self.target_dir}'")
+    def close(self):
+        """
+        Close the file writer. This will commit the file to the remote repo.
+        Calling `close` again after the first call is a no-op, even if the
+        first call failed.
+        """
+        if self.closed:
+            return
+        logger.debug(
+            f"Closing OxenFSFileWriter for {self.path} in dir '{self.target_dir}'"
+        )
+        try:
+            self.flush()
+            self.commit()
+        finally:
+            # Release the temp file and mark the writer closed even when the
+            # upload raises, so a later close() (for example from __exit__)
+            # does not leak the file or retry the commit.
+            self._tmp_file.close()
+            self.closed = True
+        logger.debug(
+            f"Closed OxenFSFileWriter for {self.path} in dir '{self.target_dir}'"
+        )

oxen/providers/__init__.py ADDED Viewed

File without changes

oxen/providers/dataset_path_provider.py ADDED Viewed

@@ -0,0 +1,26 @@
+class DatasetPathProvider:
+    """
+    An interface for providing data by path and index.
+    Every member raises NotImplementedError here; subclasses supply the
+    actual behavior.
+    """
+    @property
+    def paths(self):
+        """Get the paths to the data files"""
+        raise NotImplementedError
+    def size(self, path) -> tuple[int, int]:
+        """
+        Get the size of the dataframe at the given path
+        Parameters
+        ----------
+        path : str
+            The path to the dataframe
+        Returns
+        -------
+        tuple[int, int]
+            The size as a (width, height) pair
+        """
+        raise NotImplementedError
+    def slice(self, path, start, end):
+        """
+        Get a slice of the dataframe at the given path
+        Parameters
+        ----------
+        path : str
+            The path to the dataframe
+        start : int
+            The start index
+        end : int
+            The end index
+        """
+        raise NotImplementedError

oxen/providers/mock_provider.py ADDED Viewed

@@ -0,0 +1,73 @@
+from oxen.providers.dataset_path_provider import DatasetPathProvider
+import time
+class MockPathProvider(DatasetPathProvider):
+    """
+    A mock implementation for providing data by path and index
+    It generates mock data with the given columns and number of rows
+    for the set of paths.
+    """
+    def __init__(
+        self,
+        paths=None,
+        num_rows=1024,
+        columns=None,
+        download_time=0.1,  # mock a slow download
+    ):
+        """
+        Parameters
+        ----------
+        paths : list[str] | None
+            The mock file paths (default: ["path_1.csv", "path_2.csv"])
+        num_rows : int
+            The number of rows generated per path
+        columns : list[str] | None
+            The column names of each row (default: ["path", "x", "y"])
+        download_time : float
+            Seconds to sleep in `slice` to mock a slow download
+        """
+        # Build the default lists per instance: a list literal in the
+        # signature would be one object shared by every call.
+        self._paths = ["path_1.csv", "path_2.csv"] if paths is None else paths
+        self._num_rows = num_rows
+        self._columns = ["path", "x", "y"] if columns is None else columns
+        self._download_time = download_time
+        self._setup()
+    def _setup(self):
+        """Generate one mock dataframe (a list of row dicts) per path."""
+        self._data_frame_paths = {
+            path: self._make_data_frame(i) for i, path in enumerate(self._paths)
+        }
+    def _make_data_frame(self, i):
+        """
+        Build the rows for the i-th path. Each cell is "<column>_<idx>",
+        where idx keeps counting up across paths.
+        """
+        offset = i * self._num_rows
+        return [
+            {col: f"{col}_{offset + j}" for col in self._columns}
+            for j in range(self._num_rows)
+        ]
+    @property
+    def paths(self):
+        return self._paths
+    def size(self, path) -> tuple[int, int]:
+        """
+        Get the size of the dataframe at the given path as (width, height).
+        Returns (0, 0) when the path is unknown or has no rows.
+        """
+        rows = self._data_frame_paths.get(path)
+        if not rows:
+            return 0, 0
+        # width x height
+        return len(rows[0]), len(rows)
+    def slice(self, path, start, end):
+        """
+        Get a slice of the dataframe at the given path
+        Parameters
+        ----------
+        path : str
+            The path to the dataframe
+        start : int
+            The start index
+        end : int
+            The end index
+        """
+        # mock a slow download
+        time.sleep(self._download_time)
+        return self._data_frame_paths[path][start:end]

oxen/providers/oxen_data_frame_provider.py ADDED Viewed

@@ -0,0 +1,61 @@
+from oxen.providers.dataset_path_provider import DatasetPathProvider
+from oxen import RemoteRepo
+from typing import List
+import json
+class OxenDataFrameProvider(DatasetPathProvider):
+    """
+    An implementation for providing data by path and index
+    It grabs rows of data from the oxen server.
+    """
+    def __init__(
+        self, repo: RemoteRepo, paths: List[str], columns: List[str] | None = None
+    ):
+        """
+        Initialize
+        Parameters
+        ----------
+        repo : RemoteRepo
+            The oxen repository you are loading data from
+        paths : List[str]
+            The paths to the data files needed to load the dataset
+        columns : List[str] | None
+            The columns of the dataset (default: None)
+        Raises
+        ------
+        ValueError
+            If `paths` is empty or None
+        """
+        # A truthiness check also rejects None with the same clear error,
+        # where len(None) would raise a TypeError.
+        if not paths:
+            raise ValueError("Paths must not be empty")
+        self._repo = repo
+        self._paths = paths
+        self._columns = columns
+    @property
+    def paths(self):
+        return self._paths
+    def size(self, path) -> tuple[int, int]:
+        """
+        Get the size of the dataframe at the given path
+        Returns whatever `RemoteRepo.get_df_size` returns; presumably a
+        (width, height) pair, to be confirmed against that method.
+        """
+        # width x height
+        return self._repo.get_df_size(path)
+    def slice(self, path, start, end):
+        """
+        Get a slice of the dataframe at the given path
+        Parameters
+        ----------
+        path : str
+            The path to the dataframe
+        start : int
+            The start index
+        end : int
+            The end index
+        Returns
+        -------
+        The JSON-decoded result of `RemoteRepo.get_df_slice`
+        """
+        return json.loads(self._repo.get_df_slice(path, start, end))