PyPI - oxenai - Versions diffs - 0.42.4__cp312-cp312-macosx_10_13_x86_64.whl - Mend

oxenai 0.42.4__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

oxen/__init__.py +55 -0
oxen/auth.py +40 -0
oxen/clone.py +58 -0
oxen/config.py +16 -0
oxen/data_frame.py +462 -0
oxen/datasets.py +106 -0
oxen/df_utils.py +54 -0
oxen/diff/__init__.py +0 -0
oxen/diff/change_type.py +12 -0
oxen/diff/diff.py +143 -0
oxen/diff/line_diff.py +41 -0
oxen/diff/tabular_diff.py +22 -0
oxen/diff/text_diff.py +48 -0
oxen/features.py +58 -0
oxen/fs.py +57 -0
oxen/init.py +19 -0
oxen/oxen.cpython-312-darwin.so +0 -0
oxen/oxen_fs.py +351 -0
oxen/providers/__init__.py +0 -0
oxen/providers/dataset_path_provider.py +26 -0
oxen/providers/mock_provider.py +73 -0
oxen/providers/oxen_data_frame_provider.py +61 -0
oxen/remote_repo.py +656 -0
oxen/repo.py +239 -0
oxen/streaming_dataset.py +242 -0
oxen/user.py +40 -0
oxen/util/__init__.py +0 -0
oxen/workspace.py +210 -0
oxenai-0.42.4.dist-info/METADATA +92 -0
oxenai-0.42.4.dist-info/RECORD +32 -0
oxenai-0.42.4.dist-info/WHEEL +4 -0
oxenai-0.42.4.dist-info/entry_points.txt +2 -0

oxen/datasets.py ADDED Viewed

@@ -0,0 +1,106 @@
+import logging
+from typing import Optional
+from oxen import RemoteRepo
+logger = logging.getLogger(__name__)
+def load_dataset(repo_id: str, path: str, fmt: str = "hugging_face", revision=None):
+    """
+    Load a dataset from an Oxen repository into memory using the HuggingFace datasets library.
+    Args:
+        repo_id: `str`
+            The namespace/repo_name of the oxen repository to load the dataset from
+        path: `str` | Sequence[str]
+            The path to the dataset we want to load
+        fmt: `str`
+            The format of the data files. Currently only "hugging_face" is supported.
+        revision: `str` | None
+            The commit id or branch name of the version of the data to download
+    Example:
+    ```python
+    from oxen.datasets import load_dataset
+    dataset = load_dataset("datasets/gsm8k", "train.jsonl")
+    # use datasets functions as you normally would
+    dataset.shuffle()[:10]
+    ```
+    """
+    logger.info(
+        f"Loading dataset {repo_id} from {path} with format {fmt} and revision {revision}"
+    )
+    if fmt == "hugging_face":
+        # Download the data from Oxen.ai
+        download(repo_id, path, revision=revision)
+        # Use the Hugging Face datasets library to load the data
+        return _load_hf(path)
+    else:
+        raise ValueError(f"Unsupported load format: {fmt}")
+def _load_hf(path: str):
+    from datasets import load_dataset as hf_load_dataset
+    if path.endswith(".csv"):
+        return hf_load_dataset("csv", data_files=path)
+    elif path.endswith(".json") or path.endswith(".jsonl"):
+        return hf_load_dataset("json", data_files=path)
+    elif path.endswith(".parquet"):
+        return hf_load_dataset("parquet", data_files=path)
+    else:
+        raise ValueError(f"Unsupported file extension: {path}")
+def download(
+    repo_id: str, path: str, revision=None, dst=None, host="hub.oxen.ai", scheme="https"
+):
+    """
+    Download files or directories from a remote Oxen repository.
+    Args:
+        repo_id: `str`
+            The namespace/repo_name of the oxen repository to load the dataset from
+        path: `str`
+            The path to the data files
+        revision: `str | None`
+            The commit id or branch name of the version of the data to download
+        dst: `str | None`
+            The path to download the data to.
+        host: `str`
+            The host to download the data from.
+        scheme: `str`
+            The scheme to download the data with. (default: "https")
+    """
+    repo = RemoteRepo(repo_id, host=host, scheme=scheme)
+    repo.download(path, revision=revision, dst=dst)
+def upload(
+    repo_id: str, path: str, message: str, branch: Optional[str] = None, dst: str = ""
+):
+    """
+    Upload files or directories to a remote Oxen repository.
+    Args:
+        repo_id: `str`
+            The namespace/repo_name of the oxen repository to upload the dataset to
+        path: `str`
+            The path to the data files
+        message: `str`
+            The commit message to use when uploading the data
+        branch: `str | None`
+            The branch to upload the data to. If None, the `main` branch is used.
+        dst: `str | None`
+            The directory to upload the data to.
+    """
+    repo = RemoteRepo(repo_id)
+    if branch is not None:
+        repo.checkout(branch)
+    repo.add(path, dst=dst)
+    return repo.commit(message)

oxen/df_utils.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""
+The `df_utils` module provides a consistent interface for loading data frames and saving them to disk.
+Supported types: csv, parquet, json, jsonl, arrow
+Example usage:
+```python
+import os
+from oxen import df_utils
+# load a data frame
+df = df_utils.load("path/to/data.csv")
+# save a data frame
+df_utils.save(df, "path/to/save.csv")
+```
+"""
+from .oxen import df_utils
+import os
+from polars import DataFrame
+def load(
+    path: os.PathLike,
+):
+    """
+    Reads a file into a data frame. The file format is inferred from the file extension.
+    Supported types: csv, parquet, json, jsonl, arrow
+    Args:
+        path: `os.PathLike`
+            The path to the file to read.
+    Returns:
+        The value returned by the native `df_utils.load` call
+        (presumably a polars DataFrame, to be confirmed against the native module).
+    """
+    # `df_utils` here is the native module imported from `.oxen`, not this Python module.
+    return df_utils.load(path)
+def save(
+    data_frame: DataFrame,
+    path: os.PathLike,
+):
+    """
+    Saves a data frame to a file. The file format is inferred from the file extension.
+    Args:
+        data_frame: `DataFrame`
+            The polars data frame to save.
+        path: `os.PathLike`
+            The path to save the data frame to.
+    Returns:
+        The value returned by the native `df_utils.save` call.
+    """
+    # `df_utils` here is the native module imported from `.oxen`, not this Python module.
+    return df_utils.save(data_frame, path)

oxen/diff/__init__.py ADDED Viewed

File without changes

oxen/diff/change_type.py ADDED Viewed

@@ -0,0 +1,12 @@
+from enum import Enum
+class ChangeType(Enum):
+    """
+    An enum representing the type of change in a diff.
+    Each member's value is the capitalised name of the change as a string.
+    """
+    # The item exists only on the new side of the diff.
+    ADDED = "Added"
+    # The item exists only on the old side of the diff.
+    REMOVED = "Removed"
+    # The item exists on both sides with different content.
+    MODIFIED = "Modified"
+    # The item is identical on both sides.
+    UNCHANGED = "Unchanged"

oxen/diff/diff.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""
+Oxen can be used to compare data frames and return a tabular diff.
+There is more information about the diff in the
+[Diff Getting Started Documentation](/concepts/diffs).
+For example comparing two data frames will give you an output data frame,
+where the `.oxen.diff.status` column shows if the row was `added`, `removed`,
+or `modified`.
+```
+shape: (6, 7)
++-------------+-----+-----+-------+--------+-------------+-------------------+
+| file        | x   | y   | width | height | label.right | .oxen.diff.status |
+| ---         | --- | --- | ---   | ---    | ---         | ---               |
+| str         | i64 | i64 | i64   | i64    | str         | str               |
++-------------+-----+-----+-------+--------+-------------+-------------------+
+| image_0.jpg | 0   | 0   | 10    | 10     | cat         | modified          |
+| image_1.jpg | 1   | 2   | 10    | 20     | null        | removed           |
+| image_1.jpg | 200 | 100 | 10    | 20     | dog         | added             |
+| image_2.jpg | 4   | 10  | 20    | 20     | null        | removed           |
+| image_3.jpg | 4   | 10  | 20    | 20     | dog         | added             |
+| image_4.jpg | 10  | 10  | 10    | 10     | dog         | added             |
++-------------+-----+-----+-------+--------+-------------+-------------------+
+```
+## Usage
+```python
+import os
+import oxen
+result = oxen.diff("dataset_1.csv", "dataset_2.csv")
+print(result.get())
+```
+"""
+from ..oxen import PyDiff
+from ..oxen import diff as py_diff
+from oxen import df_utils
+from oxen.diff.tabular_diff import TabularDiff
+from oxen.diff.text_diff import TextDiff
+import os
+from typing import Optional
+def diff(
+    path: os.PathLike,
+    to: Optional[os.PathLike] = None,
+    repo_dir: Optional[os.PathLike] = None,
+    revision_left: Optional[str] = None,
+    revision_right: Optional[str] = None,
+    output: Optional[os.PathLike] = None,
+    keys: list[str] = [],
+    compares: list[str] = [],
+):
+    """
+    Compares data from two paths and returns a diff respecting the type of data.
+    Args:
+        path: `os.PathLike`
+            The path to diff. If `to` is not provided,
+            this will compare the data frame to the previous commit.
+        to: `os.PathLike`
+            An optional second path to compare to.
+            If provided this will be the right side of the diff.
+        repo_dir: `os.PathLike`
+            The path to the oxen repository. Must be provided if `compare_to` is
+            not provided, or if `revision_left` or `revision_right` is provided.
+            If not provided, the repository will be searched for in the current
+            working directory.
+        revision_left: `str`
+            The left revision to compare. Can be a commit hash or branch name.
+        revision_right: `str`
+            The right revision to compare. Can be a commit hash or branch name.
+        output: `os.PathLike`
+            The path to save the diff to. If not provided, the diff will not be saved.
+        keys: `list[str]`
+            Only for tabular diffs. The keys to compare on.
+            This is used to join the two data frames.
+            Keys will be combined and hashed to create a identifier for each row.
+        compares: `list[str]`
+            Only for tabular diffs. The compares to compare on.
+            This is used to compare the values of the two data frames.
+    """
+    result = py_diff.diff_paths(path, keys, to, repo_dir, revision_left, revision_right)
+    if output:
+        df_utils.save(result, output)
+    return Diff(result)
+class Diff:
+    """
+    Diff class wraps many types of diffs and provides a consistent interface.
+    For example the diff can be tabular or text. Eventually we will extend this
+    to support other types of diffs such as images, audio, etc.
+    """
+    def __init__(self, py_diff: PyDiff):
+        self._py_diff = py_diff
+    def __repr__(self) -> str:
+        return f"Diff(format={self.format})"
+    @property
+    def format(self) -> str:
+        """
+        Returns the format of the diff. Ie. tabular, text, etc.
+        """
+        return self._py_diff.format
+    @property
+    def tabular(self) -> Optional[TabularDiff]:
+        """
+        Returns the tabular diff if the diff is tabular.
+        """
+        if self.format == "tabular":
+            return TabularDiff(self._py_diff.tabular)
+        return None
+    @property
+    def text(self) -> Optional[TextDiff]:
+        """
+        Returns the text diff if the diff is text.
+        """
+        if self.format == "text":
+            return TextDiff(self._py_diff.text)
+        return None
+    def get(self):
+        """
+        Resolves the diff type and returns the appropriate diff object.
+        """
+        format = self._py_diff.format
+        if "tabular" == format:
+            return TabularDiff(self._py_diff.tabular)
+        elif "text" == format:
+            return TextDiff(self._py_diff.text)
+        else:
+            raise ValueError("The diff type is unknown.")

oxen/diff/line_diff.py ADDED Viewed

@@ -0,0 +1,41 @@
+from ..oxen import PyLineDiff, PyChangeType
+from oxen.diff.change_type import ChangeType
+class LineDiff:
+    """
+    A class representing a change in a line of text.
+    """
+    def __init__(self, diff: PyLineDiff):
+        self._diff = diff
+    def __repr__(self) -> str:
+        return (
+            f"LineDiff(modification={self._diff.modification}, text={self._diff.text})"
+        )
+    @property
+    def modification(self) -> ChangeType:
+        """
+        Returns the modification of the line diff.
+        """
+        mod_type = self._diff.modification
+        if PyChangeType.Added == mod_type:
+            return ChangeType.ADDED
+        elif PyChangeType.Removed == mod_type:
+            return ChangeType.REMOVED
+        elif PyChangeType.Modified == mod_type:
+            return ChangeType.MODIFIED
+        elif PyChangeType.Unchanged == mod_type:
+            return ChangeType.UNCHANGED
+        else:
+            raise ValueError(f"Invalid modification: {mod_type}")
+    @property
+    def text(self) -> str:
+        """
+        Returns the text of the line diff.
+        """
+        return self._diff.text

oxen/diff/tabular_diff.py ADDED Viewed

@@ -0,0 +1,22 @@
+from ..oxen import PyTabularDiff
+from polars import DataFrame
+class TabularDiff:
+    """
+    This class returns a polars data frame that represents a tabular diff.
+    It is a thin wrapper: every access is forwarded to the native diff object.
+    """
+    def __init__(self, diff: PyTabularDiff):
+        # Native tabular diff; `data` is read from it on every access.
+        self._diff = diff
+    def __repr__(self) -> str:
+        # Shape first, then a blank line, then the data frame's own string form.
+        return f"TabularDiff(shape={self._diff.data.shape})\n\n{self._diff.data}"
+    @property
+    def data(self) -> DataFrame:
+        """
+        Returns the data of the diff as a polars data frame.
+        """
+        return self._diff.data

oxen/diff/text_diff.py ADDED Viewed

@@ -0,0 +1,48 @@
+from ..oxen import PyTextDiff, PyChangeType
+from oxen.diff.line_diff import LineDiff
+class TextDiff:
+    """
+    A class representing a text diff.
+    """
+    def __init__(self, diff: PyTextDiff):
+        self._diff = diff
+    def __repr__(self) -> str:
+        return f"TextDiff(num_added={self.num_added}, num_removed={self.num_removed})"
+    def __str__(self) -> str:
+        # iterate over lines and print them with a + or - prefix
+        return "\n".join([f"{line.value}" for line in self._diff.lines])
+    @property
+    def num_added(self) -> int:
+        """
+        Returns the number of added lines in the diff.
+        """
+        # count the number of added lines
+        return self._count_lines(PyChangeType.Added)
+    @property
+    def num_removed(self) -> int:
+        """
+        Returns the number of removed lines in the diff.
+        """
+        # count the number of removed lines
+        return self._count_lines(PyChangeType.Removed)
+    @property
+    def lines(self) -> list[LineDiff]:
+        """
+        Returns the contents of the diff as a polars data frame.
+        """
+        # map the PyLineDiff to LineDiff
+        return [LineDiff(line) for line in self._diff.lines]
+    def _count_lines(self, modification: PyChangeType) -> int:
+        return len(
+            [line for line in self._diff.lines if line.modification == modification]
+        )

oxen/features.py ADDED Viewed

@@ -0,0 +1,58 @@
+from enum import Enum
+class Feature(Enum):
+    NUMERIC = 1
+    TABULAR = 2
+    TEXT = 3
+    IMAGE = 4
+    AUDIO = 5
+    VIDEO = 6
+    def __init__(self, name, dtype):
+        """
+        A feature is a column in a dataset.
+        It can be numeric, tabular, text, image, audio, or video.
+        Parameters
+        ----------
+        name: str
+            The column name
+        dtype: One of: Feature.NUMERIC, Feature.TABULAR, Feature.TEXT,
+               Feature.IMAGE, Feature.AUDIO, Feature.VIDEO
+        """
+        self._name = name
+        self._dtype = dtype
+    @property
+    def name(self) -> str:
+        return self._name
+    @property
+    def dtype(self) -> str:
+        return self._dtype
+class Features:
+    """
+    Feature is a class that represents the features you
+    want to load into a dataset. For example the input
+    and output columns of a dataset.
+    """
+    def __init__(self, features: list[Feature]):
+        """
+        Create a set of features from a list of columns.
+        Parameters
+        ----------
+        features : list[Feature]
+            The columns to load from the dataset, and their respective types.
+        """
+        self.features = features
+    def feature_names(self) -> list[str]:
+        """
+        Returns a list of the feature names.
+        """
+        return [feature.name for feature in self.features]

oxen/fs.py ADDED Viewed

@@ -0,0 +1,57 @@
+import os
+from oxen import Repo
+def rcount_files_in_dir(directory: str) -> int:
+    """
+    Counts the number of files in a repo recursively.
+    Parameters
+    ----------
+    directory : str
+        The directory to count the number of files in.
+    """
+    return sum([len(files) for _, _, files in os.walk(directory)])
+def rcount_files_in_dir_ignore_oxen(directory: str) -> int:
+    """
+    Counts the number of files in a directory recursively, ignoring the .oxen directory.
+    Parameters
+    ----------
+    directory : str
+        The directory to count the number of files in.
+    """
+    total = 0
+    for root, _, files in os.walk(directory):
+        if ".oxen" in root:
+            continue
+        total += len(files)
+    return total
+def rcount_files_in_repo(repo: Repo) -> int:
+    """
+    Recursively counts the number of files in a repo ignoring the .oxen directory.
+    Parameters
+    ----------
+    repo : Repo
+        The repository to count the number of files in.
+    Returns
+    -------
+    int
+        The file count for the directory at `repo.path`.
+    """
+    # Delegates to the directory-level counter, starting from the repo's path.
+    return rcount_files_in_dir_ignore_oxen(repo.path)
+def rcount_files_in_repo_dir(repo: Repo, directory: str) -> int:
+    """
+    Recursively counts the number of files in a directory repo within a repo.
+    Parameters
+    ----------
+    repo : Repo
+        The repository to count the number of files in.
+    directory : str
+        The directory to start the count in, relative to the repo.
+    """
+    return rcount_files_in_dir_ignore_oxen(os.path.join(repo.path, directory))

oxen/init.py ADDED Viewed

@@ -0,0 +1,19 @@
+from oxen.repo import Repo
+def init(
+    path: str = "./",
+):
+    """
+    Initialize a [Repo](/python-api/repo) at the given path.
+    Args:
+        path: `str`
+            The path to initialize the repo at.
+     Returns:
+        [Repo](/python-api/repo)
+            A Repo object that can be used to interact with the repo.
+    """
+    # Init Repo
+    repo = Repo(path)
+    return repo.init()

oxen/oxen.cpython-312-darwin.so ADDED Viewed

Binary file