PyPI - oxenai - Versions diffs - 0.39.1__cp313-cp313-manylinux_2_34_x86_64.whl - Mend

oxenai 0.39.1__cp313-cp313-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of oxenai might be problematic. Click here for more details.

Files changed (35) hide show

oxen/__init__.py +62 -0
oxen/auth.py +40 -0
oxen/clone.py +58 -0
oxen/config.py +16 -0
oxen/data_frame.py +462 -0
oxen/datasets.py +106 -0
oxen/df_utils.py +54 -0
oxen/diff/__init__.py +0 -0
oxen/diff/change_type.py +12 -0
oxen/diff/diff.py +143 -0
oxen/diff/line_diff.py +41 -0
oxen/diff/tabular_diff.py +22 -0
oxen/diff/text_diff.py +48 -0
oxen/features.py +58 -0
oxen/fs.py +57 -0
oxen/init.py +19 -0
oxen/notebooks.py +97 -0
oxen/oxen.cpython-313-x86_64-linux-gnu.so +0 -0
oxen/oxen_fs.py +351 -0
oxen/providers/__init__.py +0 -0
oxen/providers/dataset_path_provider.py +26 -0
oxen/providers/mock_provider.py +73 -0
oxen/providers/oxen_data_frame_provider.py +61 -0
oxen/remote_repo.py +626 -0
oxen/repo.py +239 -0
oxen/streaming_dataset.py +242 -0
oxen/user.py +40 -0
oxen/util/__init__.py +0 -0
oxen/workspace.py +210 -0
oxen.libs/libcrypto-0787ff19.so.3 +0 -0
oxen.libs/libssl-ec2edb95.so.3 +0 -0
oxenai-0.39.1.dist-info/METADATA +92 -0
oxenai-0.39.1.dist-info/RECORD +35 -0
oxenai-0.39.1.dist-info/WHEEL +4 -0
oxenai-0.39.1.dist-info/entry_points.txt +2 -0

oxen/repo.py ADDED Viewed

@@ -0,0 +1,239 @@
+import os
+from typing import Optional
+from oxen import PyRepo
+class Repo:
+    """
+    The Repo class that allows you to interact with your local oxen repo.
+    ## Examples
+    ### Init, Add, Commit and Push
+    Adding and committing a file to a remote workspace.
+    ```python
+    import os
+    from oxen import Repo
+    # Initialize the Oxen Repository in a CatsAndDogs directory
+    directory = "CatsAndDogs"
+    repo = Repo(directory)
+    repo.init()
+    repo.add("images")
+    repo.commit("Adding all the images")
+    # Replace <namespace> and <repo_name> with your values
+    repo.set_remote("origin", "https://hub.oxen.ai/<namespace>/<repo_name>")
+    repo.push()
+    ```
+    """
+    def __init__(self, path: str = "", mkdir=False):
+        """
+        Create a new Repo object. Use .init() to initialize a new oxen repository,
+        or pass the path to an existing one.
+        Args:
+            path: `str`
+                Path to the main working directory of your oxen repo.
+            mkdir: `bool`
+                Whether to create the directory if one doesn't exist. Default: False
+        """
+        # Check if the path exists, and convert to absolute path
+        if path:
+            path = os.path.abspath(path)
+            if not os.path.exists(path) and mkdir:
+                os.makedirs(path)
+        self._repo = PyRepo(path)
+    def __repr__(self):
+        return f"Repo({self.path})"
+    def init(self):
+        """
+        Initializes a new oxen repository at the path specified in the constructor.
+        Will create a .oxen folder to store all the versions and metadata.
+        """
+        self._repo.init()
+        return self
+    def clone(
+        self,
+        url: str,
+        branch: str = "main",
+        all=False,
+        filters: Optional[str | list[str]] = None,
+    ):
+        """
+        Clone repository from a remote url.
+        Args:
+            url: `str`
+                The url of the remote repository. ex) https://hub.oxen.ai/ox/chatbot
+            branch: `str`
+                The name of the branch to clone. Default: main
+            all: `bool`
+                Whether to clone the full commit history or not. Default: False
+            filters: `str | list[str] | None`
+                Filter down the set of directories you want to clone. Useful if
+                you have a large repository and only want to make changes to a
+                specific subset of files. Default: None
+        """
+        if isinstance(filters, str):
+            filters = [filters]
+        return self._repo.clone(url, branch, all, filters)
+    def branches(self):
+        """
+        List all branches for a repo
+        """
+        return self._repo.list_branches()
+    def branch(self, name: str, delete=False):
+        """ """
+        return self._repo.branch(name, delete)
+    def branch_exists(self, name: str):
+        """ """
+        return self._repo.branch_exists(name)
+    def checkout(self, revision: str, create=False):
+        """
+        Checkout a branch or commit id.
+        Args:
+            revision: `str`
+                The name of the branch or commit id to checkout.
+            create: `bool`
+                Whether to create a new branch if it doesn't exist. Default: False
+        """
+        self._repo.checkout(revision, create)
+    def add(self, path: str):
+        """
+        Stage a file or directory to be committed.
+        """
+        # Check if the path exists
+        if not os.path.exists(path):
+            # try repo.path + path
+            path = os.path.join(self.path, path)
+        # Convert to absolute path before adding
+        path = os.path.abspath(path)
+        if not os.path.exists(path):
+            raise Exception(f"Path {path} does not exist.")
+        self._repo.add(path)
+    def add_schema_metadata(self, path: str, column_name: str, metadata: str):
+        """
+        Add schema to the local repository
+        """
+        self._repo.add_schema_metadata(path, column_name, metadata)
+    def rm(self, path: str, recursive=False, staged=False):
+        """
+        Remove a file or directory from being tracked.
+        This will not delete the file or directory.
+        Args:
+            path: `str`
+                The path to the file or directory to remove.
+            recursive: `bool`
+                Whether to remove the file or directory recursively. Default: False
+            staged: `bool`
+                Whether to remove the file or directory from the staging area.
+                Default: False
+            remote: `bool`
+                Whether to remove the file or directory from a remote workspace.
+                Default: False
+        """
+        self._repo.rm(path, recursive, staged)
+    def status(self):
+        """
+        Check the status of the repo. Returns a StagedData object.
+        """
+        return self._repo.status()
+    def commit(self, message: str):
+        """
+        Commit the staged data in a repo with a message.
+        Args:
+            message: `str`
+                The commit message.
+        """
+        return self._repo.commit(message)
+    def log(self):
+        """
+        Get the commit history for a repo.
+        """
+        return self._repo.log()
+    def set_remote(self, name: str, url: str):
+        """
+        Map a name to a remote url.
+        Args:
+            name: `str`
+                The name of the remote. Ex) origin
+            url: `str`
+                The url you want to map the name to. Ex) https://hub.oxen.ai/ox/chatbot
+        """
+        self._repo.set_remote(name, url)
+    def create_remote(self, name: str):
+        self._repo.create_remote(name)
+    def push(
+        self, remote_name: str = "origin", branch: str = "main", delete: bool = False
+    ):
+        """
+        Push data to a remote repo from a local repo.
+        Args:
+            remote_name: `str`
+                The name of the remote to push to.
+            branch: `str`
+                The name of the branch to push to.
+        """
+        return self._repo.push(remote_name, branch, delete)
+    def pull(self, remote_name: str = "origin", branch: str = "main", all=False):
+        """
+        Pull data from a remote repo to a local repo.
+        Args:
+            remote_name: `str`
+                The name of the remote to pull from.
+            branch: `str`
+                The name of the branch to pull from.
+            all: `bool`
+                Whether to pull all data from branch history or not. Default: False
+        """
+        return self._repo.pull(remote_name, branch, all)
+    @property
+    def path(self):
+        """
+        Returns the path to the repo.
+        """
+        return self._repo.path()
+    @property
+    def current_branch(self):
+        """
+        Returns the current branch.
+        """
+        return self._repo.current_branch()
+    def merge(self, branch: str):
+        """
+        Merge a branch into the current branch.
+        """
+        return self._repo.merge(branch)

oxen/streaming_dataset.py ADDED Viewed

@@ -0,0 +1,242 @@
+from oxen.providers.dataset_path_provider import DatasetPathProvider
+from oxen.providers.oxen_data_frame_provider import OxenDataFrameProvider
+from oxen import RemoteRepo
+from typing import List, Union, Optional
+from collections import deque
+from tqdm import tqdm
+import threading
+import time
+import os
+def load_dataset(
+    repo: Union[RemoteRepo, str],
+    paths: Optional[Union[str, List[str]]] = None,
+    directory: Optional[str] = None,
+    features: Optional[List[str]] = None,
+    host: Optional[str] = None,
+):
+    """
+    Load a dataset from a repo.
+    Parameters
+    ----------
+    repo : Repo
+        The oxen repository you are loading data from
+        can be a local or a remote repo
+    paths : str | List[str] | None
+        A path or set of paths to the data files needed to load the dataset.
+        all paths must be data frames.
+    directory : str | None
+        The directory to stream the data from.
+        Must be a directory of files with type data frame.
+        Can be used instead of paths.
+        (default: None)
+    features : List[str] | None
+        The columns of the dataset (default: None)
+    """
+    if isinstance(paths, str):
+        paths = [paths]
+    if isinstance(repo, str):
+        repo = RemoteRepo(repo, host=host)
+    # If they supplied a directory, list all the files in the directory to get paths
+    if directory is not None:
+        # list all the files in the directory
+        paths = repo.ls(directory)
+        # prepend the directory to the paths
+        paths = [os.path.join(directory, path.filename) for path in paths]
+    if paths is None:
+        raise ValueError("Must provide either paths or directory")
+    provider = OxenDataFrameProvider(repo, paths, features)
+    dataset = StreamingDataset(provider, features)
+    return dataset
+class StreamingDataset:
+    """
+    StreamingDataset object constructs a dataset from a remote repo.
+    It can be used to load data into a dataloader.
+    """
+    def __init__(
+        self,
+        provider: DatasetPathProvider,
+        features=None,
+        num_buffers=3,
+        buffer_size=128,
+        sleep_interval=0.1,
+    ):
+        """
+        Create a new RemoteRepo object to interact with.
+        Parameters
+        ----------
+        provider : DatasetPathProvider
+            The implementation of fetching data from a path and index
+        features : List[str] | None
+            The features of the dataset, columns, dtypes, etc.
+        paths : str | List[str]
+            The paths to the data files needed to load the dataset
+        """
+        self._provider = provider
+        self._features = features
+        # Get the paths from the provider
+        self._paths = provider.paths
+        # Compute overall size of the dataset
+        print(f"Computing dataset size for {len(self._paths)} files...")
+        self._path_sizes = [self._provider.size(path) for path in tqdm(self._paths)]
+        # print(f"path sizes... {self._path_sizes}")
+        # Culmulative sum of the path sizes
+        self._culm_sizes = [
+            sum([size[1] for size in self._path_sizes[: i + 1]])
+            for i in range(len(self._path_sizes))
+        ]
+        # print(f"Culmulative: {self._culm_sizes}")
+        # Update width and height based on features
+        if self._features is None:
+            width = self._path_sizes[0][0]
+        else:
+            width = len(self._features)
+        height = sum([size[1] for size in self._path_sizes])
+        self._size = width, height
+        print(f"Dataset size {self._size}")
+        # We are going to use a set of in memory buffers to pre-fetch data
+        # from the API. This is to avoid having to make a request for every
+        # row we want to load.
+        # n_buffers is how many slices ahead we will load into memory
+        self._n_buffers = num_buffers
+        self._buffers = deque([])
+        # print(f"Fetching {self._n_buffers} buffers...")
+        # Which path file we are on
+        self._path_idx = 0
+        # How far into the whole dataset we have fetched
+        self._fetch_idx = 0
+        # How far into the current buffer we have fetched
+        self._buffer_idx = 0
+        # we will fetch the data in chunks of this size
+        self._buffer_size = buffer_size
+        # Fill the buffers with data
+        # * kick off background thread to fill the buffers
+        # * then wait until a buffer frees up to fetch the next one
+        self._sleep_interval = sleep_interval  # seconds
+        thread = threading.Thread(target=self._start_bg_collection, args=())
+        thread.daemon = True
+        thread.start()
+    def __repr__(self):
+        return f"StreamingDataset({self._provider}, {self._paths})"
+    def __str__(self):
+        return f"StreamingDataset({self._provider}, {self._paths})"
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]
+    # Total abstracted size of the dataset
+    @property
+    def size(self):
+        return self._size
+    # For iterating over the dataset
+    def __len__(self):
+        return self._size[1]
+    # For iterating over the dataset
+    def __getitem__(self, idx):
+        # print(f"StreamingDataset.__getitem__ {idx}")
+        if idx >= self._size[1]:
+            raise IndexError(
+                f"Index {idx} out of range for dataset of size {self._size}"
+            )
+        # Make sure we have data in the first two buffers
+        # we want the second one to be filled in case
+        # we've exhausted the first one
+        while len(self._buffers) < 1 or self._buffer_idx >= len(self._buffers[0]):
+            # We will be filling this in a background thread
+            time.sleep(self._sleep_interval)
+            # If we have exhausted the first buffer, pop it,
+            # and reset the buffer index
+            if len(self._buffers) > 1 and self._buffer_idx >= len(self._buffers[0]):
+                self._buffers.popleft()
+                self._buffer_idx = 0
+        # Offset is the row we are at in the data frame
+        buffer = self._buffers[0]
+        # extract the features from the data frame if there are some
+        item = {}
+        if self._features is None:
+            item = buffer[self._buffer_idx]
+        else:
+            buffer = buffer[self._buffer_idx]
+            item = {}
+            for feature in self._features:
+                val = buffer[feature]
+                item[feature] = val
+        # Increment the buffer index
+        self._buffer_idx += 1
+        return item
+    def _start_bg_collection(self):
+        # This is run in a background thread to fill the buffers
+        # print("Start data collection...")
+        # initialize the buffers
+        while True:
+            # print(f"Initializing buffer {len(self._buffers)}...")
+            if self._path_idx >= len(self._paths):
+                # We have exhausted all the paths
+                print("No more paths to fetch")
+                return
+            # print(f"Fetching buffer {len(self._buffers)} < {self._n_buffers}")
+            if len(self._buffers) < self._n_buffers:
+                self._buffers.append(self._fetch_next_buffer())
+            else:
+                time.sleep(self._sleep_interval)
+    def _fetch_next_buffer(self):
+        # fetch the next buffer from the API
+        path_idx = self._path_idx
+        path = self._paths[path_idx]
+        start = self._fetch_idx
+        end = self._fetch_idx + self._buffer_size
+        # If we are not on the first path, we need to offset the start and end
+        if path_idx > 0:
+            culm_size = self._culm_sizes[path_idx - 1]
+            start = start - culm_size
+            end = end - culm_size
+        buffer = self._provider.slice(path, start, end)
+        self._fetch_idx += len(buffer)
+        # If we have exhausted the current path, move to the next one
+        if self._fetch_idx >= self._culm_sizes[path_idx]:
+            self._path_idx += 1
+        return buffer

oxen/user.py ADDED Viewed

@@ -0,0 +1,40 @@
+from .oxen import user, util
+from typing import Optional
+import os
+def config_user(name: str, email: str, path: Optional[str] = None):
+    """
+    Configures user for a host.
+    Args:
+        name: `str`
+            The name to use for user.
+        email: `str`
+            The email to use for user.
+        path: `Optional[str]`
+            The path to save the user config to.
+            Defaults to $HOME/.config/oxen/user_config.toml
+    """
+    if path is None:
+        path = os.path.join(util.get_oxen_config_dir(), "user_config.toml")
+    if not path.endswith(".toml"):
+        raise ValueError(f"Path {path} must end with .toml")
+    return user.config_user(name, email, path)
+def current_user(path: Optional[str] = None):
+    """
+    Gets the current user.
+    Args:
+        path: `Optional[str]`
+            The path to load the user config from.
+            Defaults to $HOME/.config/oxen/user_config.toml
+    """
+    if path is None:
+        path = os.path.join(util.get_oxen_config_dir(), "user_config.toml")
+    if not path.endswith(".toml"):
+        raise ValueError(f"Path {path} must end with .toml")
+    return user.current_user(path)

oxen/util/__init__.py ADDED Viewed

File without changes