oxenai 0.42.4__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oxen/datasets.py ADDED
@@ -0,0 +1,106 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ from oxen import RemoteRepo
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
def load_dataset(repo_id: str, path: str, fmt: str = "hugging_face", revision=None):
    """
    Download a data file from an Oxen repository and load it into memory with
    the Hugging Face `datasets` library.

    Args:
        repo_id: `str`
            The namespace/repo_name of the Oxen repository that holds the data.
        path: `str`
            The path of the data file to load. The same value is used for the
            download and for the local load.
        fmt: `str`
            The loader to use. Only "hugging_face" is supported.
        revision: `str` | None
            The commit id or branch name of the version of the data to download.

    Returns:
        The object produced by `_load_hf` for the downloaded file.

    Raises:
        ValueError: If `fmt` is not "hugging_face".

    Example:
        ```python
        from oxen.datasets import load_dataset
        dataset = load_dataset("datasets/gsm8k", "train.jsonl")
        # use datasets functions as you normally would
        dataset.shuffle()[:10]
        ```
    """
    logger.info(
        f"Loading dataset {repo_id} from {path} with format {fmt} and revision {revision}"
    )

    # Guard clause: unknown formats are rejected before anything is downloaded.
    if fmt != "hugging_face":
        raise ValueError(f"Unsupported load format: {fmt}")

    # Fetch the file from Oxen.ai first ...
    # NOTE(review): no `dst` is passed, so this presumably writes to `path`
    # relative to the working directory, which is where `_load_hf` reads it.
    download(repo_id, path, revision=revision)

    # ... then let the Hugging Face datasets library parse it.
    return _load_hf(path)
42
+
43
+
44
+ def _load_hf(path: str):
45
+ from datasets import load_dataset as hf_load_dataset
46
+
47
+ if path.endswith(".csv"):
48
+ return hf_load_dataset("csv", data_files=path)
49
+ elif path.endswith(".json") or path.endswith(".jsonl"):
50
+ return hf_load_dataset("json", data_files=path)
51
+ elif path.endswith(".parquet"):
52
+ return hf_load_dataset("parquet", data_files=path)
53
+ else:
54
+ raise ValueError(f"Unsupported file extension: {path}")
55
+
56
+
57
def download(
    repo_id: str, path: str, revision=None, dst=None, host="hub.oxen.ai", scheme="https"
):
    """
    Download a file or directory from a remote Oxen repository.

    Args:
        repo_id: `str`
            The namespace/repo_name of the Oxen repository to download from.
        path: `str`
            The path of the data inside the repository.
        revision: `str | None`
            The commit id or branch name of the version of the data to download.
        dst: `str | None`
            The local path to download the data to.
        host: `str`
            The host to download the data from. (default: "hub.oxen.ai")
        scheme: `str`
            The scheme to download the data with. (default: "https")
    """
    # Build the remote handle and delegate the transfer to it in one step.
    RemoteRepo(repo_id, host=host, scheme=scheme).download(
        path, revision=revision, dst=dst
    )
80
+
81
+
82
def upload(
    repo_id: str, path: str, message: str, branch: Optional[str] = None, dst: str = ""
):
    """
    Upload a file or directory to a remote Oxen repository and commit it.

    Args:
        repo_id: `str`
            The namespace/repo_name of the Oxen repository to upload to.
        path: `str`
            The local path of the data to upload.
        message: `str`
            The commit message to use for the upload.
        branch: `str | None`
            The branch to check out before adding the data. If None, no
            checkout is performed (presumably the repository's default branch
            is used).
        dst: `str`
            The directory inside the repository to upload the data to.
            (default: "")

    Returns:
        The result of `RemoteRepo.commit` for the uploaded data.
    """
    remote = RemoteRepo(repo_id)

    # Only switch branches when the caller asked for a specific one.
    if branch is not None:
        remote.checkout(branch)

    remote.add(path, dst=dst)
    commit = remote.commit(message)
    return commit
oxen/df_utils.py ADDED
@@ -0,0 +1,54 @@
1
+ """
2
+ The `df_utils` module provides a consistent interface for loading data frames and saving them to disk.
3
+
4
+ Supported types: csv, parquet, json, jsonl, arrow
5
+
6
+ Example usage:
7
+
8
+ ```python
9
+ import os
10
+ from oxen import df_utils
11
+
12
+ # load a data frame
13
+ df = df_utils.load("path/to/data.csv")
14
+
15
+ # save a data frame
16
+ df_utils.save(df, "path/to/save.csv")
17
+ ```
18
+ """
19
+
20
+ from .oxen import df_utils
21
+
22
+ import os
23
+ from polars import DataFrame
24
+
25
+
26
def load(
    path: os.PathLike,
):
    """
    Read a file into a data frame, delegating to the native `df_utils.load`.

    The file format is inferred from the file extension.
    Supported types: csv, parquet, json, jsonl, arrow

    Args:
        path: `os.PathLike`
            The path to the file to read.

    Returns:
        The data frame produced by the native loader.
    """
    frame = df_utils.load(path)
    return frame
39
+
40
+
41
def save(
    data_frame: DataFrame,
    path: os.PathLike,
):
    """
    Write a data frame to disk, delegating to the native `df_utils.save`.

    The file format is inferred from the file extension.

    Args:
        data_frame: `DataFrame`
            The polars data frame to save.
        path: `os.PathLike`
            The path to save the data frame to.

    Returns:
        Whatever the native `df_utils.save` returns.
    """
    outcome = df_utils.save(data_frame, path)
    return outcome
oxen/diff/__init__.py ADDED
File without changes
@@ -0,0 +1,12 @@
1
+ from enum import Enum
2
+
3
+
4
class ChangeType(Enum):
    """
    An enum representing the type of change in a diff.

    Each member's value is the capitalized, human-readable label for that
    kind of change.
    """

    # Value strings are part of the public surface (e.g. `ChangeType.ADDED.value`);
    # keep them stable.
    ADDED = "Added"
    REMOVED = "Removed"
    MODIFIED = "Modified"
    UNCHANGED = "Unchanged"
oxen/diff/diff.py ADDED
@@ -0,0 +1,143 @@
1
+ """
2
+ Oxen can be used to compare data frames and return a tabular diff.
3
+
4
+ There is more information about the diff in the
5
+ [Diff Getting Started Documentation](/concepts/diffs).
6
+
7
+ For example comparing two data frames will give you an output data frame,
8
+ where the `.oxen.diff.status` column shows if the row was `added`, `removed`,
9
+ or `modified`.
10
+
11
+ ```
12
+ shape: (6, 7)
13
+ +-------------+-----+-----+-------+--------+-------------+-------------------+
14
+ | file | x | y | width | height | label.right | .oxen.diff.status |
15
+ | --- | --- | --- | --- | --- | --- | --- |
16
+ | str | i64 | i64 | i64 | i64 | str | str |
17
+ +-------------+-----+-----+-------+--------+-------------+-------------------+
18
+ | image_0.jpg | 0 | 0 | 10 | 10 | cat | modified |
19
+ | image_1.jpg | 1 | 2 | 10 | 20 | null | removed |
20
+ | image_1.jpg | 200 | 100 | 10 | 20 | dog | added |
21
+ | image_2.jpg | 4 | 10 | 20 | 20 | null | removed |
22
+ | image_3.jpg | 4 | 10 | 20 | 20 | dog | added |
23
+ | image_4.jpg | 10 | 10 | 10 | 10 | dog | added |
24
+ +-------------+-----+-----+-------+--------+-------------+-------------------+
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ import os
31
+ import oxen
32
+
33
+ result = oxen.diff("dataset_1.csv", "dataset_2.csv")
34
+ print(result.get())
35
+ ```
36
+
37
+ """
38
+
39
+ from ..oxen import PyDiff
40
+ from ..oxen import diff as py_diff
41
+
42
+ from oxen import df_utils
43
+ from oxen.diff.tabular_diff import TabularDiff
44
+ from oxen.diff.text_diff import TextDiff
45
+
46
+ import os
47
+ from typing import Optional
48
+
49
+
50
def diff(
    path: os.PathLike,
    to: Optional[os.PathLike] = None,
    repo_dir: Optional[os.PathLike] = None,
    revision_left: Optional[str] = None,
    revision_right: Optional[str] = None,
    output: Optional[os.PathLike] = None,
    keys: Optional[list[str]] = None,
    compares: Optional[list[str]] = None,
):
    """
    Compares data from two paths and returns a diff respecting the type of data.

    Args:
        path: `os.PathLike`
            The path to diff. If `to` is not provided,
            this will compare the data frame to the previous commit.
        to: `os.PathLike`
            An optional second path to compare to.
            If provided this will be the right side of the diff.
        repo_dir: `os.PathLike`
            The path to the oxen repository. Must be provided if `to` is
            not provided, or if `revision_left` or `revision_right` is provided.
            If not provided, the repository will be searched for in the current
            working directory.
        revision_left: `str`
            The left revision to compare. Can be a commit hash or branch name.
        revision_right: `str`
            The right revision to compare. Can be a commit hash or branch name.
        output: `os.PathLike`
            The path to save the diff to. If not provided, the diff will not be saved.
        keys: `list[str]`
            Only for tabular diffs. The keys to compare on.
            This is used to join the two data frames.
            Keys will be combined and hashed to create an identifier for each row.
            Defaults to no keys.
        compares: `list[str]`
            Only for tabular diffs. The columns whose values should be compared.
            Currently accepted for API compatibility but not forwarded to the
            native diff call.

    Returns:
        A `Diff` wrapping the native diff result.
    """
    # Use a fresh list per call rather than a shared mutable default argument.
    key_columns = [] if keys is None else keys

    result = py_diff.diff_paths(
        path, key_columns, to, repo_dir, revision_left, revision_right
    )

    if output:
        # NOTE(review): `result` is the raw native diff object, while
        # `df_utils.save` documents a polars DataFrame as its first argument.
        # Confirm this call works, or save the tabular data instead.
        df_utils.save(result, output)

    return Diff(result)
93
+
94
+
95
class Diff:
    """
    Uniform wrapper around the native diff result.

    A diff is currently either tabular or text; the wrapper exposes the
    concrete kind through `format` and hands out the matching typed view.
    Other kinds (images, audio, ...) may be added later.
    """

    def __init__(self, py_diff: PyDiff):
        # Keep a reference to the native object; every accessor reads from it.
        self._py_diff = py_diff

    def __repr__(self) -> str:
        return f"Diff(format={self.format})"

    @property
    def format(self) -> str:
        """
        The kind of diff being wrapped, e.g. "tabular" or "text".
        """
        return self._py_diff.format

    @property
    def tabular(self) -> Optional[TabularDiff]:
        """
        The tabular view of the diff, or None when the diff is not tabular.
        """
        if self.format != "tabular":
            return None
        return TabularDiff(self._py_diff.tabular)

    @property
    def text(self) -> Optional[TextDiff]:
        """
        The text view of the diff, or None when the diff is not text.
        """
        return TextDiff(self._py_diff.text) if self.format == "text" else None

    def get(self):
        """
        Return the typed diff object matching the diff's format.

        Raises:
            ValueError: If the format is neither "tabular" nor "text".
        """
        kind = self._py_diff.format

        if kind == "tabular":
            return TabularDiff(self._py_diff.tabular)
        if kind == "text":
            return TextDiff(self._py_diff.text)

        raise ValueError("The diff type is unknown.")
oxen/diff/line_diff.py ADDED
@@ -0,0 +1,41 @@
1
+ from ..oxen import PyLineDiff, PyChangeType
2
+
3
+ from oxen.diff.change_type import ChangeType
4
+
5
+
6
class LineDiff:
    """
    One line of a text diff: the line's text and how it changed.
    """

    def __init__(self, diff: PyLineDiff):
        # Native line-diff object; all accessors read through to it.
        self._diff = diff

    def __repr__(self) -> str:
        native = self._diff
        return f"LineDiff(modification={native.modification}, text={native.text})"

    @property
    def modification(self) -> ChangeType:
        """
        The kind of change for this line, translated from the native enum
        into `ChangeType`.

        Raises:
            ValueError: If the native value matches none of the known kinds.
        """
        native_kind = self._diff.modification

        # Compared with `==` one at a time (native constant on the left),
        # looking each native constant up only when it is needed.
        if PyChangeType.Added == native_kind:
            return ChangeType.ADDED
        if PyChangeType.Removed == native_kind:
            return ChangeType.REMOVED
        if PyChangeType.Modified == native_kind:
            return ChangeType.MODIFIED
        if PyChangeType.Unchanged == native_kind:
            return ChangeType.UNCHANGED

        raise ValueError(f"Invalid modification: {native_kind}")

    @property
    def text(self) -> str:
        """
        The text content of this line.
        """
        return self._diff.text
@@ -0,0 +1,22 @@
1
+ from ..oxen import PyTabularDiff
2
+
3
+ from polars import DataFrame
4
+
5
+
6
class TabularDiff:
    """
    Wraps a native tabular diff and exposes it as a polars data frame.
    """

    def __init__(self, diff: PyTabularDiff):
        # Native tabular diff; `data` reads the frame from it on demand.
        self._diff = diff

    def __repr__(self) -> str:
        frame = self._diff.data
        return f"TabularDiff(shape={frame.shape})\n\n{frame}"

    @property
    def data(self) -> DataFrame:
        """
        The diff contents as a polars data frame.
        """
        return self._diff.data
oxen/diff/text_diff.py ADDED
@@ -0,0 +1,48 @@
1
+ from ..oxen import PyTextDiff, PyChangeType
2
+
3
+ from oxen.diff.line_diff import LineDiff
4
+
5
+
6
class TextDiff:
    """
    A class representing a text diff: an ordered collection of line changes.
    """

    def __init__(self, diff: PyTextDiff):
        # Native text diff; every accessor reads its `lines` attribute.
        self._diff = diff

    def __repr__(self) -> str:
        return f"TextDiff(num_added={self.num_added}, num_removed={self.num_removed})"

    def __str__(self) -> str:
        """
        Returns the diff as one string, one native line per row.
        """
        # NOTE(review): this reads `line.value`, while the rest of the package
        # only uses `.text` and `.modification` on native lines. Confirm that
        # `value` exists; no +/- prefix is added here.
        return "\n".join(f"{line.value}" for line in self._diff.lines)

    @property
    def num_added(self) -> int:
        """
        Returns the number of added lines in the diff.
        """
        return self._count_lines(PyChangeType.Added)

    @property
    def num_removed(self) -> int:
        """
        Returns the number of removed lines in the diff.
        """
        return self._count_lines(PyChangeType.Removed)

    @property
    def lines(self) -> list[LineDiff]:
        """
        Returns the lines of the diff, in order, as a list of `LineDiff` objects.
        """
        # Wrap each native line so callers get the Python-level API.
        return [LineDiff(line) for line in self._diff.lines]

    def _count_lines(self, modification: PyChangeType) -> int:
        """
        Counts the native lines whose modification equals `modification`.
        """
        # Count lazily instead of materializing a throwaway list.
        return sum(
            1 for line in self._diff.lines if line.modification == modification
        )
oxen/features.py ADDED
@@ -0,0 +1,58 @@
1
+ from enum import Enum
2
+
3
+
4
class Feature:
    """
    A feature is a column in a dataset.
    It can be numeric, tabular, text, image, audio, or video.

    The class attributes below are the available type tags; pass one of them
    as `dtype` when describing a column, e.g. `Feature("caption", Feature.TEXT)`.

    This is deliberately a plain class rather than an `Enum`. An `Enum` passes
    each member's value to `__init__`, so integer-valued members combined with
    a two-argument `__init__(name, dtype)` raise a `TypeError` while the class
    is being created, which made the module impossible to import.

    Parameters
    ----------
    name: str
        The column name
    dtype: One of: Feature.NUMERIC, Feature.TABULAR, Feature.TEXT,
        Feature.IMAGE, Feature.AUDIO, Feature.VIDEO
    """

    # Type tags (values unchanged from the original declaration).
    NUMERIC = 1
    TABULAR = 2
    TEXT = 3
    IMAGE = 4
    AUDIO = 5
    VIDEO = 6

    def __init__(self, name, dtype):
        self._name = name
        self._dtype = dtype

    def __repr__(self) -> str:
        return f"Feature(name={self._name!r}, dtype={self._dtype!r})"

    @property
    def name(self) -> str:
        """The column name."""
        return self._name

    @property
    def dtype(self) -> int:
        """The type tag given at construction (one of the class constants)."""
        return self._dtype
34
+
35
+
36
class Features:
    """
    Features is a class that groups the columns you want to
    load from a dataset, for example its input and output
    columns.
    """

    def __init__(self, features: list[Feature]):
        """
        Create a set of features from a list of columns.

        Parameters
        ----------
        features : list[Feature]
            The columns to load from the dataset, and their respective types.
        """
        self.features = features

    def feature_names(self) -> list[str]:
        """
        Returns the `name` of every feature, in the order they were given.
        """
        names = []
        for column in self.features:
            names.append(column.name)
        return names
oxen/fs.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+ from oxen import Repo
3
+
4
+
5
def rcount_files_in_dir(directory: str) -> int:
    """
    Counts the number of files in a directory recursively.

    Parameters
    ----------
    directory : str
        The directory to count the number of files in.

    Returns
    -------
    int
        The number of files found under `directory`, at any depth.
    """
    count = 0
    for _root, _subdirs, filenames in os.walk(directory):
        count += len(filenames)
    return count
15
+
16
+
17
def rcount_files_in_dir_ignore_oxen(directory: str) -> int:
    """
    Counts the number of files in a directory recursively, ignoring any
    `.oxen` directory found beneath it.

    Only sub-directories named exactly `.oxen` are skipped. Directories whose
    name merely contains that text (e.g. `data.oxen_backup`) are counted, and
    the location of `directory` itself does not affect the result.

    Parameters
    ----------
    directory : str
        The directory to count the number of files in.

    Returns
    -------
    int
        The number of files under `directory`, excluding everything inside
        `.oxen` sub-directories.
    """
    total = 0
    for _root, dirnames, filenames in os.walk(directory):
        # Prune in place so os.walk never descends into `.oxen`. Matching on
        # the directory name, not on a substring of the full path, avoids
        # skipping unrelated paths that happen to contain ".oxen".
        dirnames[:] = [name for name in dirnames if name != ".oxen"]
        total += len(filenames)
    return total
32
+
33
+
34
def rcount_files_in_repo(repo: Repo) -> int:
    """
    Recursively counts the files under a repo's path, ignoring the .oxen directory.

    Parameters
    ----------
    repo : Repo
        The repository to count the number of files in.
    """
    repo_root = repo.path
    return rcount_files_in_dir_ignore_oxen(repo_root)
44
+
45
+
46
def rcount_files_in_repo_dir(repo: Repo, directory: str) -> int:
    """
    Recursively counts the files in a directory within a repo,
    ignoring the .oxen directory.

    Parameters
    ----------
    repo : Repo
        The repository that contains the directory.
    directory : str
        The directory to start the count in, relative to the repo.
    """
    start = os.path.join(repo.path, directory)
    return rcount_files_in_dir_ignore_oxen(start)
oxen/init.py ADDED
@@ -0,0 +1,19 @@
1
+ from oxen.repo import Repo
2
+
3
+
4
def init(
    path: str = "./",
):
    """
    Create a [Repo](/python-api/repo) for the given path and initialize it.

    Args:
        path: `str`
            The path to initialize the repo at. (default: "./")
    Returns:
        [Repo](/python-api/repo)
            The result of `Repo.init()`, which can be used to interact with
            the repo.
    """
    # Construct the handle and initialize it in one expression.
    return Repo(path).init()
Binary file