oxenai 0.39.1__cp313-cp313-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oxenai might be problematic. Click here for more details.

oxen/repo.py ADDED
@@ -0,0 +1,239 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ from oxen import PyRepo
5
+
6
+
7
+ class Repo:
8
+ """
9
+ The Repo class that allows you to interact with your local oxen repo.
10
+
11
+ ## Examples
12
+
13
+ ### Init, Add, Commit and Push
14
+
15
+ Adding and committing a file to a remote workspace.
16
+
17
+ ```python
18
+ import os
19
+ from oxen import Repo
20
+
21
+ # Initialize the Oxen Repository in a CatsAndDogs directory
22
+ directory = "CatsAndDogs"
23
+ repo = Repo(directory)
24
+ repo.init()
25
+ repo.add("images")
26
+ repo.commit("Adding all the images")
27
+ # Replace <namespace> and <repo_name> with your values
28
+ repo.set_remote("origin", "https://hub.oxen.ai/<namespace>/<repo_name>")
29
+ repo.push()
30
+ ```
31
+ """
32
+
33
+ def __init__(self, path: str = "", mkdir=False):
34
+ """
35
+ Create a new Repo object. Use .init() to initialize a new oxen repository,
36
+ or pass the path to an existing one.
37
+
38
+ Args:
39
+ path: `str`
40
+ Path to the main working directory of your oxen repo.
41
+ mkdir: `bool`
42
+ Whether to create the directory if one doesn't exist. Default: False
43
+ """
44
+ # Check if the path exists, and convert to absolute path
45
+ if path:
46
+ path = os.path.abspath(path)
47
+ if not os.path.exists(path) and mkdir:
48
+ os.makedirs(path)
49
+
50
+ self._repo = PyRepo(path)
51
+
52
+ def __repr__(self):
53
+ return f"Repo({self.path})"
54
+
55
+ def init(self):
56
+ """
57
+ Initializes a new oxen repository at the path specified in the constructor.
58
+ Will create a .oxen folder to store all the versions and metadata.
59
+ """
60
+ self._repo.init()
61
+ return self
62
+
63
+ def clone(
64
+ self,
65
+ url: str,
66
+ branch: str = "main",
67
+ all=False,
68
+ filters: Optional[str | list[str]] = None,
69
+ ):
70
+ """
71
+ Clone repository from a remote url.
72
+
73
+ Args:
74
+ url: `str`
75
+ The url of the remote repository. ex) https://hub.oxen.ai/ox/chatbot
76
+ branch: `str`
77
+ The name of the branch to clone. Default: main
78
+ all: `bool`
79
+ Whether to clone the full commit history or not. Default: False
80
+ filters: `str | list[str] | None`
81
+ Filter down the set of directories you want to clone. Useful if
82
+ you have a large repository and only want to make changes to a
83
+ specific subset of files. Default: None
84
+ """
85
+ if isinstance(filters, str):
86
+ filters = [filters]
87
+ return self._repo.clone(url, branch, all, filters)
88
+
89
+ def branches(self):
90
+ """
91
+ List all branches for a repo
92
+ """
93
+ return self._repo.list_branches()
94
+
95
+ def branch(self, name: str, delete=False):
96
+ """ """
97
+ return self._repo.branch(name, delete)
98
+
99
+ def branch_exists(self, name: str):
100
+ """ """
101
+ return self._repo.branch_exists(name)
102
+
103
+ def checkout(self, revision: str, create=False):
104
+ """
105
+ Checkout a branch or commit id.
106
+
107
+ Args:
108
+ revision: `str`
109
+ The name of the branch or commit id to checkout.
110
+ create: `bool`
111
+ Whether to create a new branch if it doesn't exist. Default: False
112
+ """
113
+ self._repo.checkout(revision, create)
114
+
115
+ def add(self, path: str):
116
+ """
117
+ Stage a file or directory to be committed.
118
+ """
119
+ # Check if the path exists
120
+ if not os.path.exists(path):
121
+ # try repo.path + path
122
+ path = os.path.join(self.path, path)
123
+
124
+ # Convert to absolute path before adding
125
+ path = os.path.abspath(path)
126
+ if not os.path.exists(path):
127
+ raise Exception(f"Path {path} does not exist.")
128
+
129
+ self._repo.add(path)
130
+
131
+ def add_schema_metadata(self, path: str, column_name: str, metadata: str):
132
+ """
133
+ Add schema to the local repository
134
+ """
135
+ self._repo.add_schema_metadata(path, column_name, metadata)
136
+
137
+ def rm(self, path: str, recursive=False, staged=False):
138
+ """
139
+ Remove a file or directory from being tracked.
140
+ This will not delete the file or directory.
141
+
142
+ Args:
143
+ path: `str`
144
+ The path to the file or directory to remove.
145
+ recursive: `bool`
146
+ Whether to remove the file or directory recursively. Default: False
147
+ staged: `bool`
148
+ Whether to remove the file or directory from the staging area.
149
+ Default: False
150
+ remote: `bool`
151
+ Whether to remove the file or directory from a remote workspace.
152
+ Default: False
153
+ """
154
+ self._repo.rm(path, recursive, staged)
155
+
156
+ def status(self):
157
+ """
158
+ Check the status of the repo. Returns a StagedData object.
159
+ """
160
+ return self._repo.status()
161
+
162
+ def commit(self, message: str):
163
+ """
164
+ Commit the staged data in a repo with a message.
165
+
166
+ Args:
167
+ message: `str`
168
+ The commit message.
169
+ """
170
+ return self._repo.commit(message)
171
+
172
+ def log(self):
173
+ """
174
+ Get the commit history for a repo.
175
+ """
176
+ return self._repo.log()
177
+
178
+ def set_remote(self, name: str, url: str):
179
+ """
180
+ Map a name to a remote url.
181
+
182
+ Args:
183
+ name: `str`
184
+ The name of the remote. Ex) origin
185
+ url: `str`
186
+ The url you want to map the name to. Ex) https://hub.oxen.ai/ox/chatbot
187
+ """
188
+ self._repo.set_remote(name, url)
189
+
190
+ def create_remote(self, name: str):
191
+ self._repo.create_remote(name)
192
+
193
+ def push(
194
+ self, remote_name: str = "origin", branch: str = "main", delete: bool = False
195
+ ):
196
+ """
197
+ Push data to a remote repo from a local repo.
198
+
199
+ Args:
200
+ remote_name: `str`
201
+ The name of the remote to push to.
202
+ branch: `str`
203
+ The name of the branch to push to.
204
+ """
205
+ return self._repo.push(remote_name, branch, delete)
206
+
207
+ def pull(self, remote_name: str = "origin", branch: str = "main", all=False):
208
+ """
209
+ Pull data from a remote repo to a local repo.
210
+
211
+ Args:
212
+ remote_name: `str`
213
+ The name of the remote to pull from.
214
+ branch: `str`
215
+ The name of the branch to pull from.
216
+ all: `bool`
217
+ Whether to pull all data from branch history or not. Default: False
218
+ """
219
+ return self._repo.pull(remote_name, branch, all)
220
+
221
+ @property
222
+ def path(self):
223
+ """
224
+ Returns the path to the repo.
225
+ """
226
+ return self._repo.path()
227
+
228
+ @property
229
+ def current_branch(self):
230
+ """
231
+ Returns the current branch.
232
+ """
233
+ return self._repo.current_branch()
234
+
235
+ def merge(self, branch: str):
236
+ """
237
+ Merge a branch into the current branch.
238
+ """
239
+ return self._repo.merge(branch)
@@ -0,0 +1,242 @@
1
+ from oxen.providers.dataset_path_provider import DatasetPathProvider
2
+ from oxen.providers.oxen_data_frame_provider import OxenDataFrameProvider
3
+ from oxen import RemoteRepo
4
+
5
+ from typing import List, Union, Optional
6
+ from collections import deque
7
+ from tqdm import tqdm
8
+
9
+ import threading
10
+ import time
11
+ import os
12
+
13
+
14
def load_dataset(
    repo: Union[RemoteRepo, str],
    paths: Optional[Union[str, List[str]]] = None,
    directory: Optional[str] = None,
    features: Optional[List[str]] = None,
    host: Optional[str] = None,
):
    """
    Build a StreamingDataset over data frame files stored in a repo.

    Parameters
    ----------
    repo : RemoteRepo | str
        The oxen repository you are loading data from. A string is turned
        into a repo object via ``RemoteRepo(repo, host=host)``.
    paths : str | List[str] | None
        A path or set of paths to the data files needed to load the dataset.
        All paths must be data frames. A single string is treated as a
        one-element list. Ignored when `directory` is given.
    directory : str | None
        The directory to stream the data from.
        Must be a directory of files with type data frame.
        When provided, the directory listing replaces `paths`.
        (default: None)
    features : List[str] | None
        The columns of the dataset (default: None)
    host : str | None
        Only used when `repo` is a string; forwarded to `RemoteRepo`.
        (default: None)

    Returns
    -------
    StreamingDataset
        A dataset backed by an OxenDataFrameProvider for the resolved paths.

    Raises
    ------
    ValueError
        If neither `paths` nor `directory` is supplied.
    """
    if isinstance(repo, str):
        repo = RemoteRepo(repo, host=host)

    if directory is not None:
        # The directory listing wins over any explicitly passed paths;
        # each entry's filename is prefixed with the directory.
        entries = repo.ls(directory)
        paths = [os.path.join(directory, entry.filename) for entry in entries]
    elif isinstance(paths, str):
        paths = [paths]

    if paths is None:
        raise ValueError("Must provide either paths or directory")

    return StreamingDataset(OxenDataFrameProvider(repo, paths, features), features)
60
+
61
+
62
class StreamingDataset:
    """
    StreamingDataset object constructs a dataset from a remote repo.
    It can be used to load data into a dataloader.

    Rows are pre-fetched in chunks by a daemon thread into a bounded queue of
    in-memory buffers and handed out strictly in order: `__getitem__` only
    uses its index for a bounds check and always returns the next unread row.
    The stream is single-pass.
    """

    def __init__(
        self,
        provider: DatasetPathProvider,
        features=None,
        num_buffers=3,
        buffer_size=128,
        sleep_interval=0.1,
    ):
        """
        Create a new StreamingDataset and start pre-fetching in the background.

        Parameters
        ----------
        provider : DatasetPathProvider
            The implementation of fetching data from a path and index.
            This class reads `provider.paths`, calls `provider.size(path)`
            (indexed as ``[0]`` for width and ``[1]`` for height) and
            `provider.slice(path, start, end)`.
        features : List[str] | None
            The features of the dataset, columns, dtypes, etc. When given,
            each returned row is reduced to a dict of just these keys.
        num_buffers : int
            Maximum number of pre-fetched buffers held in memory at once.
        buffer_size : int
            Number of rows requested from the provider per buffer.
        sleep_interval : float
            Seconds to sleep while waiting for a buffer to fill or free up.
        """
        self._provider = provider
        self._features = features

        # Get the paths from the provider
        self._paths = provider.paths

        # Compute overall size of the dataset
        print(f"Computing dataset size for {len(self._paths)} files...")
        self._path_sizes = [self._provider.size(path) for path in tqdm(self._paths)]

        # Cumulative sum of the row counts, one entry per path
        self._culm_sizes = self._cumulative_heights(self._path_sizes)

        # Width comes from the requested features, otherwise from the first
        # file; an empty path list yields a 0-wide, 0-high dataset.
        if self._features is not None:
            width = len(self._features)
        elif self._path_sizes:
            width = self._path_sizes[0][0]
        else:
            width = 0
        height = self._culm_sizes[-1] if self._culm_sizes else 0
        self._size = width, height
        print(f"Dataset size {self._size}")

        # We are going to use a set of in memory buffers to pre-fetch data
        # from the API. This is to avoid having to make a request for every
        # row we want to load.
        # n_buffers is how many slices ahead we will load into memory
        self._n_buffers = num_buffers
        self._buffers = deque([])

        # Which path file we are on
        self._path_idx = 0

        # How far into the whole dataset we have fetched
        self._fetch_idx = 0

        # How far into the current buffer we have read
        self._buffer_idx = 0

        # we will fetch the data in chunks of this size
        self._buffer_size = buffer_size

        # Fill the buffers with data
        # * kick off background thread to fill the buffers
        # * then wait until a buffer frees up to fetch the next one
        self._sleep_interval = sleep_interval  # seconds
        # Keep a handle so readers can tell when no more data will arrive.
        self._thread = threading.Thread(target=self._start_bg_collection, args=())
        self._thread.daemon = True
        self._thread.start()

    @staticmethod
    def _cumulative_heights(path_sizes):
        """Return the running total of ``size[1]`` over `path_sizes`."""
        totals = []
        running = 0
        for size in path_sizes:
            running += size[1]
            totals.append(running)
        return totals

    def __repr__(self):
        return f"StreamingDataset({self._provider}, {self._paths})"

    def __str__(self):
        return f"StreamingDataset({self._provider}, {self._paths})"

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    # Total abstracted size of the dataset as (width, height)
    @property
    def size(self):
        return self._size

    # Number of rows, for iterating over the dataset
    def __len__(self):
        return self._size[1]

    def __getitem__(self, idx):
        """
        Return the next unread row of the stream.

        `idx` is only checked against the dataset height; rows are always
        served in fetch order regardless of its value.

        Raises
        ------
        IndexError
            If `idx` is greater than or equal to the number of rows.
        RuntimeError
            If the background fetch has finished and no unread rows remain.
        """
        if idx >= self._size[1]:
            raise IndexError(
                f"Index {idx} out of range for dataset of size {self._size}"
            )

        self._wait_for_row()
        row = self._buffers[0][self._buffer_idx]

        # extract the features from the row if there are some
        if self._features is None:
            item = row
        else:
            item = {feature: row[feature] for feature in self._features}

        # Only advance once the row was read successfully
        self._buffer_idx += 1

        return item

    def _wait_for_row(self):
        """Block until the front buffer holds an unread row at `_buffer_idx`."""
        while True:
            # Sample liveness BEFORE inspecting the buffers: if the producer
            # was already finished here, everything it appended is visible
            # to the checks below.
            producer_alive = self._thread.is_alive()

            if self._buffers and self._buffer_idx < len(self._buffers[0]):
                return

            # The front buffer is used up and a newer one is queued: drop it
            # and start reading the next one. This must happen inside the
            # wait loop, otherwise an exhausted front buffer is never
            # replaced and the reader waits forever.
            if len(self._buffers) > 1:
                self._buffers.popleft()
                self._buffer_idx = 0
                continue

            if not producer_alive:
                raise RuntimeError(
                    "StreamingDataset has no more buffered rows and the "
                    "background fetch has finished"
                )

            # We will be filling this in a background thread
            time.sleep(self._sleep_interval)

    def _start_bg_collection(self):
        """Producer loop (run in the background thread) that fills buffers."""
        while True:
            if self._path_idx >= len(self._paths):
                # We have exhausted all the paths
                print("No more paths to fetch")
                return

            if len(self._buffers) < self._n_buffers:
                self._buffers.append(self._fetch_next_buffer())
            else:
                # All buffer slots are full; wait for the reader to free one
                time.sleep(self._sleep_interval)

    def _fetch_next_buffer(self):
        """Fetch the next chunk of rows from the provider and advance state."""
        path_idx = self._path_idx
        path = self._paths[path_idx]
        start = self._fetch_idx
        end = self._fetch_idx + self._buffer_size

        # `_fetch_idx` counts rows across all paths; convert it to an offset
        # within the current path by subtracting the rows of earlier paths.
        if path_idx > 0:
            culm_size = self._culm_sizes[path_idx - 1]
            start = start - culm_size
            end = end - culm_size

        buffer = self._provider.slice(path, start, end)
        self._fetch_idx += len(buffer)

        if len(buffer) == 0:
            # The provider returned nothing although rows were expected.
            # Treat the path as exhausted so we do not request the same
            # empty range forever.
            self._fetch_idx = max(self._fetch_idx, self._culm_sizes[path_idx])

        # If we have exhausted the current path, move to the next one
        if self._fetch_idx >= self._culm_sizes[path_idx]:
            self._path_idx += 1

        return buffer
oxen/user.py ADDED
@@ -0,0 +1,40 @@
1
+ from .oxen import user, util
2
+ from typing import Optional
3
+ import os
4
+
5
+
6
def config_user(name: str, email: str, path: Optional[str] = None):
    """
    Write the user's name and email to a user config file.

    Args:
        name: `str`
            The name to use for user.
        email: `str`
            The email to use for user.
        path: `Optional[str]`
            The path to save the user config to. When omitted, the file
            `user_config.toml` inside the directory returned by
            `util.get_oxen_config_dir()` is used.

    Returns:
        Whatever the underlying `user.config_user` call returns.

    Raises:
        ValueError: If the resolved path does not end with `.toml`.
    """
    config_path = path
    if config_path is None:
        config_dir = util.get_oxen_config_dir()
        config_path = os.path.join(config_dir, "user_config.toml")

    if config_path.endswith(".toml"):
        return user.config_user(name, email, config_path)

    raise ValueError(f"Path {config_path} must end with .toml")
25
+
26
+
27
def current_user(path: Optional[str] = None):
    """
    Load the current user from a user config file.

    Args:
        path: `Optional[str]`
            The path to load the user config from. When omitted, the file
            `user_config.toml` inside the directory returned by
            `util.get_oxen_config_dir()` is used.

    Returns:
        Whatever the underlying `user.current_user` call returns.

    Raises:
        ValueError: If the resolved path does not end with `.toml`.
    """
    config_path = path
    if config_path is None:
        config_dir = util.get_oxen_config_dir()
        config_path = os.path.join(config_dir, "user_config.toml")

    if config_path.endswith(".toml"):
        return user.current_user(config_path)

    raise ValueError(f"Path {config_path} must end with .toml")
oxen/util/__init__.py ADDED
File without changes