oxenai 0.39.1__cp313-cp313-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oxenai might be problematic. Click here for more details.

oxen/oxen_fs.py ADDED
@@ -0,0 +1,351 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import tempfile
6
+ from typing import Optional
7
+
8
+ import fsspec
9
+ from fsspec.utils import infer_storage_options
10
+
11
+ from .remote_repo import RemoteRepo
12
+ from .oxen import PyEntry
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class OxenFS(fsspec.AbstractFileSystem):
    """
    OxenFS is a filesystem interface for Oxen repositories that implements the
    [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) protocol. This
    allows you to interact with Oxen repositories using familiar filesystem
    operations and integrate with other compatible libraries like Pandas.

    ## Basic Usage

    ### Creating a Filesystem Instance

    ```python
    import oxen

    # For Oxen Hub repositories
    fs = oxen.OxenFS("ox", "Flowers")

    # For local oxen-server
    fs = oxen.OxenFS("ox", "test-repo", host="localhost:3000", scheme="http")
    ```

    ### Reading Files

    ```python
    with fs.open("data/train.csv") as f:
        content = f.read()
    ```

    ### Writing Files

    You must have write access to the repository to write files. See:
    https://docs.oxen.ai/getting-started/python#private-repositories

    OxenFS will automatically commit the file to the repository when the
    context is exited (or the file is closed some other way). New
    directories are automatically created as needed.

    ```python
    # Write with custom commit message
    with fs.open("data/test.txt", mode="wb", commit_message="Added test.txt") as f:
        f.write("Hello, world!")

    # You can also set/update the commit message inside the context
    with fs.open("data/test.txt", mode="wb") as f:
        f.commit_message = "Updated test.txt"
        f.write("Hello, world again!")
    ```

    #### Writing file objects

    If you're integrating Oxen in a situation where you already have a file object,
    you can save it to your repo by using `shutil.copyfileobj` like this:

    ```python
    import shutil

    file_object_from_somewhere = open("data.csv")

    with fs.open("train/data.csv", mode="wb") as output_file:
        output_file.commit_message = "Copy from a file object"
        shutil.copyfileobj(file_object_from_somewhere, output_file)
    ```

    ## Integration with Third Party Libraries (Pandas, etc.)

    OxenFS works seamlessly with Pandas and other fsspec-compatible libraries using
    the URL format: `oxen://namespace:repo@revision/path/to/file`

    ### Reading Data

    These will work with Pandas `{to,from}_{csv,parquet,json,etc.}` functions.

    ```python
    import pandas as pd

    # Read parquet directly from Oxen repository
    df = pd.read_parquet("oxen://openai:gsm8k@main/gsm8k_test.parquet")
    ```

    ### Writing Data

    ```python
    # Write DataFrame directly to Oxen repository
    df.to_csv("oxen://ox:my-repo@main/data/test.csv", index=False)
    ```

    ## Notes
    - Only binary read ("rb") and write ("wb") modes are currently supported
        - But writing will automatically encode strings to bytes
    - Does not yet support streaming files. All operations use temporary local files.
    """

    def __init__(
        self,
        namespace: str,
        repo: str,
        host: str = "hub.oxen.ai",
        revision: str = "main",
        scheme: str = "https",
        **kwargs,
    ):
        """
        Initialize the OxenFS instance.

        Args:
            namespace: `str`
                The namespace of the repository.
            repo: `str`
                The name of the repository.
            host: `str`
                The host to connect to. Defaults to 'hub.oxen.ai'
            revision: `str`
                The branch name or commit id to checkout. Defaults to 'main'
            scheme: `str`
                The scheme to use for the remote url. Default: 'https'

        Raises:
            ValueError: If the repository does not exist on `host`.
        """
        super().__init__(**kwargs)
        self.namespace = namespace
        self.repo_name = repo
        self.revision = revision
        self.scheme = scheme
        self.host = host
        self.repo = RemoteRepo(f"{namespace}/{repo}", host, revision, scheme)
        if not self.repo.exists():
            raise ValueError(f"Repo {namespace}/{repo} not found on host {host}")
        logger.debug(f"Initialized OxenFS for {namespace}/{repo}@{revision} on {host}")

    def __repr__(self):
        return f"OxenFS(namespace='{self.namespace}', repo='{self.repo_name}', revision='{self.revision}', host='{self.host}', scheme='{self.scheme}')"

    def exists(self, path: str) -> bool:
        """Return True when the repo reports metadata for `path`."""
        return self.repo.metadata(path) is not None

    def isfile(self, path: str) -> bool:
        """Return True when `path` has metadata and is not a directory."""
        metadata = self.repo.metadata(path)
        return metadata is not None and not metadata.is_dir

    def isdir(self, path: str) -> bool:
        """Return True when `path` has metadata and is a directory."""
        metadata = self.repo.metadata(path)
        return metadata is not None and metadata.is_dir

    def ls(self, path: str = "", detail: bool = False):
        """
        List the contents of a directory.

        Args:
            path: `str`
                The path to list the contents of.
            detail: `bool`
                If True, return a list of dictionaries with detailed metadata.
                Otherwise, return a list of strings with the filenames.

        Returns:
            One entry per child when `path` is a directory, a single entry
            when it is a file, and an empty list when no metadata is found.
        """
        logger.debug(f"OxenFS.ls: '{path}'")
        metadata = self.repo.metadata(path)
        if not metadata:
            return []
        # A file is listed as itself; a directory as its children.
        entries = self.repo.ls(path) if metadata.is_dir else [metadata]
        return [self._metadata_entry_to_ls_entry(entry, detail) for entry in entries]

    @staticmethod
    def _metadata_entry_to_ls_entry(entry: PyEntry, detail: bool = False):
        """Convert a repo entry to its path, or to a detail dict when `detail` is set."""
        if not detail:
            return entry.path
        return {
            "name": entry.path,
            "type": "directory" if entry.is_dir else "file",
            "size": entry.size,
            "hash": entry.hash,
        }

    def _open(self, path: str, mode: str = "rb", **kwargs):
        """
        Open a file in the OxenFS backend.

        This is normally called through `OxenFS.open()` or `fsspec.open()`.

        Raises:
            ValueError: If `mode` is anything other than "rb" or "wb".
        """
        if mode == "rb":
            return self._open_read(path, **kwargs)
        if mode == "wb":
            return self._open_write(path, **kwargs)
        raise ValueError("Unsupported file mode. Only rb and wb modes are supported")

    def _open_read(self, path: str, **kwargs):
        """
        Download `path` into a temporary file and return it opened for binary reading.

        Raises:
            FileNotFoundError: If the repo has no metadata for `path`.
            ValueError: If `path` is a directory.
        """
        logger.debug(f"Opening file {path} for reading")
        metadata = self.repo.metadata(path)
        if metadata is None:
            # Fail with the conventional filesystem error instead of an
            # AttributeError on `None.is_dir`.
            raise FileNotFoundError(path)
        if metadata.is_dir:
            raise ValueError("Cannot open directories")
        tmp_file = tempfile.NamedTemporaryFile()
        try:
            dst_path = tmp_file.name
            self.repo.download(path, dst_path)
            logger.debug(f"Downloaded file {path} to temp file {dst_path}")
            return open(dst_path, "rb")
        finally:
            # Remove the temp path deterministically instead of waiting for
            # garbage collection. On POSIX the handle returned above stays
            # readable after the path is unlinked.
            tmp_file.close()

    def _open_write(
        self,
        path: str,
        commit_message: Optional[str] = None,
        **kwargs,
    ):
        """
        Return an `OxenFSFileWriter` that uploads to `path` when it is closed.

        Args:
            path: Destination path inside the repository; it is normalized first.
            commit_message: Optional message forwarded to the writer.

        Raises:
            ValueError: If the file name is empty, or the parent path exists
                but is a file.
        """
        path = os.path.normpath(path)
        logger.debug(f"Opening file {path} for writing")
        target_dir = os.path.dirname(path)
        file_name = os.path.basename(path).strip()
        if file_name in ("", "."):
            raise ValueError("File name cannot be empty")
        try:
            metadata = self.repo.metadata(target_dir)
        except ValueError as e:
            if "not found" not in str(e):
                raise
            # If the directory does not exist, it will be created on the server
            metadata = None
        if metadata and not metadata.is_dir:
            raise ValueError("target_dir cannot be an existing file")

        return OxenFSFileWriter(self.repo, file_name, target_dir, commit_message)

    @classmethod
    def _strip_protocol(cls, path):
        """
        Reduce an `oxen://namespace:repo@revision/path` URL to its in-repo path.

        Paths without a username component are delegated to the fsspec base
        implementation. Lists are handled element-wise, as the base class does.
        """
        if isinstance(path, list):
            return [cls._strip_protocol(p) for p in path]
        opts = infer_storage_options(path)
        if "username" not in opts:
            return super()._strip_protocol(path)
        return opts["path"].lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """
        Extract constructor kwargs from an `oxen://namespace:repo@revision/path` URL.

        The URL's username is the namespace, its password is the repo name and
        its host is the revision. Returns an empty dict when there is no
        username component.

        Raises:
            ValueError: If a namespace is given without the `:repo` part.
        """
        opts = infer_storage_options(path)
        if "username" not in opts:
            return {}
        repo_name = opts.get("password")
        if not repo_name:
            # infer_storage_options omits "password" when the URL has no ":repo".
            raise ValueError(
                f"Invalid Oxen URL '{path}': expected "
                "oxen://namespace:repo@revision/path/to/file"
            )
        out = {"namespace": opts["username"], "repo": repo_name}
        if opts.get("host"):
            out["revision"] = opts["host"]
        return out
260
+
261
+
262
class OxenFSFileWriter:
    """
    A file writer for the OxenFS backend.

    This is normally called through `OxenFS.open()` or `fsspec.open()`.

    Data is buffered in a local temporary file and uploaded to the remote
    repo as a single commit when the writer is closed. The writer exposes
    `readable()`, `writable()` and `seekable()` so it can be wrapped by
    `io.TextIOWrapper`.
    """

    def __init__(
        self,
        repo: RemoteRepo,
        path: str,
        target_dir: str = "",
        commit_message: Optional[str] = None,
    ):
        """
        Args:
            repo: Remote repository the file is uploaded to.
            path: File name to create inside `target_dir`.
            target_dir: Destination directory in the repository.
            commit_message: Commit message; a generic one is used when omitted.
        """
        self.repo = repo
        self.path = path
        self.commit_message = commit_message or "Auto-commit from OxenFS"
        self.target_dir = target_dir
        self._tmp_file = tempfile.NamedTemporaryFile()
        self.closed = False
        logger.debug(f"Initialized OxenFSFileWriter for {path} in '{target_dir}'")

    def __enter__(self) -> OxenFSFileWriter:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is not None:
            logger.error(
                f"Error writing to {self.repo} {self.path}: {exc_type} {exc_value} {traceback}"
            )

        self.close()
        # Don't suppress exceptions
        return False

    def readable(self) -> bool:
        """Return False: the writer is write-only."""
        return False

    def writable(self) -> bool:
        """Return True: the writer accepts `write()` calls."""
        return True

    def seekable(self) -> bool:
        """Return True: `seek()` and `tell()` work on the backing temp file."""
        return True

    def write(self, data: str | bytes) -> int:
        """
        Write string or binary data to the file.

        Strings are encoded as UTF-8 first.

        Returns:
            The number of bytes written.
        """
        if isinstance(data, str):
            data = data.encode("utf-8")
        return self._tmp_file.write(data)

    def flush(self):
        """
        Flush the file to disk.
        """
        self._tmp_file.flush()

    def tell(self):
        """
        Return the current position of the file.
        """
        return self._tmp_file.tell()

    def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
        """
        Seek to a specific position in the file.

        Returns:
            The new absolute position, as standard file objects do.
        """
        return self._tmp_file.seek(offset, whence)

    def commit(self, commit_message: Optional[str] = None):
        """
        Commit the file to the remote repo.

        Args:
            commit_message: Overrides `self.commit_message` for this upload.
        """
        logger.debug(f"Committing file {self.path} to dir '{self.target_dir}'")
        self.repo.upload(
            self._tmp_file.name,
            commit_message=commit_message or self.commit_message,
            file_name=self.path,
            dst_dir=self.target_dir,
        )
        logger.info(f"Committed file {self.path} to dir '{self.target_dir}'")

    def close(self):
        """
        Close the file writer. This will commit the file to the remote repo.

        Calling `close()` again after a successful close does nothing. If the
        upload raises, the writer stays open so the caller may retry.
        """
        if self.closed:
            return
        logger.debug(
            f"Closing OxenFSFileWriter for {self.path} in dir '{self.target_dir}'"
        )
        # Flush first so the upload reads the complete contents from disk.
        self.flush()
        self.commit()
        self._tmp_file.close()
        self.closed = True
        logger.debug(
            f"Closed OxenFSFileWriter for {self.path} in dir '{self.target_dir}'"
        )
File without changes
@@ -0,0 +1,26 @@
1
class DatasetPathProvider:
    """An interface for providing data by path and index"""

    @property
    def paths(self):
        """Get the paths to the data files"""
        raise NotImplementedError

    def size(self, path) -> tuple[int, int]:
        """
        Get the size of the dataframe at the given path

        Returns
        -------
        tuple[int, int]
            The size as a (width, height) pair
        """
        raise NotImplementedError

    def slice(self, path, start, end):
        """
        Get a slice of the dataframe at the given path

        Parameters
        ----------
        path : str
            The path to the dataframe
        start : int
            The start index
        end : int
            The end index
        """
        raise NotImplementedError
@@ -0,0 +1,73 @@
1
+ from oxen.providers.dataset_path_provider import DatasetPathProvider
2
+ import time
3
+
4
+
5
class MockPathProvider(DatasetPathProvider):
    """
    A mock implementation for providing data by path and index

    It generates mock data with the given columns and number of rows
    for the set of paths.
    """

    def __init__(
        self,
        paths=None,
        num_rows=1024,
        columns=None,
        download_time=0.1,  # mock a slow download
    ):
        """
        Parameters
        ----------
        paths : list[str] | None
            The mock file paths (default: ["path_1.csv", "path_2.csv"])
        num_rows : int
            The number of rows generated per path
        columns : list[str] | None
            The column names (default: ["path", "x", "y"])
        download_time : float
            Seconds to sleep in `slice` to mimic a slow download
        """
        # Build the defaults per instance; list literals in the signature
        # would be shared between every instance that relies on them.
        self._paths = ["path_1.csv", "path_2.csv"] if paths is None else paths
        self._num_rows = num_rows
        self._columns = ["path", "x", "y"] if columns is None else columns
        self._download_time = download_time
        self._setup()

    def _setup(self):
        """Generate one mock data frame per path, keyed by path."""
        self._data_frame_paths = {
            path: self._make_data_frame(i) for i, path in enumerate(self._paths)
        }

    def _make_data_frame(self, i):
        """
        Build the rows for the i-th path.

        Each row maps every column name to "<column>_<idx>", where idx keeps
        counting across paths (i * num_rows + row number).
        """
        offset = i * self._num_rows
        return [
            {col: f"{col}_{offset + j}" for col in self._columns}
            for j in range(self._num_rows)
        ]

    @property
    def paths(self):
        return self._paths

    def size(self, path) -> tuple[int, int]:
        """
        Get the size of the dataframe at the given path

        Returns (0, 0) when the path is unknown or has no rows.
        """
        rows = self._data_frame_paths.get(path)
        if not rows:
            # Unknown path, or a path without data
            return 0, 0

        # width x height
        return len(rows[0]), len(rows)

    def slice(self, path, start, end):
        """
        Get a slice of the dataframe at the given path

        Parameters
        ----------
        path : str
            The path to the dataframe
        start : int
            The start index
        end : int
            The end index
        """
        # mock a slow download
        time.sleep(self._download_time)
        return self._data_frame_paths[path][start:end]
@@ -0,0 +1,61 @@
1
+ from oxen.providers.dataset_path_provider import DatasetPathProvider
2
+ from oxen import RemoteRepo
3
+ from typing import List
4
+ import json
5
+
6
+
7
class OxenDataFrameProvider(DatasetPathProvider):
    """
    An implementation for providing data by path and index

    It grabs rows of data from the oxen server.
    """

    def __init__(
        self, repo: RemoteRepo, paths: List[str], columns: List[str] | None = None
    ):
        """
        Initialize

        Parameters
        ----------
        repo : RemoteRepo
            The oxen repository you are loading data from
        paths : List[str]
            The paths to the data files needed to load the dataset
        columns : List[str] | None
            The columns of the dataset (default: None)

        Raises
        ------
        ValueError
            If `paths` is empty
        """

        if not paths:
            raise ValueError("Paths must not be empty")

        self._repo = repo
        self._paths = paths
        self._columns = columns

    @property
    def paths(self):
        return self._paths

    def size(self, path) -> tuple[int, int]:
        """
        Get the size of the dataframe at the given path

        The value comes straight from `RemoteRepo.get_df_size`.
        NOTE(review): presumably a (width, height) pair, per the original
        "width x height" comment - confirm against RemoteRepo.
        """
        # width x height
        return self._repo.get_df_size(path)

    def slice(self, path, start, end):
        """
        Get a slice of the dataframe at the given path

        Parameters
        ----------
        path : str
            The path to the dataframe
        start : int
            The start index
        end : int
            The end index

        Returns
        -------
        The JSON-decoded result of `RemoteRepo.get_df_slice`
        """
        return json.loads(self._repo.get_df_slice(path, start, end))