datachain 0.31.2__py3-none-any.whl → 0.31.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/file.py +95 -18
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/METADATA +1 -1
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/RECORD +7 -7
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/WHEEL +0 -0
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.31.2.dist-info → datachain-0.31.4.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
|
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
|
|
|
35
35
|
from datachain.catalog import Catalog
|
|
36
36
|
from datachain.client.fsspec import Client
|
|
37
37
|
from datachain.dataset import RowDict
|
|
38
|
+
from datachain.query.session import Session
|
|
38
39
|
|
|
39
40
|
sha256 = partial(hashlib.sha256, usedforsecurity=False)
|
|
40
41
|
|
|
@@ -252,6 +253,15 @@ class File(DataModel):
|
|
|
252
253
|
"last_modified",
|
|
253
254
|
]
|
|
254
255
|
|
|
256
|
+
# Allowed kwargs we forward to TextIOWrapper
|
|
257
|
+
_TEXT_WRAPPER_ALLOWED: ClassVar[tuple[str, ...]] = (
|
|
258
|
+
"encoding",
|
|
259
|
+
"errors",
|
|
260
|
+
"newline",
|
|
261
|
+
"line_buffering",
|
|
262
|
+
"write_through",
|
|
263
|
+
)
|
|
264
|
+
|
|
255
265
|
@staticmethod
|
|
256
266
|
def _validate_dict(
|
|
257
267
|
v: Optional[Union[str, dict, list[dict]]],
|
|
@@ -328,7 +338,6 @@ class File(DataModel):
|
|
|
328
338
|
from datachain.catalog.loader import get_catalog
|
|
329
339
|
|
|
330
340
|
catalog = get_catalog()
|
|
331
|
-
|
|
332
341
|
from datachain.client.fsspec import Client
|
|
333
342
|
|
|
334
343
|
client_cls = Client.get_implementation(path)
|
|
@@ -341,6 +350,27 @@ class File(DataModel):
|
|
|
341
350
|
file._set_stream(catalog)
|
|
342
351
|
return file
|
|
343
352
|
|
|
353
|
+
@classmethod
|
|
354
|
+
def at(cls, uri: str, session: Optional["Session"] = None) -> "Self":
|
|
355
|
+
"""Construct a File from a full URI in one call.
|
|
356
|
+
|
|
357
|
+
Example:
|
|
358
|
+
file = File.at("s3://bucket/path/to/output.png")
|
|
359
|
+
with file.open("wb") as f: ...
|
|
360
|
+
"""
|
|
361
|
+
from datachain.client.fsspec import Client
|
|
362
|
+
from datachain.query.session import Session
|
|
363
|
+
|
|
364
|
+
if session is None:
|
|
365
|
+
session = Session.get()
|
|
366
|
+
catalog = session.catalog
|
|
367
|
+
|
|
368
|
+
client_cls = Client.get_implementation(uri)
|
|
369
|
+
source, rel_path = client_cls.split_url(uri)
|
|
370
|
+
file = cls(source=client_cls.get_uri(source), path=rel_path)
|
|
371
|
+
file._set_stream(catalog)
|
|
372
|
+
return file
|
|
373
|
+
|
|
344
374
|
@classmethod
|
|
345
375
|
def _from_row(cls, row: "RowDict") -> "Self":
|
|
346
376
|
return cls(**{key: row[key] for key in cls._datachain_column_types})
|
|
@@ -354,28 +384,70 @@ class File(DataModel):
|
|
|
354
384
|
return str(PurePosixPath(self.path).parent)
|
|
355
385
|
|
|
356
386
|
@contextmanager
|
|
357
|
-
def open(self, mode:
|
|
358
|
-
"""Open the file and return a file object.
|
|
359
|
-
if self.location:
|
|
360
|
-
with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
|
|
361
|
-
yield f
|
|
387
|
+
def open(self, mode: str = "rb", **open_kwargs) -> Iterator[Any]:
|
|
388
|
+
"""Open the file and return a file-like object.
|
|
362
389
|
|
|
363
|
-
|
|
390
|
+
Supports both read ("rb", "r") and write modes (e.g. "wb", "w", "ab").
|
|
391
|
+
When opened in a write mode, metadata is refreshed after closing.
|
|
392
|
+
"""
|
|
393
|
+
writing = any(ch in mode for ch in "wax+")
|
|
394
|
+
if self.location and writing:
|
|
395
|
+
raise VFileError(
|
|
396
|
+
"Writing to virtual file is not supported",
|
|
397
|
+
self.source,
|
|
398
|
+
self.path,
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
if self._catalog is None:
|
|
402
|
+
raise RuntimeError("Cannot open file: catalog is not set")
|
|
403
|
+
|
|
404
|
+
client: Client = self._catalog.get_client(self.source)
|
|
405
|
+
|
|
406
|
+
if not writing:
|
|
407
|
+
if self.location:
|
|
408
|
+
with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
|
|
409
|
+
yield self._wrap_text(f, mode, open_kwargs)
|
|
410
|
+
return
|
|
364
411
|
if self._caching_enabled:
|
|
365
412
|
self.ensure_cached()
|
|
366
|
-
client: Client = self._catalog.get_client(self.source)
|
|
367
413
|
with client.open_object(
|
|
368
414
|
self, use_cache=self._caching_enabled, cb=self._download_cb
|
|
369
415
|
) as f:
|
|
370
|
-
yield
|
|
416
|
+
yield self._wrap_text(f, mode, open_kwargs)
|
|
417
|
+
return
|
|
418
|
+
|
|
419
|
+
# write path
|
|
420
|
+
full_path = client.get_full_path(self.get_path_normalized())
|
|
421
|
+
with client.fs.open(full_path, mode, **open_kwargs) as f:
|
|
422
|
+
yield self._wrap_text(f, mode, open_kwargs)
|
|
423
|
+
|
|
424
|
+
# refresh metadata
|
|
425
|
+
info = client.fs.info(full_path)
|
|
426
|
+
refreshed = client.info_to_file(info, self.get_path_normalized())
|
|
427
|
+
for k, v in refreshed.model_dump().items():
|
|
428
|
+
setattr(self, k, v)
|
|
429
|
+
|
|
430
|
+
def _wrap_text(self, f: Any, mode: str, open_kwargs: dict[str, Any]) -> Any:
|
|
431
|
+
"""Return stream possibly wrapped for text."""
|
|
432
|
+
if "b" in mode or isinstance(f, io.TextIOBase):
|
|
433
|
+
return f
|
|
434
|
+
filtered = {
|
|
435
|
+
k: open_kwargs[k] for k in self._TEXT_WRAPPER_ALLOWED if k in open_kwargs
|
|
436
|
+
}
|
|
437
|
+
return io.TextIOWrapper(f, **filtered)
|
|
371
438
|
|
|
372
439
|
def read_bytes(self, length: int = -1):
|
|
373
440
|
"""Returns file contents as bytes."""
|
|
374
441
|
with self.open() as stream:
|
|
375
442
|
return stream.read(length)
|
|
376
443
|
|
|
377
|
-
def read_text(self):
|
|
378
|
-
"""
|
|
444
|
+
def read_text(self, **open_kwargs):
|
|
445
|
+
"""Return file contents decoded as text.
|
|
446
|
+
|
|
447
|
+
**open_kwargs : Any
|
|
448
|
+
Extra keyword arguments forwarded to ``open(mode="r", ...)``
|
|
449
|
+
(e.g. ``encoding="utf-8"``, ``errors="ignore"``)
|
|
450
|
+
"""
|
|
379
451
|
if self.location:
|
|
380
452
|
raise VFileError(
|
|
381
453
|
"Reading text from virtual file is not supported",
|
|
@@ -383,7 +455,7 @@ class File(DataModel):
|
|
|
383
455
|
self.path,
|
|
384
456
|
)
|
|
385
457
|
|
|
386
|
-
with self.open(mode="r") as stream:
|
|
458
|
+
with self.open(mode="r", **open_kwargs) as stream:
|
|
387
459
|
return stream.read()
|
|
388
460
|
|
|
389
461
|
def read(self, length: int = -1):
|
|
@@ -701,14 +773,19 @@ class TextFile(File):
|
|
|
701
773
|
"""`DataModel` for reading text files."""
|
|
702
774
|
|
|
703
775
|
@contextmanager
|
|
704
|
-
def open(self, mode:
|
|
705
|
-
"""Open the file and return a file object
|
|
706
|
-
|
|
776
|
+
def open(self, mode: str = "r", **open_kwargs) -> Iterator[Any]:
|
|
777
|
+
"""Open the file and return a file-like object.
|
|
778
|
+
Default to text mode"""
|
|
779
|
+
with super().open(mode=mode, **open_kwargs) as stream:
|
|
707
780
|
yield stream
|
|
708
781
|
|
|
709
|
-
def read_text(self):
|
|
710
|
-
"""
|
|
711
|
-
|
|
782
|
+
def read_text(self, **open_kwargs):
|
|
783
|
+
"""Return file contents as text.
|
|
784
|
+
|
|
785
|
+
**open_kwargs : Any
|
|
786
|
+
Extra keyword arguments forwarded to ``open()`` (e.g. encoding).
|
|
787
|
+
"""
|
|
788
|
+
with self.open(**open_kwargs) as stream:
|
|
712
789
|
return stream.read()
|
|
713
790
|
|
|
714
791
|
def save(self, destination: str, client_config: Optional[dict] = None):
|
|
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
|
|
|
75
75
|
datachain/lib/clip.py,sha256=ae6uoiymOl53rBXwIfqJkbHrk_IA21R1uJwXo5454C4,6145
|
|
76
76
|
datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
|
|
77
77
|
datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
|
|
78
|
-
datachain/lib/file.py,sha256=
|
|
78
|
+
datachain/lib/file.py,sha256=gCtF1J1wYXpIZem5sd-ENVtuWip_znE7EQOkV51uFkQ,47321
|
|
79
79
|
datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
|
|
80
80
|
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
81
81
|
datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
|
|
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
161
161
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
162
162
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
163
163
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
164
|
-
datachain-0.31.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
165
|
-
datachain-0.31.
|
|
166
|
-
datachain-0.31.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
167
|
-
datachain-0.31.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
168
|
-
datachain-0.31.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
169
|
-
datachain-0.31.2.dist-info/RECORD,,
|
|
164
|
+
datachain-0.31.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
165
|
+
datachain-0.31.4.dist-info/METADATA,sha256=wqjT5wVjclsvbSjyXxABcJ46-JKCGT5t8-MJK55VApM,13898
|
|
166
|
+
datachain-0.31.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
167
|
+
datachain-0.31.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
168
|
+
datachain-0.31.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
169
|
+
datachain-0.31.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|