datachain-0.31.2-py3-none-any.whl → datachain-0.31.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/file.py CHANGED
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
35
35
  from datachain.catalog import Catalog
36
36
  from datachain.client.fsspec import Client
37
37
  from datachain.dataset import RowDict
38
+ from datachain.query.session import Session
38
39
 
39
40
  sha256 = partial(hashlib.sha256, usedforsecurity=False)
40
41
 
@@ -252,6 +253,15 @@ class File(DataModel):
252
253
  "last_modified",
253
254
  ]
254
255
 
256
+ # Allowed kwargs we forward to TextIOWrapper
257
+ _TEXT_WRAPPER_ALLOWED: ClassVar[tuple[str, ...]] = (
258
+ "encoding",
259
+ "errors",
260
+ "newline",
261
+ "line_buffering",
262
+ "write_through",
263
+ )
264
+
255
265
  @staticmethod
256
266
  def _validate_dict(
257
267
  v: Optional[Union[str, dict, list[dict]]],
@@ -328,7 +338,6 @@ class File(DataModel):
328
338
  from datachain.catalog.loader import get_catalog
329
339
 
330
340
  catalog = get_catalog()
331
-
332
341
  from datachain.client.fsspec import Client
333
342
 
334
343
  client_cls = Client.get_implementation(path)
@@ -341,6 +350,27 @@ class File(DataModel):
341
350
  file._set_stream(catalog)
342
351
  return file
343
352
 
353
+ @classmethod
354
+ def at(cls, uri: str, session: Optional["Session"] = None) -> "Self":
355
+ """Construct a File from a full URI in one call.
356
+
357
+ Example:
358
+ file = File.at("s3://bucket/path/to/output.png")
359
+ with file.open("wb") as f: ...
360
+ """
361
+ from datachain.client.fsspec import Client
362
+ from datachain.query.session import Session
363
+
364
+ if session is None:
365
+ session = Session.get()
366
+ catalog = session.catalog
367
+
368
+ client_cls = Client.get_implementation(uri)
369
+ source, rel_path = client_cls.split_url(uri)
370
+ file = cls(source=client_cls.get_uri(source), path=rel_path)
371
+ file._set_stream(catalog)
372
+ return file
373
+
344
374
  @classmethod
345
375
  def _from_row(cls, row: "RowDict") -> "Self":
346
376
  return cls(**{key: row[key] for key in cls._datachain_column_types})
@@ -354,28 +384,70 @@ class File(DataModel):
354
384
  return str(PurePosixPath(self.path).parent)
355
385
 
356
386
  @contextmanager
357
- def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
358
- """Open the file and return a file object."""
359
- if self.location:
360
- with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
361
- yield f
387
+ def open(self, mode: str = "rb", **open_kwargs) -> Iterator[Any]:
388
+ """Open the file and return a file-like object.
362
389
 
363
- else:
390
+ Supports both read ("rb", "r") and write modes (e.g. "wb", "w", "ab").
391
+ When opened in a write mode, metadata is refreshed after closing.
392
+ """
393
+ writing = any(ch in mode for ch in "wax+")
394
+ if self.location and writing:
395
+ raise VFileError(
396
+ "Writing to virtual file is not supported",
397
+ self.source,
398
+ self.path,
399
+ )
400
+
401
+ if self._catalog is None:
402
+ raise RuntimeError("Cannot open file: catalog is not set")
403
+
404
+ client: Client = self._catalog.get_client(self.source)
405
+
406
+ if not writing:
407
+ if self.location:
408
+ with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
409
+ yield self._wrap_text(f, mode, open_kwargs)
410
+ return
364
411
  if self._caching_enabled:
365
412
  self.ensure_cached()
366
- client: Client = self._catalog.get_client(self.source)
367
413
  with client.open_object(
368
414
  self, use_cache=self._caching_enabled, cb=self._download_cb
369
415
  ) as f:
370
- yield io.TextIOWrapper(f) if mode == "r" else f
416
+ yield self._wrap_text(f, mode, open_kwargs)
417
+ return
418
+
419
+ # write path
420
+ full_path = client.get_full_path(self.get_path_normalized())
421
+ with client.fs.open(full_path, mode, **open_kwargs) as f:
422
+ yield self._wrap_text(f, mode, open_kwargs)
423
+
424
+ # refresh metadata
425
+ info = client.fs.info(full_path)
426
+ refreshed = client.info_to_file(info, self.get_path_normalized())
427
+ for k, v in refreshed.model_dump().items():
428
+ setattr(self, k, v)
429
+
430
+ def _wrap_text(self, f: Any, mode: str, open_kwargs: dict[str, Any]) -> Any:
431
+ """Return stream possibly wrapped for text."""
432
+ if "b" in mode or isinstance(f, io.TextIOBase):
433
+ return f
434
+ filtered = {
435
+ k: open_kwargs[k] for k in self._TEXT_WRAPPER_ALLOWED if k in open_kwargs
436
+ }
437
+ return io.TextIOWrapper(f, **filtered)
371
438
 
372
439
  def read_bytes(self, length: int = -1):
373
440
  """Returns file contents as bytes."""
374
441
  with self.open() as stream:
375
442
  return stream.read(length)
376
443
 
377
- def read_text(self):
378
- """Returns file contents as text."""
444
+ def read_text(self, **open_kwargs):
445
+ """Return file contents decoded as text.
446
+
447
+ **open_kwargs : Any
448
+ Extra keyword arguments forwarded to ``open(mode="r", ...)``
449
+ (e.g. ``encoding="utf-8"``, ``errors="ignore"``)
450
+ """
379
451
  if self.location:
380
452
  raise VFileError(
381
453
  "Reading text from virtual file is not supported",
@@ -383,7 +455,7 @@ class File(DataModel):
383
455
  self.path,
384
456
  )
385
457
 
386
- with self.open(mode="r") as stream:
458
+ with self.open(mode="r", **open_kwargs) as stream:
387
459
  return stream.read()
388
460
 
389
461
  def read(self, length: int = -1):
@@ -701,14 +773,19 @@ class TextFile(File):
701
773
  """`DataModel` for reading text files."""
702
774
 
703
775
  @contextmanager
704
- def open(self, mode: Literal["rb", "r"] = "r"):
705
- """Open the file and return a file object (default to text mode)."""
706
- with super().open(mode=mode) as stream:
776
+ def open(self, mode: str = "r", **open_kwargs) -> Iterator[Any]:
777
+ """Open the file and return a file-like object.
778
+ Default to text mode"""
779
+ with super().open(mode=mode, **open_kwargs) as stream:
707
780
  yield stream
708
781
 
709
- def read_text(self):
710
- """Returns file contents as text."""
711
- with self.open() as stream:
782
+ def read_text(self, **open_kwargs):
783
+ """Return file contents as text.
784
+
785
+ **open_kwargs : Any
786
+ Extra keyword arguments forwarded to ``open()`` (e.g. encoding).
787
+ """
788
+ with self.open(**open_kwargs) as stream:
712
789
  return stream.read()
713
790
 
714
791
  def save(self, destination: str, client_config: Optional[dict] = None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.31.2
3
+ Version: 0.31.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
75
75
  datachain/lib/clip.py,sha256=ae6uoiymOl53rBXwIfqJkbHrk_IA21R1uJwXo5454C4,6145
76
76
  datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
77
77
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
78
- datachain/lib/file.py,sha256=IGwpCwjsSOpZXlRsatcMKToMmuvYiX6_UtaTjUKAAdg,44511
78
+ datachain/lib/file.py,sha256=gCtF1J1wYXpIZem5sd-ENVtuWip_znE7EQOkV51uFkQ,47321
79
79
  datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
80
80
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
81
81
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
161
161
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
162
162
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
163
163
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
164
- datachain-0.31.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
- datachain-0.31.2.dist-info/METADATA,sha256=ALo4Vp6w2VSanACVy1xv6aHWzbdasSKzD2U8_SybXBU,13898
166
- datachain-0.31.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
- datachain-0.31.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
- datachain-0.31.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
- datachain-0.31.2.dist-info/RECORD,,
164
+ datachain-0.31.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
+ datachain-0.31.4.dist-info/METADATA,sha256=wqjT5wVjclsvbSjyXxABcJ46-JKCGT5t8-MJK55VApM,13898
166
+ datachain-0.31.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
+ datachain-0.31.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
+ datachain-0.31.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
+ datachain-0.31.4.dist-info/RECORD,,