datachain 0.31.3__py3-none-any.whl → 0.31.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -144,26 +144,19 @@ def shutdown_process(
144
144
  return proc.wait()
145
145
 
146
146
 
147
- def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
147
+ def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
148
148
  buffer = b""
149
+ while byt := stream.read(1): # Read one byte at a time
150
+ buffer += byt
149
151
 
150
- try:
151
- while byt := stream.read(1): # Read one byte at a time
152
- buffer += byt
153
-
154
- if byt in (b"\n", b"\r"): # Check for newline or carriage return
155
- line = buffer.decode("utf-8", errors="replace")
156
- callback(line)
157
- buffer = b"" # Clear buffer for the next line
158
-
159
- if buffer: # Handle any remaining data in the buffer
160
- line = buffer.decode("utf-8", errors="replace")
152
+ if byt in (b"\n", b"\r"): # Check for newline or carriage return
153
+ line = buffer.decode("utf-8")
161
154
  callback(line)
162
- finally:
163
- try:
164
- stream.close() # Ensure output is closed
165
- except Exception: # noqa: BLE001, S110
166
- pass
155
+ buffer = b"" # Clear buffer for next line
156
+
157
+ if buffer: # Handle any remaining data in the buffer
158
+ line = buffer.decode("utf-8")
159
+ callback(line)
167
160
 
168
161
 
169
162
  class DatasetRowsFetcher(NodesThreadPool):
@@ -1767,13 +1760,13 @@ class Catalog:
1767
1760
  recursive=recursive,
1768
1761
  )
1769
1762
 
1770
- @staticmethod
1771
1763
  def query(
1764
+ self,
1772
1765
  query_script: str,
1773
1766
  env: Optional[Mapping[str, str]] = None,
1774
1767
  python_executable: str = sys.executable,
1775
- stdout_callback: Optional[Callable[[str], None]] = None,
1776
- stderr_callback: Optional[Callable[[str], None]] = None,
1768
+ capture_output: bool = False,
1769
+ output_hook: Callable[[str], None] = noop,
1777
1770
  params: Optional[dict[str, str]] = None,
1778
1771
  job_id: Optional[str] = None,
1779
1772
  interrupt_timeout: Optional[int] = None,
@@ -1788,18 +1781,13 @@ class Catalog:
1788
1781
  },
1789
1782
  )
1790
1783
  popen_kwargs: dict[str, Any] = {}
1791
-
1792
- if stdout_callback is not None:
1793
- popen_kwargs = {"stdout": subprocess.PIPE}
1794
- if stderr_callback is not None:
1795
- popen_kwargs["stderr"] = subprocess.PIPE
1784
+ if capture_output:
1785
+ popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
1796
1786
 
1797
1787
  def raise_termination_signal(sig: int, _: Any) -> NoReturn:
1798
1788
  raise TerminationSignal(sig)
1799
1789
 
1800
- stdout_thread: Optional[Thread] = None
1801
- stderr_thread: Optional[Thread] = None
1802
-
1790
+ thread: Optional[Thread] = None
1803
1791
  with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
1804
1792
  logger.info("Starting process %s", proc.pid)
1805
1793
 
@@ -1813,20 +1801,10 @@ class Catalog:
1813
1801
  orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
1814
1802
  signal.signal(signal.SIGTERM, raise_termination_signal)
1815
1803
  try:
1816
- if stdout_callback is not None:
1817
- stdout_thread = Thread(
1818
- target=process_output,
1819
- args=(proc.stdout, stdout_callback),
1820
- daemon=True,
1821
- )
1822
- stdout_thread.start()
1823
- if stderr_callback is not None:
1824
- stderr_thread = Thread(
1825
- target=process_output,
1826
- args=(proc.stderr, stderr_callback),
1827
- daemon=True,
1828
- )
1829
- stderr_thread.start()
1804
+ if capture_output:
1805
+ args = (proc.stdout, output_hook)
1806
+ thread = Thread(target=_process_stream, args=args, daemon=True)
1807
+ thread.start()
1830
1808
 
1831
1809
  proc.wait()
1832
1810
  except TerminationSignal as exc:
@@ -1844,22 +1822,8 @@ class Catalog:
1844
1822
  finally:
1845
1823
  signal.signal(signal.SIGTERM, orig_sigterm_handler)
1846
1824
  signal.signal(signal.SIGINT, orig_sigint_handler)
1847
- # wait for the reader thread
1848
- thread_join_timeout_seconds = 30
1849
- if stdout_thread is not None:
1850
- stdout_thread.join(timeout=thread_join_timeout_seconds)
1851
- if stdout_thread.is_alive():
1852
- logger.warning(
1853
- "stdout thread is still alive after %s seconds",
1854
- thread_join_timeout_seconds,
1855
- )
1856
- if stderr_thread is not None:
1857
- stderr_thread.join(timeout=thread_join_timeout_seconds)
1858
- if stderr_thread.is_alive():
1859
- logger.warning(
1860
- "stderr thread is still alive after %s seconds",
1861
- thread_join_timeout_seconds,
1862
- )
1825
+ if thread:
1826
+ thread.join() # wait for the reader thread
1863
1827
 
1864
1828
  logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1865
1829
  if proc.returncode in (
datachain/lib/file.py CHANGED
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
35
35
  from datachain.catalog import Catalog
36
36
  from datachain.client.fsspec import Client
37
37
  from datachain.dataset import RowDict
38
+ from datachain.query.session import Session
38
39
 
39
40
  sha256 = partial(hashlib.sha256, usedforsecurity=False)
40
41
 
@@ -252,6 +253,15 @@ class File(DataModel):
252
253
  "last_modified",
253
254
  ]
254
255
 
256
+ # Allowed kwargs we forward to TextIOWrapper
257
+ _TEXT_WRAPPER_ALLOWED: ClassVar[tuple[str, ...]] = (
258
+ "encoding",
259
+ "errors",
260
+ "newline",
261
+ "line_buffering",
262
+ "write_through",
263
+ )
264
+
255
265
  @staticmethod
256
266
  def _validate_dict(
257
267
  v: Optional[Union[str, dict, list[dict]]],
@@ -328,7 +338,6 @@ class File(DataModel):
328
338
  from datachain.catalog.loader import get_catalog
329
339
 
330
340
  catalog = get_catalog()
331
-
332
341
  from datachain.client.fsspec import Client
333
342
 
334
343
  client_cls = Client.get_implementation(path)
@@ -341,6 +350,27 @@ class File(DataModel):
341
350
  file._set_stream(catalog)
342
351
  return file
343
352
 
353
+ @classmethod
354
+ def at(cls, uri: str, session: Optional["Session"] = None) -> "Self":
355
+ """Construct a File from a full URI in one call.
356
+
357
+ Example:
358
+ file = File.at("s3://bucket/path/to/output.png")
359
+ with file.open("wb") as f: ...
360
+ """
361
+ from datachain.client.fsspec import Client
362
+ from datachain.query.session import Session
363
+
364
+ if session is None:
365
+ session = Session.get()
366
+ catalog = session.catalog
367
+
368
+ client_cls = Client.get_implementation(uri)
369
+ source, rel_path = client_cls.split_url(uri)
370
+ file = cls(source=client_cls.get_uri(source), path=rel_path)
371
+ file._set_stream(catalog)
372
+ return file
373
+
344
374
  @classmethod
345
375
  def _from_row(cls, row: "RowDict") -> "Self":
346
376
  return cls(**{key: row[key] for key in cls._datachain_column_types})
@@ -354,28 +384,70 @@ class File(DataModel):
354
384
  return str(PurePosixPath(self.path).parent)
355
385
 
356
386
  @contextmanager
357
- def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
358
- """Open the file and return a file object."""
359
- if self.location:
360
- with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
361
- yield f
387
+ def open(self, mode: str = "rb", **open_kwargs) -> Iterator[Any]:
388
+ """Open the file and return a file-like object.
362
389
 
363
- else:
390
+ Supports both read ("rb", "r") and write modes (e.g. "wb", "w", "ab").
391
+ When opened in a write mode, metadata is refreshed after closing.
392
+ """
393
+ writing = any(ch in mode for ch in "wax+")
394
+ if self.location and writing:
395
+ raise VFileError(
396
+ "Writing to virtual file is not supported",
397
+ self.source,
398
+ self.path,
399
+ )
400
+
401
+ if self._catalog is None:
402
+ raise RuntimeError("Cannot open file: catalog is not set")
403
+
404
+ client: Client = self._catalog.get_client(self.source)
405
+
406
+ if not writing:
407
+ if self.location:
408
+ with VFileRegistry.open(self, self.location) as f: # type: ignore[arg-type]
409
+ yield self._wrap_text(f, mode, open_kwargs)
410
+ return
364
411
  if self._caching_enabled:
365
412
  self.ensure_cached()
366
- client: Client = self._catalog.get_client(self.source)
367
413
  with client.open_object(
368
414
  self, use_cache=self._caching_enabled, cb=self._download_cb
369
415
  ) as f:
370
- yield io.TextIOWrapper(f) if mode == "r" else f
416
+ yield self._wrap_text(f, mode, open_kwargs)
417
+ return
418
+
419
+ # write path
420
+ full_path = client.get_full_path(self.get_path_normalized())
421
+ with client.fs.open(full_path, mode, **open_kwargs) as f:
422
+ yield self._wrap_text(f, mode, open_kwargs)
423
+
424
+ # refresh metadata
425
+ info = client.fs.info(full_path)
426
+ refreshed = client.info_to_file(info, self.get_path_normalized())
427
+ for k, v in refreshed.model_dump().items():
428
+ setattr(self, k, v)
429
+
430
+ def _wrap_text(self, f: Any, mode: str, open_kwargs: dict[str, Any]) -> Any:
431
+ """Return stream possibly wrapped for text."""
432
+ if "b" in mode or isinstance(f, io.TextIOBase):
433
+ return f
434
+ filtered = {
435
+ k: open_kwargs[k] for k in self._TEXT_WRAPPER_ALLOWED if k in open_kwargs
436
+ }
437
+ return io.TextIOWrapper(f, **filtered)
371
438
 
372
439
  def read_bytes(self, length: int = -1):
373
440
  """Returns file contents as bytes."""
374
441
  with self.open() as stream:
375
442
  return stream.read(length)
376
443
 
377
- def read_text(self):
378
- """Returns file contents as text."""
444
+ def read_text(self, **open_kwargs):
445
+ """Return file contents decoded as text.
446
+
447
+ **open_kwargs : Any
448
+ Extra keyword arguments forwarded to ``open(mode="r", ...)``
449
+ (e.g. ``encoding="utf-8"``, ``errors="ignore"``)
450
+ """
379
451
  if self.location:
380
452
  raise VFileError(
381
453
  "Reading text from virtual file is not supported",
@@ -383,7 +455,7 @@ class File(DataModel):
383
455
  self.path,
384
456
  )
385
457
 
386
- with self.open(mode="r") as stream:
458
+ with self.open(mode="r", **open_kwargs) as stream:
387
459
  return stream.read()
388
460
 
389
461
  def read(self, length: int = -1):
@@ -701,14 +773,19 @@ class TextFile(File):
701
773
  """`DataModel` for reading text files."""
702
774
 
703
775
  @contextmanager
704
- def open(self, mode: Literal["rb", "r"] = "r"):
705
- """Open the file and return a file object (default to text mode)."""
706
- with super().open(mode=mode) as stream:
776
+ def open(self, mode: str = "r", **open_kwargs) -> Iterator[Any]:
777
+ """Open the file and return a file-like object.
778
+ Default to text mode"""
779
+ with super().open(mode=mode, **open_kwargs) as stream:
707
780
  yield stream
708
781
 
709
- def read_text(self):
710
- """Returns file contents as text."""
711
- with self.open() as stream:
782
+ def read_text(self, **open_kwargs):
783
+ """Return file contents as text.
784
+
785
+ **open_kwargs : Any
786
+ Extra keyword arguments forwarded to ``open()`` (e.g. encoding).
787
+ """
788
+ with self.open(**open_kwargs) as stream:
712
789
  return stream.read()
713
790
 
714
791
  def save(self, destination: str, client_config: Optional[dict] = None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.31.3
3
+ Version: 0.31.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -21,7 +21,7 @@ datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
21
21
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
22
22
  datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
23
23
  datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
24
- datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
24
+ datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
25
25
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
26
26
  datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
27
27
  datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
75
75
  datachain/lib/clip.py,sha256=ae6uoiymOl53rBXwIfqJkbHrk_IA21R1uJwXo5454C4,6145
76
76
  datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
77
77
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
78
- datachain/lib/file.py,sha256=IGwpCwjsSOpZXlRsatcMKToMmuvYiX6_UtaTjUKAAdg,44511
78
+ datachain/lib/file.py,sha256=gCtF1J1wYXpIZem5sd-ENVtuWip_znE7EQOkV51uFkQ,47321
79
79
  datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
80
80
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
81
81
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
161
161
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
162
162
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
163
163
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
164
- datachain-0.31.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
- datachain-0.31.3.dist-info/METADATA,sha256=dZjBfjFrwEjatAGqlONnD8fIO6H-2Njw1rHyvvZQ1kU,13898
166
- datachain-0.31.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
- datachain-0.31.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
- datachain-0.31.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
- datachain-0.31.3.dist-info/RECORD,,
164
+ datachain-0.31.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
+ datachain-0.31.4.dist-info/METADATA,sha256=wqjT5wVjclsvbSjyXxABcJ46-JKCGT5t8-MJK55VApM,13898
166
+ datachain-0.31.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
+ datachain-0.31.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
+ datachain-0.31.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
+ datachain-0.31.4.dist-info/RECORD,,