datachain-0.34.0-py3-none-any.whl → datachain-0.34.1-py3-none-any.whl

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -144,26 +144,19 @@ def shutdown_process(
144
144
  return proc.wait()
145
145
 
146
146
 
147
- def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
147
+ def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
148
148
  buffer = b""
149
+ while byt := stream.read(1): # Read one byte at a time
150
+ buffer += byt
149
151
 
150
- try:
151
- while byt := stream.read(1): # Read one byte at a time
152
- buffer += byt
153
-
154
- if byt in (b"\n", b"\r"): # Check for newline or carriage return
155
- line = buffer.decode("utf-8", errors="replace")
156
- callback(line)
157
- buffer = b"" # Clear buffer for the next line
158
-
159
- if buffer: # Handle any remaining data in the buffer
160
- line = buffer.decode("utf-8", errors="replace")
152
+ if byt in (b"\n", b"\r"): # Check for newline or carriage return
153
+ line = buffer.decode("utf-8")
161
154
  callback(line)
162
- finally:
163
- try:
164
- stream.close() # Ensure output is closed
165
- except Exception: # noqa: BLE001, S110
166
- pass
155
+ buffer = b"" # Clear buffer for next line
156
+
157
+ if buffer: # Handle any remaining data in the buffer
158
+ line = buffer.decode("utf-8")
159
+ callback(line)
167
160
 
168
161
 
169
162
  class DatasetRowsFetcher(NodesThreadPool):
@@ -1767,13 +1760,13 @@ class Catalog:
1767
1760
  recursive=recursive,
1768
1761
  )
1769
1762
 
1770
- @staticmethod
1771
1763
  def query(
1764
+ self,
1772
1765
  query_script: str,
1773
1766
  env: Optional[Mapping[str, str]] = None,
1774
1767
  python_executable: str = sys.executable,
1775
- stdout_callback: Optional[Callable[[str], None]] = None,
1776
- stderr_callback: Optional[Callable[[str], None]] = None,
1768
+ capture_output: bool = False,
1769
+ output_hook: Callable[[str], None] = noop,
1777
1770
  params: Optional[dict[str, str]] = None,
1778
1771
  job_id: Optional[str] = None,
1779
1772
  interrupt_timeout: Optional[int] = None,
@@ -1788,18 +1781,13 @@ class Catalog:
1788
1781
  },
1789
1782
  )
1790
1783
  popen_kwargs: dict[str, Any] = {}
1791
-
1792
- if stdout_callback is not None:
1793
- popen_kwargs = {"stdout": subprocess.PIPE}
1794
- if stderr_callback is not None:
1795
- popen_kwargs["stderr"] = subprocess.PIPE
1784
+ if capture_output:
1785
+ popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
1796
1786
 
1797
1787
  def raise_termination_signal(sig: int, _: Any) -> NoReturn:
1798
1788
  raise TerminationSignal(sig)
1799
1789
 
1800
- stdout_thread: Optional[Thread] = None
1801
- stderr_thread: Optional[Thread] = None
1802
-
1790
+ thread: Optional[Thread] = None
1803
1791
  with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
1804
1792
  logger.info("Starting process %s", proc.pid)
1805
1793
 
@@ -1813,20 +1801,10 @@ class Catalog:
1813
1801
  orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
1814
1802
  signal.signal(signal.SIGTERM, raise_termination_signal)
1815
1803
  try:
1816
- if stdout_callback is not None:
1817
- stdout_thread = Thread(
1818
- target=process_output,
1819
- args=(proc.stdout, stdout_callback),
1820
- daemon=True,
1821
- )
1822
- stdout_thread.start()
1823
- if stderr_callback is not None:
1824
- stderr_thread = Thread(
1825
- target=process_output,
1826
- args=(proc.stderr, stderr_callback),
1827
- daemon=True,
1828
- )
1829
- stderr_thread.start()
1804
+ if capture_output:
1805
+ args = (proc.stdout, output_hook)
1806
+ thread = Thread(target=_process_stream, args=args, daemon=True)
1807
+ thread.start()
1830
1808
 
1831
1809
  proc.wait()
1832
1810
  except TerminationSignal as exc:
@@ -1844,22 +1822,8 @@ class Catalog:
1844
1822
  finally:
1845
1823
  signal.signal(signal.SIGTERM, orig_sigterm_handler)
1846
1824
  signal.signal(signal.SIGINT, orig_sigint_handler)
1847
- # wait for the reader thread
1848
- thread_join_timeout_seconds = 30
1849
- if stdout_thread is not None:
1850
- stdout_thread.join(timeout=thread_join_timeout_seconds)
1851
- if stdout_thread.is_alive():
1852
- logger.warning(
1853
- "stdout thread is still alive after %s seconds",
1854
- thread_join_timeout_seconds,
1855
- )
1856
- if stderr_thread is not None:
1857
- stderr_thread.join(timeout=thread_join_timeout_seconds)
1858
- if stderr_thread.is_alive():
1859
- logger.warning(
1860
- "stderr thread is still alive after %s seconds",
1861
- thread_join_timeout_seconds,
1862
- )
1825
+ if thread:
1826
+ thread.join() # wait for the reader thread
1863
1827
 
1864
1828
  logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
1865
1829
  if proc.returncode in (
@@ -1963,12 +1963,15 @@ class DataChain:
1963
1963
  self,
1964
1964
  flatten: bool = False,
1965
1965
  include_hidden: bool = True,
1966
+ as_object: bool = False,
1966
1967
  ) -> "pd.DataFrame":
1967
1968
  """Return a pandas DataFrame from the chain.
1968
1969
 
1969
1970
  Parameters:
1970
1971
  flatten: Whether to use a multiindex or flatten column names.
1971
1972
  include_hidden: Whether to include hidden columns.
1973
+ as_object: Whether to emit a dataframe backed by Python objects
1974
+ rather than pandas-inferred dtypes.
1972
1975
 
1973
1976
  Returns:
1974
1977
  pd.DataFrame: A pandas DataFrame representation of the chain.
@@ -1984,6 +1987,9 @@ class DataChain:
1984
1987
  columns = pd.MultiIndex.from_tuples(map(tuple, headers))
1985
1988
 
1986
1989
  results = self.results(include_hidden=include_hidden)
1990
+ if as_object:
1991
+ df = pd.DataFrame(results, columns=columns, dtype=object)
1992
+ return df.where(pd.notna(df), None)
1987
1993
  return pd.DataFrame.from_records(results, columns=columns)
1988
1994
 
1989
1995
  def show(
@@ -2006,7 +2012,11 @@ class DataChain:
2006
2012
  import pandas as pd
2007
2013
 
2008
2014
  dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
2009
- df = dc.to_pandas(flatten, include_hidden=include_hidden)
2015
+ df = dc.to_pandas(
2016
+ flatten,
2017
+ include_hidden=include_hidden,
2018
+ as_object=True,
2019
+ )
2010
2020
 
2011
2021
  if df.empty:
2012
2022
  print("Empty result")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.0
3
+ Version: 0.34.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -23,7 +23,7 @@ datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
23
23
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
24
24
  datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
25
25
  datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
26
- datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
26
+ datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
27
27
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
28
28
  datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
29
29
  datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -107,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
107
107
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
108
108
  datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
109
109
  datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
110
- datachain/lib/dc/datachain.py,sha256=uUAPchtNXyJo1tzFd3z1MLWhVC2dzO2ZjhTS0naqXiE,104032
110
+ datachain/lib/dc/datachain.py,sha256=Xh7Hwpvow_3QHPhsPSpP99HDKlwcJOpZEZJUNa_Ex9c,104396
111
111
  datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
112
112
  datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
113
113
  datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -164,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
164
164
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
165
165
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
166
166
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
167
- datachain-0.34.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
168
- datachain-0.34.0.dist-info/METADATA,sha256=YBmM_daqadosEKHBY-QLxSRxYn55XuhB0S0tfeEfzts,13655
169
- datachain-0.34.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
- datachain-0.34.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
171
- datachain-0.34.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
172
- datachain-0.34.0.dist-info/RECORD,,
167
+ datachain-0.34.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
168
+ datachain-0.34.1.dist-info/METADATA,sha256=x6vwqoDfsyj5T08GdAT7Qs13lv9uIonatPaxr_nPQ5Y,13655
169
+ datachain-0.34.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
+ datachain-0.34.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
171
+ datachain-0.34.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
172
+ datachain-0.34.1.dist-info/RECORD,,