datachain 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED

@@ -9,11 +9,9 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from contextlib import contextmanager, nullcontext
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
@@ -24,7 +22,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    NamedTuple,
     NoReturn,
     Optional,
     Union,
@@ -59,7 +56,6 @@ from datachain.error import (
     PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptCompileError,
-    QueryScriptDatasetNotFound,
     QueryScriptRunError,
 )
 from datachain.listing import Listing
@@ -77,7 +73,6 @@ from datachain.utils import (
 )

 from .datasource import DataSource
-from .subclass import SubclassFinder

 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +87,6 @@ logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]

 TTL_INT = 4 * 60 * 60

@@ -118,44 +112,19 @@ def noop(_: str):
     pass


-@contextmanager
-def print_and_capture(
-    stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
-) -> "Iterator[list[str]]":
-    lines: list[str] = []
-    append = lines.append
+def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+    buffer = b""
+    while byt := stream.read(1):  # Read one byte at a time
+        buffer += byt

-    def loop() -> None:
-        buffer = b""
-        while byt := stream.read(1):  # Read one byte at a time
-            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
-
-            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-                line = buffer.decode("utf-8")
-                print(line, end="")
-                callback(line)
-                append(line)
-                buffer = b""  # Clear buffer for next line
-
-        if buffer:  # Handle any remaining data in the buffer
+        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
             line = buffer.decode("utf-8")
-            print(line, end="")
             callback(line)
-            append(line)
-
-    thread = Thread(target=loop, daemon=True)
-    thread.start()
-
-    try:
-        yield lines
-    finally:
-        thread.join()
+            buffer = b""  # Clear buffer for next line

-
-class QueryResult(NamedTuple):
-    dataset: Optional[DatasetRecord]
-    version: Optional[int]
-    output: str
+    if buffer:  # Handle any remaining data in the buffer
+        line = buffer.decode("utf-8")
+        callback(line)


 class DatasetRowsFetcher(NodesThreadPool):
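
Note: `_process_stream` replaces the old `print_and_capture` context manager; it no longer echoes to stdout or accumulates lines, it only forwards each completed line to the callback. A minimal sketch of driving it directly (the import path is an assumption; the helper is module-level in `datachain/catalog/catalog.py`):

    import io

    from datachain.catalog.catalog import _process_stream  # assumed import path

    lines: list[str] = []
    stream = io.BytesIO(b"first line\nsecond\r\ntail without newline")
    _process_stream(stream, lines.append)        # reads one byte at a time
    assert lines[0] == "first line\n"            # callback gets the newline included
    assert lines[-1] == "tail without newline"   # leftover buffer is flushed at EOF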
@@ -569,12 +538,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""


-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -658,34 +621,8 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast

-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
-        code_ast = ast.parse(script)
-        code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
-
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
         return Client.parse_url(uri, self.cache, **config)
@@ -1416,7 +1353,8 @@ class Catalog:

         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None) for v in d.versions
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )

     def ls_dataset_rows(
@@ -1834,14 +1772,15 @@ class Catalog:
     def query(
         self,
         query_script: str,
-        envs: Optional[Mapping[str, str]] = None,
-        python_executable: Optional[str] = None,
+        env: Optional[Mapping[str, str]] = None,
+        python_executable: str = sys.executable,
         save: bool = False,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
-    ) -> QueryResult:
+        _execute_last_expression: bool = False,
+    ) -> None:
         """
         Method to run custom user Python script to run a query and, as result,
         creates new dataset from the results of a query.
@@ -1864,170 +1803,51 @@ class Catalog:
             C.size > 1000
         )
         """
-
-        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
-            dir=os.getcwd(), suffix=".py", delete=False
-        )
-        _, feature_module = os.path.split(feature_file.name)
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                save,
-                job_id,
-            )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)
-
-        output = "".join(lines)
-
-        if proc.returncode:
-            if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
-                raise QueryScriptCancelError(
-                    "Query script was canceled by user",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            raise QueryScriptRunError(
-                f"Query script exited with error code {proc.returncode}",
-                return_code=proc.returncode,
-                output=output,
-            )
-
-        try:
-            result = json.loads(response_text)
-        except ValueError:
-            result = None
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save:
-            dataset, version = self.save_result(
-                query_script, result, output, version, job_id
-            )
-
-        return QueryResult(dataset=dataset, version=version, output=output)
-
-    def run_query(
-        self,
-        python_executable: str,
-        query_script: str,
-        envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
-        capture_output: bool,
-        feature_module: str,
-        output_hook: Callable[[str], None],
-        params: Optional[dict[str, str]],
-        save: bool,
-        job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
-        try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
-        except Exception as exc:
-            raise QueryScriptCompileError(
-                f"Query script failed to compile, reason: {exc}"
-            ) from exc
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
+        if _execute_last_expression:
+            try:
+                code_ast = ast.parse(query_script)
+                code_ast = self.attach_query_wrapper(code_ast)
+                query_script_compiled = ast.unparse(code_ast)
+            except Exception as exc:
+                raise QueryScriptCompileError(
+                    f"Query script failed to compile, reason: {exc}"
+                ) from exc
         else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
-        envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
-        envs.update(
+            query_script_compiled = query_script
+            assert not save
+
+        env = dict(env or os.environ)
+        env.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
-        with subprocess.Popen(  # noqa: S603
-            [python_executable, "-c", query_script_compiled],
-            env=envs,
-            stdout=subprocess.PIPE if capture_output else None,
-            stderr=subprocess.STDOUT if capture_output else None,
-            bufsize=1,
-            text=False,
-            **kwargs,
-        ) as proc:
-            os.close(w)
-
-            out = proc.stdout
-            _lines: list[str] = []
-            ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-            with ctx as lines, open(r) as f:
-                response_text = ""
-                while proc.poll() is None:
-                    response_text += f.readline()
-                    time.sleep(0.1)
-                response_text += f.readline()
-            return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
+        popen_kwargs = {}
+        if capture_output:
+            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        cmd = [python_executable, "-c", query_script_compiled]
+        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # type: ignore[call-overload] # noqa: S603
+            if capture_output:
+                args = (proc.stdout, output_hook)
+                thread = Thread(target=_process_stream, args=args, daemon=True)
+                thread.start()
+                thread.join()  # wait for the reader thread
+
+        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+            raise QueryScriptCancelError(
+                "Query script was canceled by user",
+                return_code=proc.returncode,
+            )
+        if proc.returncode:
+            raise QueryScriptRunError(
+                f"Query script exited with error code {proc.returncode}",
+                return_code=proc.returncode,
             )
-        name, version = exec_result
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version

     def cp(
         self,
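
Note: with this change `Catalog.query` compiles the script (optionally wrapping its last expression when `_execute_last_expression=True`), runs it in a subprocess, streams its output through `_process_stream`, and raises on failure; it returns `None` instead of a `QueryResult`. A rough usage sketch against the new signature (the catalog construction and the script body are illustrative assumptions):

    from datachain.catalog import get_catalog

    catalog = get_catalog()
    script = "print('hello from a query script')"

    catalog.query(
        script,
        env=None,            # defaults to a copy of os.environ
        save=False,
        capture_output=True,
        output_hook=print,   # called once per line of subprocess output
    )
    # Errors now surface only as QueryScriptCompileError, QueryScriptRunError,
    # or QueryScriptCancelError; no dataset/QueryResult is returned.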
datachain/error.py CHANGED
@@ -42,10 +42,6 @@ class QueryScriptRunError(Exception):
         super().__init__(self.message)


-class QueryScriptDatasetNotFound(QueryScriptRunError):  # noqa: N818
-    pass
-
-
 class QueryScriptCancelError(QueryScriptRunError):
     pass

datachain/job.py CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union

 J = TypeVar("J", bound="Job")

@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
             status,
             created_at,
datachain/lib/clip.py CHANGED
@@ -18,7 +18,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
         hasattr(model, method_name) and inspect.ismethod(getattr(model, method_name))
     ):
         method = getattr(model, method_name)
-        return lambda x: method(torch.tensor(x))
+        return lambda x: method(torch.as_tensor(x).clone().detach())

     # Check for model from clip or open_clip library
     method_name = f"encode_{type}"
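
Note: the move from `torch.tensor(x)` to `torch.as_tensor(x).clone().detach()` (repeated below in `image.py` and `text.py`) silences PyTorch's UserWarning about copy-constructing from an existing tensor while still producing an explicit, autograd-detached copy. A small illustration:

    import torch

    t = torch.ones(3)
    a = torch.tensor(t)                      # works, but warns: prefer clone().detach()
    b = torch.as_tensor(t).clone().detach()  # same values, no warning, detached copy
    assert torch.equal(a, b)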
datachain/lib/dc.py CHANGED
@@ -56,7 +56,7 @@ from datachain.query.dataset import (
     PartitionByType,
     detach,
 )
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook

@@ -112,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Dataset{name} from values error: {msg}")


+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
 class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(self, on: Sequence[str], right_on: Optional[Sequence[str]], msg: str):  # noqa: D107
-        on_str = ", ".join(on) if isinstance(on, Sequence) else ""
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
         right_on_str = (
-            ", right_on='" + ", ".join(right_on) + "'"
+            ", right_on='" + _get_str(right_on) + "'"
             if right_on and isinstance(right_on, Sequence)
             else ""
         )
@@ -139,7 +159,7 @@ class Sys(DataModel):


 class DataChain(DatasetQuery):
-    """AI 🔗 DataChain - a data structure for batch data processing and evaluation.
+    """DataChain - a data structure for batch data processing and evaluation.

     It represents a sequence of data manipulation steps such as reading data from
     storages, running AI or LLM models or calling external services API to validate or
@@ -252,13 +272,24 @@ class DataChain(DatasetQuery):
         """Returns Column instance with a type if name is found in current schema,
         otherwise raises an exception.
         """
-        name_path = name.split(".")
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
         for path, type_, _, _ in self.signals_schema.get_flat_tree():
             if path == name_path:
                 return Column(name, python_to_sql(type_))

         raise ValueError(f"Column with name {name} not found in the schema")

+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
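
Note: `column()` now resolves either the dotted signal path or the flattened database name, and the new `c()` helper additionally binds the column to the chain's table. A hedged sketch (the storage URI is illustrative, and it assumes `DEFAULT_DELIMITER` is the separator used in flattened signal names such as `file__size`):

    from datachain.lib.dc import DataChain

    chain = DataChain.from_storage("s3://mybucket/images/")  # illustrative source
    col_a = chain.column("file.size")   # dotted signal path
    col_b = chain.column("file__size")  # flattened DB name, split on DEFAULT_DELIMITER
    col_c = chain.c("file.size")        # same column, attached to chain.table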
@@ -384,7 +415,7 @@ class DataChain(DatasetQuery):
             .save(list_dataset_name, listing=True)
         )

-        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc = cls.from_dataset(list_dataset_name, session=session, settings=settings)
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})

         return ls(dc, list_path, recursive=recursive, object_name=object_name)
@@ -395,6 +426,7 @@ class DataChain(DatasetQuery):
         name: str,
         version: Optional[int] = None,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
     ) -> "DataChain":
         """Get data from a saved Dataset. It returns the chain itself.

@@ -407,7 +439,7 @@ class DataChain(DatasetQuery):
             chain = DataChain.from_dataset("my_cats")
             ```
         """
-        return DataChain(name=name, version=version, session=session)
+        return DataChain(name=name, version=version, session=session, settings=settings)

     @classmethod
     def from_json(
@@ -1140,8 +1172,17 @@ class DataChain(DatasetQuery):
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[str, Sequence[str]],
-        right_on: Union[str, Sequence[str], None] = None,
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
         inner=False,
         rname="right_",
     ) -> "Self":
@@ -1166,7 +1207,7 @@ class DataChain(DatasetQuery):
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")

-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
             on = [on]
         elif not isinstance(on, Sequence):
             raise DatasetMergeError(
@@ -1175,19 +1216,15 @@ class DataChain(DatasetQuery):
                 f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
             )

-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                 right_on = [right_on]
             elif not isinstance(right_on, Sequence):
                 raise DatasetMergeError(
                     on,
                     right_on,
                     "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                 )

             if len(right_on) != len(on):
@@ -1195,34 +1232,39 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )

-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
         if self == right_ds:
             right_ds = right_ds.clone(new_table=True)

+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
         ops = [
-            self.c(left) == right_ds.c(right)
-            for left, right in zip(on_columns, right_on_columns)
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
         ]

+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
         ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")

         ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
             right_signals_schema, rname
         )
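
Note: `merge()` now accepts plain names, bound `Column` objects, or SQLAlchemy column expressions (and sequences mixing them) for `on`/`right_on`, resolving names through `c()` and collecting unresolvable ones into a single `DatasetMergeError`. A sketch of the accepted forms (dataset and signal names are illustrative):

    from datachain.lib.dc import DataChain

    orders = DataChain.from_dataset("orders")
    customers = DataChain.from_dataset("customers")

    m1 = orders.merge(customers, on="customer_id")             # plain signal name
    m2 = orders.merge(customers, on=orders.c("customer_id"))   # bound Column
    m3 = orders.merge(customers, on="customer_id", right_on="id", inner=True)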
@@ -1581,6 +1623,8 @@ class DataChain(DatasetQuery):
         model_name: str = "",
         source: bool = True,
         nrows=None,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1597,6 +1641,8 @@ class DataChain(DatasetQuery):
             model_name : Generated model name.
             source : Whether to include info about the source file.
             nrows : Optional row limit.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.

         Example:
             Reading a csv file:
@@ -1613,7 +1659,9 @@ class DataChain(DatasetQuery):
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat

-        chain = DataChain.from_storage(path, **kwargs)
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )

         column_names = None
         if not header:
@@ -1660,6 +1708,8 @@ class DataChain(DatasetQuery):
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1672,6 +1722,8 @@ class DataChain(DatasetQuery):
             object_name : Created object column name.
             model_name : Generated model name.
             source : Whether to include info about the source file.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.

         Example:
             Reading a single file:
@@ -1684,7 +1736,9 @@ class DataChain(DatasetQuery):
             dc = DataChain.from_parquet("s3://mybucket/dir")
             ```
         """
-        chain = DataChain.from_storage(path, **kwargs)
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
         return chain.parse_tabular(
             output=output,
             object_name=object_name,
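
Note: `from_csv` and `from_parquet` now forward `session` and `settings` to the underlying `from_storage` call, so per-chain settings survive the listing step. A sketch (the settings keys are illustrative; see `datachain/lib/settings.py` for what is actually supported):

    from datachain.lib.dc import DataChain

    dc = DataChain.from_parquet(
        "s3://mybucket/dir",
        settings={"cache": True, "parallel": 4},  # illustrative settings dict
    )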
datachain/lib/file.py CHANGED
@@ -195,14 +195,15 @@ class File(DataModel):
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f

-        uid = self.get_uid()
-        client = self._catalog.get_client(self.source)
-        if self._caching_enabled:
-            client.download(uid, callback=self._download_cb)
-        with client.open_object(
-            uid, use_cache=self._caching_enabled, cb=self._download_cb
-        ) as f:
-            yield io.TextIOWrapper(f) if mode == "r" else f
+        else:
+            uid = self.get_uid()
+            client = self._catalog.get_client(self.source)
+            if self._caching_enabled:
+                client.download(uid, callback=self._download_cb)
+            with client.open_object(
+                uid, use_cache=self._caching_enabled, cb=self._download_cb
+            ) as f:
+                yield io.TextIOWrapper(f) if mode == "r" else f

     def read(self, length: int = -1):
         """Returns file contents."""
datachain/lib/image.py CHANGED
@@ -34,7 +34,7 @@ def convert_image(
         from transformers.image_processing_utils import BaseImageProcessor

         if isinstance(transform, BaseImageProcessor):
-            img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
+            img = torch.as_tensor(img.pixel_values[0]).clone().detach()  # type: ignore[assignment,attr-defined]
     except ImportError:
         pass
     if device:
datachain/lib/meta_formats.py CHANGED
@@ -1,15 +1,12 @@
-# pip install datamodel-code-generator
-# pip install jmespath
-#
 import csv
-import io
 import json
-import subprocess
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
-from typing import Any, Callable
+from pathlib import Path
+from typing import Callable

+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401

@@ -47,9 +44,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-        uid_str = str(generate_uuid()).replace(
-            "-", ""
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
@@ -70,33 +66,26 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
     data_string = json.dumps(json_object)
-    command = [
-        "datamodel-codegen",
-        "--input-file-type",
-        data_type,
-        "--class-name",
-        model_name,
-        "--base-class",
-        "datachain.lib.meta_formats.UserModel",
-    ]
-    try:
-        result = subprocess.run(  # noqa: S603
-            command,
-            input=data_string,
-            text=True,
-            capture_output=True,
-            check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-        model_output = (
-            result.stdout
-        )  # This will contain the output from datamodel-codegen
-    except subprocess.CalledProcessError as e:
-        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
-    print(f"{model_output}")
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+    epilogue = f"""
+DataModel.register({model_name})
+spec = {model_name}
+"""
+    return output.read_text() + epilogue


 #
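
Note: schema generation now calls `datamodel_code_generator` as a library instead of shelling out to the `datamodel-codegen` CLI. A standalone sketch of the same call (the sample JSON and class name are illustrative):

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    data_string = '{"name": "cat.jpg", "size": 1024}'
    input_file_type = {i.value: i for i in datamodel_code_generator.InputFileType}["json"]

    with tempfile.TemporaryDirectory() as tmpdir:
        output = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            data_string,
            input_file_type=input_file_type,
            output=output,
            class_name="SampleModel",
        )
        print(output.read_text())  # generated pydantic model source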
@@ -113,35 +102,25 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain

-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-        captured_output = io.StringIO()
-        current_stdout = sys.stdout
-        sys.stdout = captured_output
-        try:
-            chain = (
-                DataChain.from_storage(schema_from, type="text")
-                .limit(1)
-                .map(  # dummy column created (#1615)
-                    meta_schema=lambda file: read_schema(
-                        file, data_type=meta_type, expr=jmespath, model_name=model_name
-                    ),
-                    output=str,
-                )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
             )
-            chain.exec()
-        finally:
-            sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
-        local_vars: dict[str, Any] = {}
-        exec(model_output, globals(), local_vars)  # noqa: S102
-        spec = local_vars["spec"]
+        gl = globals()
+        exec(model_output, gl)  # type: ignore[arg-type]  # noqa: S102
+        spec = gl["spec"]

     if not (spec) and not (schema_from):
         raise ValueError(
datachain/lib/model_store.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import ClassVar, Optional

@@ -69,7 +70,11 @@ class ModelStore:

     @staticmethod
     def is_pydantic(val):
-        return not hasattr(val, "__origin__") and issubclass(val, BaseModel)
+        return (
+            not hasattr(val, "__origin__")
+            and inspect.isclass(val)
+            and issubclass(val, BaseModel)
+        )

     @staticmethod
     def to_pydantic(val) -> Optional[type[BaseModel]]:
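
Note: the added `inspect.isclass` check keeps `is_pydantic` from raising a `TypeError` when it is handed something that is not a class (`issubclass` requires a class as its first argument). A quick illustration mirroring the new check:

    import inspect

    from pydantic import BaseModel

    def is_pydantic(val):
        # mirrors ModelStore.is_pydantic after this change
        return (
            not hasattr(val, "__origin__")
            and inspect.isclass(val)
            and issubclass(val, BaseModel)
        )

    class Person(BaseModel):
        name: str = ""

    assert is_pydantic(Person) is True
    assert is_pydantic(Person(name="a")) is False  # instance, not a class
    assert is_pydantic("not a model") is False     # previously raised TypeError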
datachain/lib/text.py CHANGED
@@ -33,7 +33,7 @@ def convert_text(
     res = tokenizer(text)

     tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-    tokens = torch.tensor(tokens)
+    tokens = torch.as_tensor(tokens).clone().detach()
     if device:
         tokens = tokens.to(device)

datachain/lib/webdataset.py CHANGED
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import tarfile
+import warnings
 from collections.abc import Iterator, Sequence
 from pathlib import Path
 from typing import (
@@ -19,6 +20,18 @@ from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError

+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSAllFile" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+

 class WDSError(DataChainError):
     def __init__(self, tar_stream, message: str):
datachain/lib/webdataset_laion.py CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterator
 from typing import Optional

@@ -7,6 +8,18 @@ from pydantic import BaseModel, Field
 from datachain.lib.file import File
 from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass

+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSLaion" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+

 class Laion(WDSReadableSubclass):
     uid: str = Field(default="")
datachain/query/dataset.py CHANGED
@@ -1,6 +1,5 @@
 import contextlib
 import inspect
-import json
 import logging
 import os
 import random
@@ -37,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -1173,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None

-    def c(self, name: Union[C, str]) -> "ColumnClause[Any]":
-        col = sqlalchemy.column(name) if isinstance(name, str) else name
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col

@@ -1710,27 +1709,14 @@ class DatasetQuery:
         return self.__class__(name=name, version=version, catalog=self.catalog)


-def _get_output_fd_for_write() -> Union[str, int]:
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-        sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE)
+        return dataset_query

     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
@@ -1742,13 +1728,4 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(dataset, f)
     return dataset_query
datachain-0.3.10.dist-info/METADATA → datachain-0.3.12.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.10
+Version: 0.3.12
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -80,7 +80,6 @@ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
 Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
 Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
 Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
-Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
 Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
 Requires-Dist: virtualenv ; extra == 'tests'
 Requires-Dist: dulwich ; extra == 'tests'
@@ -96,8 +95,14 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'

+================
+|logo| DataChain
+================
+
 |PyPI| |Python Version| |Codecov| |Tests|

+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
    :target: https://pypi.org/project/datachain/
    :alt: PyPI
@@ -111,9 +116,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-AI 🔗 DataChain
-----------------
-
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
 your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
datachain-0.3.10.dist-info/RECORD → datachain-0.3.12.dist-info/RECORD CHANGED
@@ -6,8 +6,8 @@ datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
-datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
-datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
+datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
 datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
@@ -17,10 +17,9 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=VO-Otcg3QLbb3E9H8gmgu-xJWQqIbWmLP2QyPg8cUos,75386
+datachain/catalog/catalog.py,sha256=xVFNUZ339u2l58ZyPaiJ6GsRRpwqq0LYUbdOHC-Otog,69654
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -40,27 +39,27 @@ datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9m
 datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
-datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
+datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=TOC5-Ar8GQBkFpWkxVeg1og_iCJt_c0FCqA8IGzUrAk,66929
-datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
+datachain/lib/dc.py,sha256=gYRkrriG5RJxgLpOUccDU8DFRSoeWZjgmJwHfUo_z7w,68731
+datachain/lib/file.py,sha256=tNb3rJyRYGxpOc6XxcZjIQ9yVHKc7WLAOKoTYqp2TB0,11475
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
-datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
+datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=0YM7PMcGSLpUKZppyzFi8RvoSwYOqbciFGvzkvYdTXA,7133
-datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
+datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
+datachain/lib/model_store.py,sha256=xcrQ69-jcQs716U4UFOSoSKM7EvFIWqxlPhIcE4X7oI,2497
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
-datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
+datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=Q3UlCk66341sq-nvFbBCX4Cv3cYXBK9n12ejG4axPXE,8298
-datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
+datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
@@ -70,7 +69,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=v5gCAWswv6DoEWkN7DuOc7BL4Afz8p5ZSA_GNxn5_R4,59056
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.10.dist-info/METADATA,sha256=eUsgu4Y4iK_rJbx66MCmeKuPaWS1iMKRL6mtbEB6ucY,17056
-datachain-0.3.10.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-datachain-0.3.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.10.dist-info/RECORD,,
+datachain-0.3.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.12.dist-info/METADATA,sha256=I_Yz0lbiCk4KWv026U7zpDGrU72G575Hd_OnE_seb1k,17073
+datachain-0.3.12.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.12.dist-info/RECORD,,
datachain/catalog/subclass.py DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes