datachain 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff shows the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release. This version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -9,7 +9,6 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -77,7 +76,6 @@ from datachain.utils import (
 )
 
 from .datasource import DataSource
-from .subclass import SubclassFinder
 
 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +90,6 @@ logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]
 
 TTL_INT = 4 * 60 * 60
 
@@ -569,12 +566,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -658,33 +649,12 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast
 
-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
+    def compile_query_script(self, script: str) -> str:
         code_ast = ast.parse(script)
         code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
+        return ast.unparse(code_ast)
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
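
Note: with the SubclassFinder pass gone, compile_query_script reduces to
parse -> wrap last expression -> unparse. A minimal sketch of the new
behavior (wrap_last_expression is an illustrative name, not datachain API;
query_wrapper is resolved at runtime by the generated script):

    import ast

    def wrap_last_expression(script: str) -> str:
        # Parse the user script and, if its last top-level statement is a
        # bare expression, wrap it in query_wrapper(...). Since 0.3.11 a
        # non-expression last statement is left alone instead of raising.
        tree = ast.parse(script)
        last = tree.body[-1]
        if isinstance(last, ast.Expr):
            call = ast.Call(
                func=ast.Name(id="query_wrapper", ctx=ast.Load()),
                args=[last.value],
                keywords=[],
            )
            tree.body[-1] = ast.Expr(value=call)
        return ast.unparse(ast.fix_missing_locations(tree))

    print(wrap_last_expression("import os\nds"))  # import os / query_wrapper(ds)
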
@@ -1416,7 +1386,8 @@ class Catalog:
 
         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None) for v in d.versions
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )
 
     def ls_dataset_rows(
@@ -1864,29 +1835,25 @@ class Catalog:
                 C.size > 1000
             )
         """
-
-        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
-            dir=os.getcwd(), suffix=".py", delete=False
-        )
-        _, feature_module = os.path.split(feature_file.name)
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                save,
-                job_id,
+        if not job_id:
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            job_id = self.metastore.create_job(
+                name="",
+                query=query_script,
+                params=params,
+                python_version=python_version,
             )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)
 
+        lines, proc = self.run_query(
+            python_executable or sys.executable,
+            query_script,
+            envs,
+            capture_output,
+            output_hook,
+            params,
+            save,
+            job_id,
+        )
         output = "".join(lines)
 
         if proc.returncode:
@@ -1896,82 +1863,69 @@ class Catalog:
                 return_code=proc.returncode,
                 output=output,
             )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
             raise QueryScriptRunError(
                 f"Query script exited with error code {proc.returncode}",
                 return_code=proc.returncode,
                 output=output,
             )
 
+        def _get_dataset_versions_by_job_id():
+            for dr, dv, job in self.list_datasets_versions():
+                if job and str(job.id) == job_id:
+                    yield dr, dv
+
         try:
-            result = json.loads(response_text)
-        except ValueError:
-            result = None
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save:
-            dataset, version = self.save_result(
-                query_script, result, output, version, job_id
+            dr, dv = max(
+                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
             )
+        except ValueError as e:
+            if not save:
+                return QueryResult(dataset=None, version=None, output=output)
+
+            raise QueryScriptDatasetNotFound(
+                "No dataset found after running Query script",
+                output=output,
+            ) from e
 
-        return QueryResult(dataset=dataset, version=version, output=output)
+        dr = self.update_dataset(
+            dr,
+            script_output=output,
+            query_script=query_script,
+        )
+        self.update_dataset_version_with_warehouse_info(
+            dr,
+            dv.version,
+            script_output=output,
+            query_script=query_script,
+            job_id=job_id,
+            is_job_result=True,
+        )
+        return QueryResult(dataset=dr, version=dv.version, output=output)
 
     def run_query(
         self,
         python_executable: str,
         query_script: str,
         envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
         capture_output: bool,
-        feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
         save: bool,
         job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
+    ) -> tuple[list[str], subprocess.Popen]:
         try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
+            query_script_compiled = self.compile_query_script(query_script)
         except Exception as exc:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
-        else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
         envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
         envs.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
@@ -1982,52 +1936,12 @@ class Catalog:
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
             text=False,
-            **kwargs,
         ) as proc:
-            os.close(w)
-
             out = proc.stdout
             _lines: list[str] = []
             ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-            with ctx as lines, open(r) as f:
-                response_text = ""
-                while proc.poll() is None:
-                    response_text += f.readline()
-                    time.sleep(0.1)
-                response_text += f.readline()
-            return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            )
-        name, version = exec_result
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
+            with ctx as lines:
+                return lines, proc
 
     def cp(
         self,
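
Note: the pipe/file-descriptor handshake removed above is replaced by a job-id
lookup: a job row is created before the script launches, and afterwards the
newest dataset version recorded under that job wins. A self-contained sketch of
the selection logic (Version is a stand-in for the records returned by
list_datasets_versions()):

    from dataclasses import dataclass
    from datetime import datetime

    @dataclass
    class Version:  # stand-in for datachain's dataset-version record
        version: int
        job_id: str
        created_at: datetime

    def latest_version_for_job(versions: list[Version], job_id: str) -> Version:
        # Keep the versions created by this job, then take the newest one;
        # max() raises ValueError when nothing matched, mirroring the diff.
        matching = (v for v in versions if str(v.job_id) == job_id)
        return max(matching, key=lambda v: v.created_at)

    vs = [
        Version(1, "job-1", datetime(2024, 8, 1)),
        Version(2, "job-1", datetime(2024, 8, 2)),
    ]
    assert latest_version_for_job(vs, "job-1").version == 2
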
datachain/job.py CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
            name,
            status,
            created_at,
datachain/lib/dc.py CHANGED
@@ -56,7 +56,7 @@ from datachain.query.dataset import (
     PartitionByType,
     detach,
 )
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook
 
@@ -112,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
 class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(self, on: Sequence[str], right_on: Optional[Sequence[str]], msg: str):  # noqa: D107
-        on_str = ", ".join(on) if isinstance(on, Sequence) else ""
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
         right_on_str = (
-            ", right_on='" + ", ".join(right_on) + "'"
+            ", right_on='" + _get_str(right_on) + "'"
             if right_on and isinstance(right_on, Sequence)
             else ""
         )
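
Note: _get_merge_error_str keeps merge errors readable when a join key is a SQL
construct rather than a plain signal name. Roughly, and assuming
DEFAULT_DELIMITER is the "__" used for flattened signal names:

    import sqlalchemy

    # Table columns map the delimiter back to dots; other ColumnElements
    # with a .name render as "<name> expression".
    print(sqlalchemy.Column("file__path").name.replace("__", "."))  # file.path
    print(f"{(sqlalchemy.column('size') * 2).label('half').name} expression")
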
@@ -139,7 +159,7 @@ class Sys(DataModel):
 
 
 class DataChain(DatasetQuery):
-    """AI 🔗 DataChain - a data structure for batch data processing and evaluation.
+    """DataChain - a data structure for batch data processing and evaluation.
 
     It represents a sequence of data manipulation steps such as reading data from
     storages, running AI or LLM models or calling external services API to validate or
@@ -252,13 +272,24 @@ class DataChain(DatasetQuery):
         """Returns Column instance with a type if name is found in current schema,
         otherwise raises an exception.
         """
-        name_path = name.split(".")
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
         for path, type_, _, _ in self.signals_schema.get_flat_tree():
             if path == name_path:
                 return Column(name, python_to_sql(type_))
 
         raise ValueError(f"Column with name {name} not found in the schema")
 
+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
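
Note: column() now accepts both the user-facing dotted form and the flattened
database form of a nested signal. The resolution order can be shown standalone
(DEFAULT_DELIMITER = "__" is an assumption about datachain.query.schema):

    DEFAULT_DELIMITER = "__"  # assumed flattening delimiter

    def split_signal_name(name: str) -> list[str]:
        # Same order as DataChain.column(): dotted names win, then the
        # flattened db form, then a plain single-part name.
        if "." in name:
            return name.split(".")
        if DEFAULT_DELIMITER in name:
            return name.split(DEFAULT_DELIMITER)
        return [name]

    assert split_signal_name("file.path") == ["file", "path"]
    assert split_signal_name("file__path") == ["file", "path"]
    assert split_signal_name("id") == ["id"]
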
@@ -1140,8 +1171,17 @@ class DataChain(DatasetQuery):
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[str, Sequence[str]],
-        right_on: Union[str, Sequence[str], None] = None,
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
         inner=False,
         rname="right_",
     ) -> "Self":
@@ -1166,7 +1206,7 @@ class DataChain(DatasetQuery):
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
 
-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
             on = [on]
         elif not isinstance(on, Sequence):
             raise DatasetMergeError(
@@ -1175,19 +1215,15 @@ class DataChain(DatasetQuery):
                 f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
             )
 
-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                 right_on = [right_on]
             elif not isinstance(right_on, Sequence):
                 raise DatasetMergeError(
                     on,
                     right_on,
                     "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                 )
 
             if len(right_on) != len(on):
@@ -1195,34 +1231,39 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )
 
-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
         if self == right_ds:
             right_ds = right_ds.clone(new_table=True)
 
+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
         ops = [
-            self.c(left) == right_ds.c(right)
-            for left, right in zip(on_columns, right_on_columns)
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
         ]
 
+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
         ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")
 
         ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
             right_signals_schema, rname
         )
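
Note: with the widened signature, join keys may be signal names, bound Column
objects (via the new c()), or arbitrary SQL expressions, and unresolvable keys
now surface as one "Could not resolve ... in left/right" error. A usage sketch
under assumed signal names and a default local session:

    from datachain.lib.dc import DataChain

    left = DataChain.from_values(id=[1, 2], name=["a", "b"])
    right = DataChain.from_values(id=[2, 3], size=[10, 20])

    merged = left.merge(right, on="id")                 # by name, as before
    merged = left.merge(right, on=left.c("id"))         # new: bound Column
    merged = left.merge(right, on="id", right_on="id")  # explicit right key
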
datachain/lib/meta_formats.py CHANGED
@@ -2,14 +2,14 @@
 # pip install jmespath
 #
 import csv
-import io
 import json
-import subprocess
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
+from pathlib import Path
 from typing import Any, Callable
 
+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -47,9 +47,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-        uid_str = str(generate_uuid()).replace(
-            "-", ""
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
@@ -70,33 +69,27 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
     data_string = json.dumps(json_object)
-    command = [
-        "datamodel-codegen",
-        "--input-file-type",
-        data_type,
-        "--class-name",
-        model_name,
-        "--base-class",
-        "datachain.lib.meta_formats.UserModel",
-    ]
-    try:
-        result = subprocess.run(  # noqa: S603
-            command,
-            input=data_string,
-            text=True,
-            capture_output=True,
-            check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-        model_output = (
-            result.stdout
-        )  # This will contain the output from datamodel-codegen
-    except subprocess.CalledProcessError as e:
-        model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
-    print(f"{model_output}")
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+        epilogue = f"""
+{model_name}.model_rebuild()
+DataModel.register({model_name})
+spec = {model_name}
+"""
+        return output.read_text() + epilogue
 
 
 #
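
Note: schema generation now calls datamodel-code-generator as a library rather
than shelling out to the CLI. The same call can be exercised standalone; a
small sketch (the sample JSON and class name are made up):

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    sample = '{"name": "cat.jpg", "size": 1024}'
    with tempfile.TemporaryDirectory() as tmpdir:
        out = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            sample,
            input_file_type=datamodel_code_generator.InputFileType.Json,
            output=out,
            class_name="Example",
        )
        print(out.read_text())  # a pydantic model named Example
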
@@ -113,34 +106,24 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-        captured_output = io.StringIO()
-        current_stdout = sys.stdout
-        sys.stdout = captured_output
-        try:
-            chain = (
-                DataChain.from_storage(schema_from, type="text")
-                .limit(1)
-                .map(  # dummy column created (#1615)
-                    meta_schema=lambda file: read_schema(
-                        file, data_type=meta_type, expr=jmespath, model_name=model_name
-                    ),
-                    output=str,
-                )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
             )
-            chain.exec()
-        finally:
-            sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         local_vars: dict[str, Any] = {}
-        exec(model_output, globals(), local_vars)  # noqa: S102
+        exec(model_output, globals(), local_vars)  # type: ignore[arg-type] # noqa: S102
         spec = local_vars["spec"]
 
     if not (spec) and not (schema_from):
datachain/query/dataset.py CHANGED
@@ -1,6 +1,5 @@
 import contextlib
 import inspect
-import json
 import logging
 import os
 import random
@@ -37,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -1173,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None
 
-    def c(self, name: Union[C, str]) -> "ColumnClause[Any]":
-        col = sqlalchemy.column(name) if isinstance(name, str) else name
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col
 
@@ -1710,27 +1709,14 @@ class DatasetQuery:
     return self.__class__(name=name, version=version, catalog=self.catalog)
 
 
-def _get_output_fd_for_write() -> Union[str, int]:
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-        sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE)
+        return dataset_query
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
@@ -1742,13 +1728,4 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(dataset, f)
     return dataset_query
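
Note: with DATACHAIN_OUTPUT_FD removed, the catalog-to-script contract is
environment variables only. A query script (or a debugging wrapper) can read
them with the names visible in the diffs above:

    import json
    import os

    params = json.loads(os.getenv("DATACHAIN_QUERY_PARAMS", "{}"))
    save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))  # "1" or ""
    job_id = os.getenv("DATACHAIN_JOB_ID") or None  # set by Catalog before launch
    print(params, save, job_id)
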
datachain-0.3.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.10
+Version: 0.3.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -96,6 +96,10 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'
 
+.. image:: docs/assets/datachain_logotype.svg
+   :height: 48
+   :alt: DataChain logo
+
 |PyPI| |Python Version| |Codecov| |Tests|
 
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
@@ -111,7 +115,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
 
-AI 🔗 DataChain
 ----------------
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
datachain-0.3.11.dist-info/RECORD CHANGED
@@ -7,7 +7,7 @@ datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
-datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
 datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
@@ -17,10 +17,9 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=VO-Otcg3QLbb3E9H8gmgu-xJWQqIbWmLP2QyPg8cUos,75386
+datachain/catalog/catalog.py,sha256=NgS7_SlmpJdUSp1v8KdCuLTjFklmYvT_jOLdzTyyK5I,72313
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -43,13 +42,13 @@ datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=TOC5-Ar8GQBkFpWkxVeg1og_iCJt_c0FCqA8IGzUrAk,66929
+datachain/lib/dc.py,sha256=s4E-bD6_T6JFJ7TEa5Y9RS705lIfcV9OUJwDD6RNCX0,68156
 datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=0YM7PMcGSLpUKZppyzFi8RvoSwYOqbciFGvzkvYdTXA,7133
+datachain/lib/meta_formats.py,sha256=67uF9trQ2II6xFvN0u6eo5NNRf5xvCkpMHj7ThiG41Y,6777
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
@@ -70,7 +69,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=v5gCAWswv6DoEWkN7DuOc7BL4Afz8p5ZSA_GNxn5_R4,59056
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.10.dist-info/METADATA,sha256=eUsgu4Y4iK_rJbx66MCmeKuPaWS1iMKRL6mtbEB6ucY,17056
-datachain-0.3.10.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-datachain-0.3.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.10.dist-info/RECORD,,
+datachain-0.3.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.11.dist-info/METADATA,sha256=iSdfjWpVT1Iqzlg82eN5QzJ-icaYxkG7TUKEpEOi5sk,17124
+datachain-0.3.11.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.11.dist-info/RECORD,,
datachain/catalog/subclass.py DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes