datachain 0.2.18__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

datachain/lib/file.py CHANGED
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -111,8 +111,7 @@ class File(DataModel):
     """`DataModel` for reading binary files."""

     source: str = Field(default="")
-    parent: str = Field(default="")
-    name: str
+    path: str
     size: int = Field(default=0)
     version: str = Field(default="")
     etag: str = Field(default="")
@@ -123,8 +122,7 @@ class File(DataModel):

     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
-        "parent": String,
-        "name": String,
+        "path": String,
         "size": Int,
         "version": String,
         "etag": String,
@@ -136,8 +134,7 @@ class File(DataModel):

     _unique_id_keys: ClassVar[list[str]] = [
         "source",
-        "parent",
-        "name",
+        "path",
         "size",
         "etag",
         "version",
@@ -168,11 +165,9 @@ class File(DataModel):
     def validate_location(cls, v):
         return File._validate_dict(v)

-    @field_validator("parent", mode="before")
+    @field_validator("path", mode="before")
     @classmethod
     def validate_path(cls, path):
-        if path == "":
-            return ""
         return Path(path).as_posix()

     def model_dump_custom(self):
@@ -185,6 +180,14 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled = False

+    @property
+    def name(self):
+        return PurePosixPath(self.path).name
+
+    @property
+    def parent(self):
+        return str(PurePosixPath(self.path).parent)
+
     @contextmanager
     def open(self, mode: Literal["rb", "r"] = "rb"):
         """Open the file and return a file object."""
@@ -261,19 +264,19 @@ class File(DataModel):

     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
-        return Path(self.name).suffix
+        return PurePosixPath(self.path).suffix

     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return Path(self.name).suffix.strip(".")
+        return PurePosixPath(self.path).suffix.strip(".")

     def get_file_stem(self):
         """Returns file name without extension."""
-        return Path(self.name).stem
+        return PurePosixPath(self.path).stem

     def get_full_name(self):
         """Returns name with parent directories."""
-        return (Path(self.parent) / self.name).as_posix()
+        return self.path

     def get_uri(self):
         """Returns file URI."""
@@ -355,8 +358,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):

     def get_file_type(
         source: str,
-        parent: str,
-        name: str,
+        path: str,
         size: int,
         version: str,
         etag: str,
@@ -367,8 +369,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
     ) -> file:  # type: ignore[valid-type]
         return file(
             source=source,
-            parent=parent,
-            name=name,
+            path=path,
             size=size,
             version=version,
             etag=etag,
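
Note (illustrative, not part of the diff): in 0.3.x a File is addressed by a single path field, and name/parent become derived properties. A minimal sketch of the new shape, assuming a datachain 0.3.1 install:

    from datachain.lib.file import File

    f = File(source="s3://bucket", path="dir/sub/cat.jpg")
    assert f.name == "cat.jpg"                      # PurePosixPath(path).name
    assert f.parent == "dir/sub"                    # str(PurePosixPath(path).parent)
    assert f.get_file_ext() == "jpg"                # extension now computed from path
    assert f.get_full_name() == "dir/sub/cat.jpg"   # get_full_name() simply returns path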
datachain/lib/udf.py CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional

 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import RowBatch
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties, UDFResult
+from datachain.query.udf import UDFProperties

 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self

     from datachain.catalog import Catalog
-    from datachain.query.batch import BatchingResult
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult


 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):

     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable["UDFResult"]]:
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()

-        for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )

         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
@@ -119,7 +119,7 @@ class Builder:
         return self._tar.extractfile(item).read().decode(self._encoding)

     def add(self, file: tarfile.TarInfo):
-        fstream = File(name=file.name)
+        fstream = File(path=file.name)
         ext = fstream.get_file_ext()
         stem = fstream.get_file_stem()

@@ -176,9 +176,8 @@ class Builder:
         )
         etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
         return File(
-            name=core_file.name,
             source=self._tar_stream.source,
-            parent=new_parent,
+            path=f"{new_parent}/{core_file.name}",
             version=self._tar_stream.version,
             size=core_file.size,
             etag=etag,
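
Note (illustrative, not part of the diff): UDFAdapter.run() now also receives the selected column names (udf_fields) and delegates batching to the base class, while keeping the optional setup()/teardown() hooks on the wrapped UDF. A generic sketch of that lifecycle, using hypothetical Adapter/process names rather than datachain APIs:

    from collections.abc import Iterable, Iterator

    class Adapter:
        def __init__(self, inner):
            self.inner = inner

        def run(self, rows: Iterable[dict]) -> Iterator[dict]:
            # optional hook, called once before any row is processed
            if hasattr(self.inner, "setup") and callable(self.inner.setup):
                self.inner.setup()
            yield from (self.inner.process(row) for row in rows)
            # optional hook, called once after all rows are processed
            if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
                self.inner.teardown()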
datachain/listing.py CHANGED
@@ -5,11 +5,12 @@ from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional

 from fsspec.asyn import get_loop, sync
-from sqlalchemy import Column, case
+from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm

 from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number

 if TYPE_CHECKING:
@@ -129,7 +130,7 @@ class Listing:
            dir_path = []
            if not copy_dir_contents:
                dir_path.append(node.name)
-            subtree_nodes = src.find(sort=["parent", "name"])
+            subtree_nodes = src.find(sort=["path"])
            all_nodes.extend(
                NodeWithPath(n.n, path=dir_path + n.path) for n in subtree_nodes
            )
@@ -148,8 +149,7 @@ class Listing:
            elif from_dataset:
                node_path = [
                    src.listing.client.name,
-                    node.parent,
-                    node.name,
+                    node.path,
                ]
            else:
                node_path = [node.name]
@@ -201,25 +201,19 @@ class Listing:
        dr = self.dataset_rows
        conds = []
        if names:
-            f = Column("name").op("GLOB")
-            conds.extend(f(name) for name in names)
+            for name in names:
+                conds.append(pathfunc.name(Column("path")).op("GLOB")(name))
        if inames:
-            f = func.lower(Column("name")).op("GLOB")
-            conds.extend(f(iname.lower()) for iname in inames)
+            for iname in inames:
+                conds.append(
+                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(iname.lower())
+                )
        if paths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = node_path.op("GLOB")
-            conds.extend(f(path) for path in paths)
+            for path in paths:
+                conds.append(Column("path").op("GLOB")(path))
        if ipaths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = func.lower(node_path).op("GLOB")
-            conds.extend(f(ipath.lower()) for ipath in ipaths)
+            for ipath in ipaths:
+                conds.append(func.lower(Column("path")).op("GLOB")(ipath.lower()))

        if size is not None:
            size_limit = suffix_to_number(size)
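
Note (illustrative, not part of the diff): name and path filters are now expressed against the single path column. A minimal sketch of the kind of condition built above, using plain SQLAlchemy (pathfunc.name is datachain's own SQL helper and is left out here):

    import sqlalchemy as sa

    path = sa.column("path")
    # case-insensitive GLOB on the full path, as in the ipaths branch above
    cond = sa.func.lower(path).op("GLOB")("*/train/*.jpg")
    print(cond)  # renders roughly as: lower(path) GLOB :lower_1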
datachain/node.py CHANGED
@@ -50,8 +50,7 @@ class Node:
     sys__rand: int = -1
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: Optional[str] = None
     is_latest: bool = True
@@ -62,10 +61,6 @@ class Node:
     location: Optional[str] = None
     source: StorageURI = StorageURI("")

-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def is_dir(self) -> bool:
         return self.dir_type == DirType.DIR
@@ -107,13 +102,12 @@ class Node:
             return self.path + "/"
         return self.path

-    def as_uid(self, storage: Optional[StorageURI] = None):
+    def as_uid(self, storage: Optional[StorageURI] = None) -> UniqueId:
         if storage is None:
             storage = self.source
         return UniqueId(
             storage=storage,
-            parent=self.parent,
-            name=self.name,
+            path=self.path,
             size=self.size,
             version=self.version or "",
             etag=self.etag,
@@ -129,20 +123,30 @@ class Node:
         return cls(**kw)

     @classmethod
-    def from_dir(cls, parent, name, **kwargs) -> "Node":
-        return cls(sys__id=-1, dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path, **kwargs) -> "Node":
+        return cls(sys__id=-1, dir_type=DirType.DIR, path=path, **kwargs)

     @classmethod
     def root(cls) -> "Node":
         return cls(sys__id=-1, dir_type=DirType.DIR)

+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+

 @attrs.define
 class Entry:
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: str = ""
     is_latest: bool = True
@@ -157,27 +161,34 @@ class Entry:
         return self.dir_type == DirType.DIR

     @classmethod
-    def from_dir(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.DIR, path=path, **kwargs)

     @classmethod
-    def from_file(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.FILE, parent=parent, name=name, **kwargs)
+    def from_file(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.FILE, path=path, **kwargs)

     @classmethod
     def root(cls):
         return cls(dir_type=DirType.DIR)

-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
             return self.path + "/"
         return self.path

+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+

 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
datachain/query/batch.py CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union

-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE

 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict


 @dataclass
-class RowBatch:
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]


-BatchingResult = Union["RowDict", RowBatch]
+UDFInput = Union["RowDict", UDFInputBatch]


 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[BatchingResult, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""


@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):

     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator["RowDict", None, None]:
-        return execute(query, limit=query._limit, order_by=query._order_by_clauses)
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)


 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):

     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count

         # select rows in batches
-        results: list[RowDict] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowBatch(batch)
+                    yield RowsOutputBatch(batch)

         if len(results) > 0:
-            yield RowBatch(results)
+            yield RowsOutputBatch(results)


 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):

     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         current_partition: Optional[int] = None
-        batch: list[RowDict] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
-                limit=query._limit,
-            )
-        ) as rows:
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
             for row in rows:
-                partition = row[PARTITION_COLUMN_ID]
+                partition = row[partition_column_idx]
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield RowBatch(batch)
+                        yield RowsOutputBatch(batch)
                         batch = []
                 batch.append(row)

         if len(batch) > 0:
-            yield RowBatch(batch)
+            yield RowsOutputBatch(batch)
@@ -20,8 +20,7 @@ def load_tar(raw):
 @udf(
     (
         C.source,
-        C.name,
-        C.parent,
+        C.path,
         C.size,
         C.vtype,
         C.dir_type,
@@ -37,8 +36,7 @@ def load_tar(raw):
 )
 def index_tar(
     source,
-    name,
-    parent,
+    parent_path,
     size,
     vtype,
     dir_type,
@@ -52,9 +50,8 @@ def index_tar(
 ):
     # generate original tar files as well, along with subobjects
     yield DatasetRow.create(
-        name,
         source=source,
-        parent=parent,
+        path=parent_path,
         size=size,
         vtype=vtype,
         dir_type=dir_type,
@@ -66,15 +63,12 @@ def index_tar(
         etag=etag,
     )

-    parent_path = name if not parent else f"{parent}/{name}"
     for info in tar_entries:
         if info.isfile():
             full_path = f"{parent_path}/{info.name}"
-            parent_dir, subobject_name = full_path.rsplit("/", 1)
             yield DatasetRow.create(
-                subobject_name,
                 source=source,
-                parent=parent_dir,
+                path=full_path,
                 size=info.size,
                 vtype="tar",
                 location={
@@ -83,8 +77,7 @@ def index_tar(
                     "size": info.size,
                     "parent": {
                         "source": source,
-                        "parent": parent,
-                        "name": name,
+                        "path": parent_path,
                         "version": version,
                         "size": size,
                         "etag": etag,
@@ -307,7 +307,7 @@ class Subtract(DatasetDiffOperation):
 class Changed(DatasetDiffOperation):
     """
     Calculates rows that are changed in a source query compared to target query
-    Changed means it has same source + parent + name but different last_modified
+    Changed means it has same source + path but different last_modified
     Example:
         >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
         >>> ds_updated = (
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):

         processes = determine_processes(self.parallel)

+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@ class UDFStep(Step, ABC):
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@ class UDFStep(Step, ABC):
                 "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                 "table": udf_table,
                 "query": query,
+                "udf_fields": udf_fields,
                 "batching": batching,
                 "processes": processes,
                 "is_generator": self.is_generator,
@@ -528,6 +532,7 @@ class UDFStep(Step, ABC):
         generated_cb = get_generated_callback(self.is_generator)
         try:
             udf_results = udf.run(
+                udf_fields,
                 udf_inputs,
                 self.catalog,
                 self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]

-            def row_iter() -> Generator[RowDict, None, None]:
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows

-            async def get_params(row: RowDict) -> tuple:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(self.catalog, row, mapper, **kwargs)
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                         for p in actual_params
                     ]
                 )
@@ -1526,7 +1533,7 @@ class DatasetQuery:

     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=["source", "parent", "name"])
+        return self._subtract(dq, on=["source", "path"])

     @detach
     def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
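
Note (illustrative, not part of the diff): Partition batching now consumes plain row tuples, locating the partition column by position in the selected fields and grouping consecutive rows with the same partition value. A standalone sketch of that grouping, with hypothetical names and assuming rows arrive ordered by the partition column:

    from collections.abc import Iterator, Sequence

    def partition_batches(
        rows: Iterator[Sequence], fields: Sequence[str], partition_col: str
    ) -> Iterator[list[Sequence]]:
        idx = list(fields).index(partition_col)  # positional lookup, like partition_column_idx above
        current: object = object()               # sentinel so the first row starts a new batch
        batch: list[Sequence] = []
        for row in rows:
            if row[idx] != current:
                if batch:
                    yield batch
                current, batch = row[idx], []
            batch.append(row)
        if batch:
            yield batch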