datachain 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -676,7 +676,7 @@ class Catalog:
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
-        return Client.parse_url(uri, self.metastore, self.cache, **config)
+        return Client.parse_url(uri, self.cache, **config)
 
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
datachain/client/fsspec.py CHANGED
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
 if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem
 
-    from datachain.data_storage import AbstractMetastore
 
 logger = logging.getLogger("datachain")
 
@@ -116,13 +115,12 @@ class Client(ABC):
     @staticmethod
     def parse_url(
         source: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         **kwargs,
     ) -> tuple["Client", str]:
         cls = Client.get_implementation(source)
         storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, metastore, cache, kwargs)
+        client = cls.from_name(storage_url, cache, kwargs)
         return client, rel_path
 
     @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        metastore: "AbstractMetastore",
        cache: DataChainCache,
        kwargs: dict[str, Any],
    ) -> "Client":
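A hedged sketch of the 0.3.1 call shape implied by the signatures above; `catalog` is assumed to be an existing Catalog providing `cache` and `client_config`, and the URI is a hypothetical example:

from datachain.client import Client

# metastore is no longer part of the signature; only the cache is required
client, rel_path = Client.parse_url(
    "s3://bucket/images/cats/",
    catalog.cache,
    **catalog.client_config,
)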
datachain/client/local.py CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import Any
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI
 
 from .fsspec import Client
 
-if TYPE_CHECKING:
-    from datachain.data_storage import AbstractMetastore
-
 
 class FileClient(Client):
     FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
         return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
 
     @classmethod
-    def from_name(
-        cls, name: str, metastore: "AbstractMetastore", cache, kwargs
-    ) -> "FileClient":
+    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
datachain/data_storage/warehouse.py CHANGED
@@ -17,7 +17,7 @@ from sqlalchemy.sql.expression import true
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord, RowDict
+from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
@@ -201,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_select_paginated(
         self,
         query,
-        limit: Optional[int] = None,
-        order_by: tuple["ColumnElement[Any]", ...] = (),
         page_size: int = SELECT_BATCH_SIZE,
-    ) -> Generator[RowDict, None, None]:
+    ) -> Generator[Sequence, None, None]:
         """
         This is equivalent to `db.execute`, but for selecting rows in batches
         """
-        cols = query.selected_columns
-        cols_names = [c.name for c in cols]
+        limit = query._limit
+        paginated_query = query.limit(page_size)
 
-        if not order_by:
-            ordering = [cols.sys__id]
-        else:
-            ordering = order_by  # type: ignore[assignment]
-
-        # reset query order by and apply new order by id
-        paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
+        if not paginated_query._order_by_clauses:
+            # default order by is order by `sys__id`
+            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
 
         results = None
         offset = 0
@@ -236,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
             processed = False
             for row in results:
                 processed = True
-                yield RowDict(zip(cols_names, row))
+                yield row
                 num_yielded += 1
 
             if not processed:
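Sketch of consuming the simplified pagination API above: it now honors the query's own limit and ordering and yields plain row tuples rather than RowDict mappings. Assumes a ready `warehouse` instance and a SQLAlchemy select whose columns include `sys__id`:

import contextlib

with contextlib.closing(warehouse.dataset_select_paginated(query, page_size=1000)) as rows:
    for row in rows:
        # each row is a plain sequence of column values, not a RowDict
        ...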
datachain/lib/dc.py CHANGED
@@ -1623,7 +1623,7 @@ class DataChain(DatasetQuery):
 
         Using glob to match patterns
         ```py
-        dc.filter(C("file.name").glob("*.jpg))
+        dc.filter(C("file.name").glob("*.jpg"))
         ```
 
         Using `datachain.sql.functions`
datachain/lib/udf.py CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import RowBatch
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties, UDFResult
+from datachain.query.udf import UDFProperties
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
-    from datachain.query.batch import BatchingResult
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult
 
 
 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
 
     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable["UDFResult"]]:
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()
 
-        for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/query/batch.py CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 
 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict
 
 
 @dataclass
-class RowBatch:
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]
 
 
-BatchingResult = Union["RowDict", RowBatch]
+UDFInput = Union["RowDict", UDFInputBatch]
 
 
 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[BatchingResult, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
 
     def __call__(
        self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator["RowDict", None, None]:
-        return execute(query, limit=query._limit, order_by=query._order_by_clauses)
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)
 
 
 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@
 
     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list[RowDict] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowBatch(batch)
+                    yield RowsOutputBatch(batch)
 
         if len(results) > 0:
-            yield RowBatch(results)
+            yield RowsOutputBatch(results)
 
 
 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@
 
     def __call__(
         self,
-        execute: Callable,
-        query: sa.sql.selectable.Select,
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         current_partition: Optional[int] = None
-        batch: list[RowDict] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
-                limit=query._limit,
-            )
-        ) as rows:
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
            for row in rows:
-                partition = row[PARTITION_COLUMN_ID]
+                partition = row[partition_column_idx]
                if current_partition != partition:
                    current_partition = partition
                    if len(batch) > 0:
-                        yield RowBatch(batch)
+                        yield RowsOutputBatch(batch)
                        batch = []
                batch.append(row)
 
        if len(batch) > 0:
-            yield RowBatch(batch)
+            yield RowsOutputBatch(batch)
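To make the renamed types concrete: `RowsOutputBatch` groups the plain row tuples coming out of the warehouse, while `UDFInputBatch` (defined above) carries `RowDict` mappings destined for the UDF. A minimal sketch with hypothetical sample rows:

from datachain.query.batch import RowsOutput, RowsOutputBatch

raw: RowsOutput = RowsOutputBatch(rows=[(1, "a.jpg"), (2, "b.jpg")])  # batched plain tuples
single: RowsOutput = (3, "c.jpg")                                     # unbatched single row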
datachain/query/dataset.py CHANGED
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
 
         processes = determine_processes(self.parallel)
 
+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@
                     "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                     "table": udf_table,
                     "query": query,
+                    "udf_fields": udf_fields,
                     "batching": batching,
                    "processes": processes,
                    "is_generator": self.is_generator,
@@ -528,6 +532,7 @@
                generated_cb = get_generated_callback(self.is_generator)
                try:
                    udf_results = udf.run(
+                        udf_fields,
                        udf_inputs,
                        self.catalog,
                        self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]
 
-            def row_iter() -> Generator[RowDict, None, None]:
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows
 
-            async def get_params(row: RowDict) -> tuple:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(self.catalog, row, mapper, **kwargs)
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                        for p in actual_params
                    ]
                )
datachain/query/dispatch.py CHANGED
@@ -2,11 +2,8 @@ import contextlib
 from collections.abc import Iterator, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
-from queue import Empty, Full, Queue
 from sys import stdin
-from time import sleep
-from types import GeneratorType
-from typing import Any, Optional
+from typing import Optional
 
 import attrs
 import multiprocess
@@ -22,7 +19,16 @@ from datachain.query.dataset import (
     get_processed_callback,
     process_udf_outputs,
 )
+from datachain.query.queue import (
+    get_from_queue,
+    marshal,
+    msgpack_pack,
+    msgpack_unpack,
+    put_into_queue,
+    unmarshal,
+)
 from datachain.query.udf import UDFBase, UDFFactory, UDFResult
+from datachain.utils import batched_it
 
 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -44,44 +50,6 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:
     return n_workers
 
 
-# For more context on the get_from_queue and put_into_queue functions, see the
-# discussion here:
-# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
-# This problem is not exactly described by, but is also related to these Python issues:
-# https://github.com/python/cpython/issues/66587
-# https://github.com/python/cpython/issues/88628
-# https://github.com/python/cpython/issues/108645
-
-
-def get_from_queue(queue: Queue) -> Any:
-    """
-    Gets an item from a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be available, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            return queue.get_nowait()
-        except Empty:
-            sleep(0.01)
-
-
-def put_into_queue(queue: Queue, item: Any) -> None:
-    """
-    Puts an item into a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be queued, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            queue.put_nowait(item)
-            return
-        except Full:
-            sleep(0.01)
-
-
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
     udf_info = load(stdin.buffer)
@@ -100,8 +68,9 @@ def udf_entrypoint() -> int:
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
-        is_generator=udf_info.get("is_generator", False),
+        udf_fields=udf_info["udf_fields"],
         cache=udf_info["cache"],
+        is_generator=udf_info.get("is_generator", False),
     )
 
     query = udf_info["query"]
@@ -121,7 +90,7 @@ def udf_entrypoint() -> int:
    generated_cb = get_generated_callback(dispatch.is_generator)
    try:
        udf_results = dispatch.run_udf_parallel(
-            udf_inputs,
+            marshal(udf_inputs),
            n_workers=n_workers,
            processed_cb=processed_cb,
            download_cb=download_cb,
@@ -142,6 +111,9 @@ def udf_worker_entrypoint() -> int:
 
 
 class UDFDispatcher:
+    catalog: Optional[Catalog] = None
+    task_queue: Optional[multiprocess.Queue] = None
+    done_queue: Optional[multiprocess.Queue] = None
     _batch_size: Optional[int] = None
 
     def __init__(
@@ -151,9 +123,10 @@ class UDFDispatcher:
         id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
-        cache,
-        is_generator=False,
-        buffer_size=DEFAULT_BATCH_SIZE,
+        udf_fields: "Sequence[str]",
+        cache: bool,
+        is_generator: bool = False,
+        buffer_size: int = DEFAULT_BATCH_SIZE,
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
@@ -172,12 +145,13 @@ class UDFDispatcher:
             self.warehouse_args,
             self.warehouse_kwargs,
         ) = warehouse_clone_params
-        self.is_generator = is_generator
+        self.udf_fields = udf_fields
         self.cache = cache
+        self.is_generator = is_generator
+        self.buffer_size = buffer_size
         self.catalog = None
         self.task_queue = None
         self.done_queue = None
-        self.buffer_size = buffer_size
         self.ctx = get_context("spawn")
 
     @property
@@ -226,6 +200,7 @@ class UDFDispatcher:
             self.done_queue,
             self.is_generator,
            self.cache,
+            self.udf_fields,
        )
 
    def _run_worker(self) -> None:
@@ -233,7 +208,11 @@ class UDFDispatcher:
            worker = self._create_worker()
            worker.run()
        except (Exception, KeyboardInterrupt) as e:
-            put_into_queue(self.done_queue, {"status": FAILED_STATUS, "exception": e})
+            if self.done_queue:
+                put_into_queue(
+                    self.done_queue,
+                    {"status": FAILED_STATUS, "exception": e},
+                )
            raise
 
    @staticmethod
@@ -249,7 +228,6 @@ class UDFDispatcher:
         self,
         input_rows,
         n_workers: Optional[int] = None,
-        cache: bool = False,
         input_queue=None,
         processed_cb: Callback = DEFAULT_CALLBACK,
         download_cb: Callback = DEFAULT_CALLBACK,
@@ -299,21 +277,24 @@ class UDFDispatcher:
                 result = get_from_queue(self.done_queue)
                 status = result["status"]
                 if status == NOTIFY_STATUS:
-                    download_cb.relative_update(result["downloaded"])
+                    if downloaded := result.get("downloaded"):
+                        download_cb.relative_update(downloaded)
+                    if processed := result.get("processed"):
+                        processed_cb.relative_update(processed)
                 elif status == FINISHED_STATUS:
                     # Worker finished
                     n_workers -= 1
                 elif status == OK_STATUS:
-                    processed_cb.relative_update(result["processed"])
-                    yield result["result"]
+                    if processed := result.get("processed"):
+                        processed_cb.relative_update(processed)
+                    yield msgpack_unpack(result["result"])
                 else:  # Failed / error
                     n_workers -= 1
-                    exc = result.get("exception")
-                    if exc:
+                    if exc := result.get("exception"):
                        raise exc
                    raise RuntimeError("Internal error: Parallel UDF execution failed")
 
-                if not streaming_mode and not input_finished:
+                if status == OK_STATUS and not streaming_mode and not input_finished:
                    try:
                        put_into_queue(self.task_queue, next(input_data))
                    except StopIteration:
@@ -348,7 +329,7 @@
 
 
 class WorkerCallback(Callback):
-    def __init__(self, queue: multiprocess.Queue):
+    def __init__(self, queue: "multiprocess.Queue"):
        self.queue = queue
        super().__init__()
 
@@ -369,10 +350,11 @@ class ProcessedCallback(Callback):
 class UDFWorker:
     catalog: Catalog
     udf: UDFBase
-    task_queue: multiprocess.Queue
-    done_queue: multiprocess.Queue
+    task_queue: "multiprocess.Queue"
+    done_queue: "multiprocess.Queue"
     is_generator: bool
     cache: bool
+    udf_fields: Sequence[str]
     cb: Callback = attrs.field()
 
     @cb.default
@@ -382,7 +364,8 @@ class UDFWorker:
     def run(self) -> None:
         processed_cb = ProcessedCallback()
         udf_results = self.udf.run(
-            self.get_inputs(),
+            self.udf_fields,
+            unmarshal(self.get_inputs()),
            self.catalog,
            self.is_generator,
            self.cache,
@@ -390,15 +373,17 @@ class UDFWorker:
            processed_cb=processed_cb,
        )
        for udf_output in udf_results:
-            if isinstance(udf_output, GeneratorType):
-                udf_output = list(udf_output)  # can not pickle generator
+            for batch in batched_it(udf_output, DEFAULT_BATCH_SIZE):
+                put_into_queue(
+                    self.done_queue,
+                    {
+                        "status": OK_STATUS,
+                        "result": msgpack_pack(list(batch)),
+                    },
+                )
            put_into_queue(
                self.done_queue,
-                {
-                    "status": OK_STATUS,
-                    "result": udf_output,
-                    "processed": processed_cb.processed_rows,
-                },
+                {"status": NOTIFY_STATUS, "processed": processed_cb.processed_rows},
            )
        put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
 
datachain/query/queue.py ADDED
@@ -0,0 +1,120 @@
+import datetime
+from collections.abc import Iterable, Iterator
+from queue import Empty, Full, Queue
+from struct import pack, unpack
+from time import sleep
+from typing import Any
+
+import msgpack
+
+from datachain.query.batch import RowsOutput, RowsOutputBatch
+
+DEFAULT_BATCH_SIZE = 10000
+STOP_SIGNAL = "STOP"
+OK_STATUS = "OK"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+NOTIFY_STATUS = "NOTIFY"
+
+
+# For more context on the get_from_queue and put_into_queue functions, see the
+# discussion here:
+# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
+# This problem is not exactly described by, but is also related to these Python issues:
+# https://github.com/python/cpython/issues/66587
+# https://github.com/python/cpython/issues/88628
+# https://github.com/python/cpython/issues/108645
+
+
+def get_from_queue(queue: Queue) -> Any:
+    """
+    Gets an item from a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be available, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            return queue.get_nowait()
+        except Empty:
+            sleep(0.01)
+
+
+def put_into_queue(queue: Queue, item: Any) -> None:
+    """
+    Puts an item into a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be queued, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            queue.put_nowait(item)
+            return
+        except Full:
+            sleep(0.01)
+
+
+MSGPACK_EXT_TYPE_DATETIME = 42
+MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43
+
+
+def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
+    if isinstance(obj, datetime.datetime):
+        # packing date object as 1 or 2 variables, depending if timezone info is present
+        # - timestamp
+        # - [OPTIONAL] timezone offset from utc in seconds if timezone info exists
+        if obj.tzinfo:
+            data = (obj.timestamp(), int(obj.utcoffset().total_seconds()))  # type: ignore # noqa: PGH003
+            return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!dl", *data))
+        data = (obj.timestamp(),)  # type: ignore # noqa: PGH003
+        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))
+
+    if isinstance(obj, RowsOutputBatch):
+        return msgpack.ExtType(
+            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
+            msgpack_pack(obj.rows),
+        )
+
+    raise TypeError(f"Unknown type: {obj}")
+
+
+def msgpack_pack(obj: Any) -> bytes:
+    return msgpack.packb(obj, default=_msgpack_pack_extended_types)
+
+
+def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
+    if code == MSGPACK_EXT_TYPE_DATETIME:
+        has_timezone = False
+        if len(data) == 8:
+            # we send only timestamp without timezone if data is 8 bytes
+            values = unpack("!d", data)
+        else:
+            has_timezone = True
+            values = unpack("!dl", data)
+
+        timestamp = values[0]
+        tz_info = None
+        if has_timezone:
+            timezone_offset = values[1]
+            tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
+        return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)
+
+    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
+        return RowsOutputBatch(msgpack_unpack(data))
+
+    return msgpack.ExtType(code, data)
+
+
+def msgpack_unpack(data: bytes) -> Any:
+    return msgpack.unpackb(data, ext_hook=_msgpack_unpack_extended_types)
+
+
+def marshal(obj: Iterator[RowsOutput]) -> Iterable[bytes]:
+    for row in obj:
+        yield msgpack_pack(row)
+
+
+def unmarshal(obj: Iterator[bytes]) -> Iterable[RowsOutput]:
+    for row in obj:
+        yield msgpack_unpack(row)
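A quick sanity-check sketch of the new serialization helpers added above, assuming the module is importable as shown in dispatch.py (the payload values are hypothetical sample data):

from datetime import datetime, timezone

from datachain.query.queue import msgpack_pack, msgpack_unpack

payload = [1, "a.jpg", datetime(2024, 1, 1, tzinfo=timezone.utc)]
data = msgpack_pack(payload)            # datetimes are packed as msgpack ExtType 42
assert msgpack_unpack(data) == payload  # tz-aware datetime round-trips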
datachain/query/udf.py CHANGED
@@ -15,7 +15,14 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 
 from datachain.dataset import RowDict
 
-from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
+from .batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+    UDFInputBatch,
+)
 from .schema import (
     UDFParameter,
     UDFParamSpec,
@@ -25,7 +32,7 @@
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
-    from .batch import BatchingResult
+    from .batch import RowsOutput, UDFInput
 
 ColumnType = Any
 
@@ -107,7 +114,8 @@ class UDFBase:
 
     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
@@ -115,15 +123,22 @@ class UDFBase:
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable["UDFResult"]]:
         for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
+            if isinstance(batch, RowsOutputBatch):
+                n_rows = len(batch.rows)
+                inputs: UDFInput = UDFInputBatch(
+                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
+                )
+            else:
+                n_rows = 1
+                inputs = RowDict(zip(udf_fields, batch))
+            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
             processed_cb.relative_update(n_rows)
             yield output
 
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
@@ -199,12 +214,12 @@ class UDFWrapper(UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
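Illustration of how raw row tuples are rebuilt into `RowDict` mappings in `UDFBase.run` above; the field names and values here are hypothetical sample data:

from datachain.dataset import RowDict

udf_fields = ["sys__id", "file__path", "file__size"]
row = (7, "images/cat.jpg", 2048)
row_dict = RowDict(zip(udf_fields, row))
# -> {"sys__id": 7, "file__path": "images/cat.jpg", "file__size": 2048}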
datachain/utils.py CHANGED
@@ -10,7 +10,7 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from datetime import date, datetime, timezone
-from itertools import islice
+from itertools import chain, islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
@@ -241,7 +241,7 @@ _T_co = TypeVar("_T_co", covariant=True)
 
 
 def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
-    "Batch data into tuples of length n. The last batch may be shorter."
+    """Batch data into tuples of length n. The last batch may be shorter."""
     # Based on: https://docs.python.org/3/library/itertools.html#itertools-recipes
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:
@@ -251,6 +251,21 @@ def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
         yield batch
 
 
+def batched_it(iterable: Iterable[_T_co], n: int) -> Iterator[Iterator[_T_co]]:
+    """Batch data into iterators of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("Batch size must be at least one")
+    it = iter(iterable)
+    while True:
+        chunk_it = islice(it, n)
+        try:
+            first_el = next(chunk_it)
+        except StopIteration:
+            return
+        yield chain((first_el,), chunk_it)
+
+
 def flatten(items):
     for item in items:
         if isinstance(item, list):
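Usage sketch for the new `batched_it` helper above; unlike `batched`, each chunk is a lazy iterator rather than a tuple, so it should be consumed before moving to the next chunk:

from datachain.utils import batched_it

for chunk in batched_it(range(7), 3):
    print(list(chunk))  # [0, 1, 2], then [3, 4, 5], then [6]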
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.0
+Version: 0.3.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -15,18 +15,18 @@ datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
+datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=BJ8ZP9mleUbN5Y4CoYJ94R_tnnsA9sHdZq2RBGwVN5Y,80291
+datachain/catalog/catalog.py,sha256=9-7SnMjh5ruH9sdKDo8P5EklX9oC2EHH6bnku6ZqLko,80275
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=3RfDTAI_TszDy9WazHQd3bI3sS2wDFrNXfNqCDewZgE,2214
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=VrssoNenXsFxznr-Xx1haZPlXU-dr-WHdxmdbgFI_UA,13378
+datachain/client/fsspec.py,sha256=G4QTm3KPhlaV74T3gLXJ86345_ak8CH38ezn2ET-oLc,13230
 datachain/client/gcs.py,sha256=Mt77W_l8_fK61gLm4mmxNmENuOM0ETwxdiFp4S8d-_w,4105
-datachain/client/local.py,sha256=yhC-pMKdprJ-rMGwPpBmPkdkG5riIIKkVSe6kNpyCok,5076
+datachain/client/local.py,sha256=SyGnqcrbtSvDK6IJsQa6NxxHwbWaWIP1GLZsQBXg_IA,4939
 datachain/client/s3.py,sha256=GfRZZzNPQPRsYjoef8bbsLbanJPUlCbyGTTK8ojzp8A,6136
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
@@ -36,13 +36,13 @@ datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz
 datachain/data_storage/schema.py,sha256=Idi-29fckvZozzvkyz3nTR2FOIajPlSuPdIEO7SMvXM,7863
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
-datachain/data_storage/warehouse.py,sha256=eEZvzYwpqwzzLXqHWjB6l4tRsIHifIr8VWI5STm53LE,33310
+datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=bU45N7vBlxSyS6bpe0ShQ1c0DpXKFVfNcFcvbBrE1Ag,58011
+datachain/lib/dc.py,sha256=e24ecfIcypVkmVBqvr-p06zpwrw7GD20gy1gBJQPT-I,58012
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
@@ -51,7 +51,7 @@ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
+datachain/lib/udf.py,sha256=n3x6No-7l5LAciPJPWwZbA8WtTnGUU7d0wRL6CyfZh8,11847
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,15 +64,16 @@ datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxI
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
-datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
+datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=nfRRz6mkUz0tcD084rx-ps4PUWnZr5JQlIlRUF-PpSc,59919
-datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
+datachain/query/dataset.py,sha256=sRKY2it_znlzTNOt_OCRe008rHu0TXMnFwvGsnthSO0,60209
+datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
+datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=O3mTM5DRjvRAJCI7O9mR8wOdFJbgI1jIjvtfl5YvjI4,7755
 datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
-datachain/query/udf.py,sha256=c0IOTkcedpOQEmX-Idlrrl1__1IecNXL0N9oUO9Dtkg,7755
+datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -92,9 +93,9 @@ datachain/sql/sqlite/base.py,sha256=LBYmXqXsVF30fbcnR55evCZHbPDCzMdGk_ogPLps63s,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.0.dist-info/METADATA,sha256=x0jqtxoQE9ynjAAKFeyrz0rvyuv_E2e0D6UuhU3Yu_I,17268
-datachain-0.3.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-datachain-0.3.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.0.dist-info/RECORD,,
+datachain-0.3.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.1.dist-info/METADATA,sha256=qR3OMpGUkx0cKelnl51d9uksn5H-Wn4LvTJbUnTMDuQ,17268
+datachain-0.3.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.3.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.1.dist-info/RECORD,,