datachain-0.14.3-py3-none-any.whl → datachain-0.14.4-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -12,6 +12,7 @@ from datachain.dataset import (
12
12
  )
13
13
  from datachain.job import Job
14
14
  from datachain.lib.data_model import DataModel
15
+ from datachain.query.session import Session
15
16
  from datachain.utils import TIME_ZERO
16
17
 
17
18
  if TYPE_CHECKING:
@@ -32,6 +33,10 @@ class DatasetInfo(DataModel):
32
33
  error_message: str = Field(default="")
33
34
  error_stack: str = Field(default="")
34
35
 
36
+ @property
37
+ def is_temp(self) -> bool:
38
+ return Session.is_temp_dataset(self.name)
39
+
35
40
  @staticmethod
36
41
  def _validate_dict(
37
42
  v: Optional[Union[str, dict]],
@@ -140,6 +140,8 @@ def datasets(
140
140
  )
141
141
  ]
142
142
 
143
+ datasets_values = [d for d in datasets_values if not d.is_temp]
144
+
143
145
  return read_values(
144
146
  session=session,
145
147
  settings=settings,
datachain/lib/udf.py CHANGED
@@ -16,7 +16,6 @@ from datachain.lib.convert.flatten import flatten
16
16
  from datachain.lib.data_model import DataValue
17
17
  from datachain.lib.file import File
18
18
  from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
19
- from datachain.progress import CombinedDownloadCallback
20
19
  from datachain.query.batch import (
21
20
  Batch,
22
21
  BatchingStrategy,
@@ -327,8 +326,9 @@ def _prefetch_inputs(
327
326
 
328
327
  if after_prefetch is None:
329
328
  after_prefetch = noop
330
- if isinstance(download_cb, CombinedDownloadCallback):
331
- after_prefetch = download_cb.increment_file_count
329
+ if download_cb and hasattr(download_cb, "increment_file_count"):
330
+ increment_file_count: Callable[[], None] = download_cb.increment_file_count
331
+ after_prefetch = increment_file_count
332
332
 
333
333
  f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
334
334
  mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
@@ -4,9 +4,8 @@ from itertools import chain
4
4
  from multiprocessing import cpu_count
5
5
  from sys import stdin
6
6
  from threading import Timer
7
- from typing import TYPE_CHECKING, Optional
7
+ from typing import TYPE_CHECKING, Literal, Optional
8
8
 
9
- import attrs
10
9
  import multiprocess
11
10
  from cloudpickle import load, loads
12
11
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -60,6 +59,7 @@ def udf_entrypoint() -> int:
60
59
  query = udf_info["query"]
61
60
  rows_total = udf_info["rows_total"]
62
61
  batching = udf_info["batching"]
62
+ is_generator = udf_info["is_generator"]
63
63
  n_workers = udf_info["processes"]
64
64
  if n_workers is True:
65
65
  n_workers = None # Use default number of CPUs (cores)
@@ -72,17 +72,20 @@ def udf_entrypoint() -> int:
72
72
  ) as udf_inputs:
73
73
  download_cb = get_download_callback()
74
74
  processed_cb = get_processed_callback()
75
+ generated_cb = get_generated_callback(is_generator)
75
76
  try:
76
77
  dispatch.run_udf_parallel(
77
78
  udf_inputs,
78
79
  rows_total=rows_total,
79
80
  n_workers=n_workers,
80
- processed_cb=processed_cb,
81
81
  download_cb=download_cb,
82
+ processed_cb=processed_cb,
83
+ generated_cb=generated_cb,
82
84
  )
83
85
  finally:
84
86
  download_cb.close()
85
87
  processed_cb.close()
88
+ generated_cb.close()
86
89
 
87
90
  return 0
88
91
 
@@ -128,7 +131,6 @@ class UDFDispatcher:
128
131
  self.done_queue,
129
132
  self.query,
130
133
  self.table,
131
- self.is_generator,
132
134
  self.is_batching,
133
135
  self.cache,
134
136
  self.udf_fields,
@@ -152,16 +154,14 @@ class UDFDispatcher:
152
154
  for _ in range(n_workers):
153
155
  put_into_queue(task_queue, STOP_SIGNAL)
154
156
 
155
- def create_input_queue(self):
156
- return self.ctx.Queue()
157
-
158
157
  def run_udf_parallel( # noqa: C901, PLR0912
159
158
  self,
160
159
  input_rows: Iterable[RowsOutput],
161
160
  rows_total: int,
162
161
  n_workers: Optional[int] = None,
163
- processed_cb: Callback = DEFAULT_CALLBACK,
164
162
  download_cb: Callback = DEFAULT_CALLBACK,
163
+ processed_cb: Callback = DEFAULT_CALLBACK,
164
+ generated_cb: Callback = DEFAULT_CALLBACK,
165
165
  ) -> None:
166
166
  n_workers = get_n_workers_from_arg(n_workers)
167
167
 
@@ -214,6 +214,8 @@ class UDFDispatcher:
214
214
  download_cb.relative_update(downloaded)
215
215
  if processed := result.get("processed"):
216
216
  processed_cb.relative_update(processed)
217
+ if generated := result.get("generated"):
218
+ generated_cb.relative_update(generated)
217
219
 
218
220
  status = result["status"]
219
221
  if status in (OK_STATUS, NOTIFY_STATUS):
@@ -260,46 +262,61 @@ class UDFDispatcher:
260
262
  p.join()
261
263
 
262
264
 
263
- class WorkerCallback(Callback):
264
- def __init__(self, queue: "multiprocess.Queue"):
265
+ class DownloadCallback(Callback):
266
+ def __init__(self, queue: "multiprocess.Queue") -> None:
265
267
  self.queue = queue
266
268
  super().__init__()
267
269
 
268
270
  def relative_update(self, inc: int = 1) -> None:
271
+ # This callback is used to notify the size of the downloaded files
272
+ pass
273
+
274
+ def increment_file_count(self, inc: int = 1) -> None:
269
275
  put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})
270
276
 
271
277
 
272
278
  class ProcessedCallback(Callback):
273
- def __init__(self):
274
- self.processed_rows: Optional[int] = None
279
+ def __init__(
280
+ self,
281
+ name: Literal["processed", "generated"],
282
+ queue: "multiprocess.Queue",
283
+ ) -> None:
284
+ self.name = name
285
+ self.queue = queue
275
286
  super().__init__()
276
287
 
277
288
  def relative_update(self, inc: int = 1) -> None:
278
- self.processed_rows = inc
289
+ put_into_queue(self.queue, {"status": NOTIFY_STATUS, self.name: inc})
279
290
 
280
291
 
281
- @attrs.define
282
292
  class UDFWorker:
283
- catalog: "Catalog"
284
- udf: "UDFAdapter"
285
- task_queue: "multiprocess.Queue"
286
- done_queue: "multiprocess.Queue"
287
- query: "Select"
288
- table: "Table"
289
- is_generator: bool
290
- is_batching: bool
291
- cache: bool
292
- udf_fields: Sequence[str]
293
- cb: Callback = attrs.field()
294
-
295
- @cb.default
296
- def _default_callback(self) -> WorkerCallback:
297
- return WorkerCallback(self.done_queue)
293
+ def __init__(
294
+ self,
295
+ catalog: "Catalog",
296
+ udf: "UDFAdapter",
297
+ task_queue: "multiprocess.Queue",
298
+ done_queue: "multiprocess.Queue",
299
+ query: "Select",
300
+ table: "Table",
301
+ is_batching: bool,
302
+ cache: bool,
303
+ udf_fields: Sequence[str],
304
+ ) -> None:
305
+ self.catalog = catalog
306
+ self.udf = udf
307
+ self.task_queue = task_queue
308
+ self.done_queue = done_queue
309
+ self.query = query
310
+ self.table = table
311
+ self.is_batching = is_batching
312
+ self.cache = cache
313
+ self.udf_fields = udf_fields
314
+
315
+ self.download_cb = DownloadCallback(self.done_queue)
316
+ self.processed_cb = ProcessedCallback("processed", self.done_queue)
317
+ self.generated_cb = ProcessedCallback("generated", self.done_queue)
298
318
 
299
319
  def run(self) -> None:
300
- processed_cb = ProcessedCallback()
301
- generated_cb = get_generated_callback(self.is_generator)
302
-
303
320
  prefetch = self.udf.prefetch
304
321
  with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
305
322
  catalog = clone_catalog_with_cache(self.catalog, _cache)
@@ -308,29 +325,22 @@ class UDFWorker:
308
325
  self.get_inputs(),
309
326
  catalog,
310
327
  self.cache,
311
- download_cb=self.cb,
312
- processed_cb=processed_cb,
328
+ download_cb=self.download_cb,
329
+ processed_cb=self.processed_cb,
313
330
  )
314
331
  with safe_closing(udf_results):
315
332
  process_udf_outputs(
316
333
  catalog.warehouse,
317
334
  self.table,
318
- self.notify_and_process(udf_results, processed_cb),
335
+ self.notify_and_process(udf_results),
319
336
  self.udf,
320
- cb=generated_cb,
337
+ cb=self.generated_cb,
321
338
  )
339
+ put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
322
340
 
323
- put_into_queue(
324
- self.done_queue,
325
- {"status": FINISHED_STATUS, "processed": processed_cb.processed_rows},
326
- )
327
-
328
- def notify_and_process(self, udf_results, processed_cb):
341
+ def notify_and_process(self, udf_results):
329
342
  for row in udf_results:
330
- put_into_queue(
331
- self.done_queue,
332
- {"status": OK_STATUS, "processed": processed_cb.processed_rows},
333
- )
343
+ put_into_queue(self.done_queue, {"status": OK_STATUS})
334
344
  yield row
335
345
 
336
346
  def get_inputs(self):
@@ -100,6 +100,10 @@ class Session:
100
100
  def get_temp_prefix(self) -> str:
101
101
  return f"{self.DATASET_PREFIX}{self.name}_"
102
102
 
103
+ @classmethod
104
+ def is_temp_dataset(cls, name) -> bool:
105
+ return name.startswith(cls.DATASET_PREFIX)
106
+
103
107
  def _cleanup_temp_datasets(self) -> None:
104
108
  prefix = self.get_temp_prefix()
105
109
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.3
3
+ Version: 0.14.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -69,7 +69,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
69
  datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
70
70
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
71
71
  datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
72
- datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
72
+ datachain/lib/dataset_info.py,sha256=Jnjy7vq4iNVkq1e-SYjqxdojlxIDXvZ352NCLLZg59k,2633
73
73
  datachain/lib/file.py,sha256=HLQXS_WULm7Y-fkHMy0WpibVAcrkLPRS6CrZy6rwFe0,30450
74
74
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
75
75
  datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
@@ -82,7 +82,7 @@ datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,253
82
82
  datachain/lib/signal_schema.py,sha256=DRatqSG7OVtCUCWyZvMXe4m7r7XFO6NCfzsJRDErMtg,35185
83
83
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
84
84
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
85
- datachain/lib/udf.py,sha256=h38a457xg-4wO2XcxPs4pzDq8JxTmYm4N84iAf0HRzY,16168
85
+ datachain/lib/udf.py,sha256=JJwjvy41N65PtWGUAq7TYnhdOOR6RiMDUJEKl5xtwLc,16199
86
86
  datachain/lib/udf_signature.py,sha256=2EtsOPDNSPqcOlYwqbCdy6RF5MldI-7smii8aLy8p7Y,7543
87
87
  datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
88
88
  datachain/lib/video.py,sha256=suH_8Mi8VYk4-IVb1vjSduF_njs64ji1WGKHxDLnGYw,6629
@@ -97,7 +97,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=EFfIGBiVVltJQG8blzsQ1dGXneh4D3w
97
97
  datachain/lib/dc/__init__.py,sha256=6rKKHS6MA3mS6UJXiysrv4TURs4R_UWAQK2tJ2t1QMs,743
98
98
  datachain/lib/dc/csv.py,sha256=d0ULzpsTTeqp_eM-2jVHb1kYHQN2lJFf4O6LWd5tOJw,4401
99
99
  datachain/lib/dc/datachain.py,sha256=hwuAElfEhRLyh-Uvuc7YIpFx6nsI_B90xwnMqgkkgrI,76390
100
- datachain/lib/dc/datasets.py,sha256=hTzq18Ij9kpOAJOU-VN4-VyThTTxLSWLfVIk3bgzAPs,4329
100
+ datachain/lib/dc/datasets.py,sha256=IYa8fixaqYasl0iE_cEaMDYfVCM4Dn4eOimz73YJTBY,4398
101
101
  datachain/lib/dc/hf.py,sha256=I1vFNOa1C87lBuBj5FHENLY2jTaQ8erngiX0cyBmOp4,2170
102
102
  datachain/lib/dc/json.py,sha256=9ei9ZNzWVXZWD4HNGTfBhcoLPnXBBDywKV-3Wi1mT28,2725
103
103
  datachain/lib/dc/listings.py,sha256=qPy1DTvYkbNICT1ujo8LwezzMEW8E3dln1knw7Jwl0I,1044
@@ -119,12 +119,12 @@ datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F
119
119
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
120
120
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
121
121
  datachain/query/dataset.py,sha256=caUsFzaVZXOz8NmeTMeOdyRQLQP8KCnxYMxF-pG4yFQ,58712
122
- datachain/query/dispatch.py,sha256=T4vdJE0k3Ff1osaQzYTC_2gOOkT0mXwKzNy-5aZcrTE,12300
122
+ datachain/query/dispatch.py,sha256=ErdK-biHYhRLDsm7on6vAHSjX-hAHgEHsBRHmuMS_4E,12979
123
123
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
124
124
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
125
125
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
126
126
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
127
- datachain/query/session.py,sha256=I1KG8jDIaxGAfRfDRucMx8DqsANf_VYWtwtXjeD19lI,6399
127
+ datachain/query/session.py,sha256=wNdOHAi4HrsEihfzdcTlfB5i1xyj0dw6rlUz84StOoU,6512
128
128
  datachain/query/udf.py,sha256=ljAYaF-J77t7iS4zc1-g1ssYd4c6Q-ccKGEc3VQQmeM,1322
129
129
  datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
130
130
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -150,9 +150,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
150
150
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
151
151
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
152
152
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
153
- datachain-0.14.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
- datachain-0.14.3.dist-info/METADATA,sha256=hTVICGrF_sALHSr7uuAipFr5HVrgydiP2JcDlZ-Q_w0,11338
155
- datachain-0.14.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
- datachain-0.14.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
- datachain-0.14.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
- datachain-0.14.3.dist-info/RECORD,,
153
+ datachain-0.14.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
+ datachain-0.14.4.dist-info/METADATA,sha256=CmIX9sam07khHa1Wrw5eMcSUo06rQs10kF0zF9flz28,11338
155
+ datachain-0.14.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
+ datachain-0.14.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
+ datachain-0.14.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
+ datachain-0.14.4.dist-info/RECORD,,