datachain 0.34.7__py3-none-any.whl → 0.35.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/data_storage/warehouse.py +34 -18
- datachain/lib/file.py +6 -2
- datachain/query/batch.py +1 -2
- datachain/query/dataset.py +12 -22
- datachain/query/dispatch.py +25 -35
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/METADATA +1 -1
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/RECORD +11 -12
- datachain/query/utils.py +0 -38
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/WHEEL +0 -0
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/top_level.txt +0 -0
datachain/data_storage/warehouse.py
CHANGED

```diff
@@ -22,7 +22,6 @@ from datachain.lib.signal_schema import SignalSchema
 from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.query.batch import RowsOutput
 from datachain.query.schema import ColumnMeta
-from datachain.query.utils import get_query_id_column
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.utils import sql_escape_like
@@ -228,7 +227,8 @@ class AbstractWarehouse(ABC, Serializable):
         while True:
             if limit is not None:
                 limit -= num_yielded
-                if limit <= 0:
+                num_yielded = 0
+                if limit <= 0:
                     break
                 if limit < page_size:
                     paginated_query = paginated_query.limit(None).limit(limit)
@@ -246,32 +246,48 @@ class AbstractWarehouse(ABC, Serializable):
             break  # no more results
         offset += page_size
 
-    def _regenerate_system_columns(
-
+    def _regenerate_system_columns(
+        self,
+        selectable: sa.Select | sa.CTE,
+        keep_existing_columns: bool = False,
+    ) -> sa.Select:
+        """
+        Return a SELECT that regenerates sys__id and sys__rand deterministically.
 
+        If keep_existing_columns is True, existing sys__id and sys__rand columns
+        will be kept as-is if they exist in the input selectable.
+        """
         base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
 
+        result_columns: dict[str, sa.ColumnElement] = {}
+        for col in base.c:
+            if col.name in result_columns:
+                raise ValueError(f"Duplicate column name {col.name} in SELECT")
+            if col.name in ("sys__id", "sys__rand"):
+                if keep_existing_columns:
+                    result_columns[col.name] = col
+            else:
+                result_columns[col.name] = col
+
         system_types: dict[str, sa.types.TypeEngine] = {
             sys_col.name: sys_col.type
             for sys_col in self.schema.dataset_row_cls.sys_columns()
         }
 
-
-
-
-
-
-
-
-
-
-                result_columns.append(expr.label("sys__rand"))
-            else:
-                result_columns.append(col)
+        # Add missing system columns if needed
+        if "sys__id" not in result_columns:
+            expr = self._system_row_number_expr()
+            expr = sa.cast(expr, system_types["sys__id"])
+            result_columns["sys__id"] = expr.label("sys__id")
+        if "sys__rand" not in result_columns:
+            expr = self._system_random_expr()
+            expr = sa.cast(expr, system_types["sys__rand"])
+            result_columns["sys__rand"] = expr.label("sys__rand")
 
         # Wrap in subquery to materialize window functions, then wrap again in SELECT
         # This ensures window functions are computed before INSERT...FROM SELECT
-
+        columns = list(result_columns.values())
+        inner = sa.select(*columns).select_from(base).subquery()
         return sa.select(*inner.c).select_from(inner)
 
     def _system_row_number_expr(self):
@@ -380,7 +396,7 @@ class AbstractWarehouse(ABC, Serializable):
         """
         Fetch dataset rows from database using a list of IDs.
         """
-        if (id_col := get_query_id_column(query)) is None:
+        if (id_col := query.selected_columns.get("sys__id")) is None:
            raise RuntimeError("sys__id column not found in query")
 
        query = query._clone().offset(None).limit(None).order_by(None)
```
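The rewritten `_regenerate_system_columns` collects the non-system columns, re-derives `sys__id`/`sys__rand` when absent, and double-wraps the result so window functions are materialized before an `INSERT ... FROM SELECT`. A minimal, standalone sketch of that wrapping pattern; the `rows` table, the `regenerate_sys_id` name, and the `row_number()` expression are illustrative stand-ins (the latter for datachain's `_system_row_number_expr`), not datachain's actual schema or helpers:

```python
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

metadata = sa.MetaData()
rows = sa.Table(
    "rows",
    metadata,
    sa.Column("sys__id", sa.Integer),
    sa.Column("name", sa.String),
)


def regenerate_sys_id(selectable: sa.Select) -> sa.Select:
    base = selectable.subquery()
    # keep everything except the system column we are about to regenerate
    columns = [c for c in base.c if c.name != "sys__id"]
    # a deterministic per-row id from a window function (stand-in expression)
    sys_id = sa.func.row_number().over(order_by=columns[0]).label("sys__id")
    # inner subquery materializes the window function before any outer use
    inner = sa.select(sys_id, *columns).select_from(base).subquery()
    return sa.select(*inner.c).select_from(inner)


print(regenerate_sys_id(sa.select(rows)).compile(dialect=sqlite.dialect()))
```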
datachain/lib/file.py
CHANGED

```diff
@@ -681,7 +681,7 @@ class File(DataModel):
             normalized_path = self.get_path_normalized()
             info = client.fs.info(client.get_full_path(normalized_path))
             converted_info = client.info_to_file(info, normalized_path)
-            return type(self)(
+            res = type(self)(
                 path=self.path,
                 source=self.source,
                 size=converted_info.size,
@@ -691,6 +691,8 @@
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+            res._set_stream(self._catalog)
+            return res
         except FileError as e:
             logger.warning(
                 "File error when resolving %s/%s: %s", self.source, self.path, str(e)
@@ -703,7 +705,7 @@
                 str(e),
             )
 
-        return type(self)(
+        res = type(self)(
             path=self.path,
             source=self.source,
             size=0,
@@ -713,6 +715,8 @@
             last_modified=TIME_ZERO,
             location=self.location,
         )
+        res._set_stream(self._catalog)
+        return res
 
     def rebase(
         self,
```
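Both hunks apply the same fix: the method builds a fresh `File` instance, and pydantic does not carry private attributes over to a newly constructed object, so the catalog stream has to be re-attached before returning. A simplified, hypothetical stand-in (`FileStub` is not datachain's `File` class) showing why the added `_set_stream` call matters:

```python
from typing import Any, Optional

from pydantic import BaseModel, PrivateAttr


class FileStub(BaseModel):
    path: str
    _catalog: Optional[Any] = PrivateAttr(default=None)

    def _set_stream(self, catalog: Any) -> None:
        self._catalog = catalog

    def resolve(self) -> "FileStub":
        # constructing a new instance drops private attrs such as _catalog
        res = type(self)(path=self.path)
        res._set_stream(self._catalog)  # the re-attachment the diff adds
        return res


f = FileStub(path="a.txt")
f._set_stream("catalog")
assert f.resolve()._catalog == "catalog"  # without _set_stream this is None
```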
datachain/query/batch.py
CHANGED

```diff
@@ -6,7 +6,6 @@ from collections.abc import Callable, Generator, Sequence
 import sqlalchemy as sa
 
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
-from datachain.query.utils import get_query_column
 
 RowsOutputBatch = Sequence[Sequence]
 RowsOutput = Sequence | RowsOutputBatch
@@ -106,7 +105,7 @@ class Partition(BatchingStrategy):
         query: sa.Select,
         id_col: sa.ColumnElement | None = None,
     ) -> Generator[RowsOutput, None, None]:
-        if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
+        if (partition_col := query.selected_columns.get(PARTITION_COLUMN_ID)) is None:
             raise RuntimeError("partition column not found in query")
 
         ids_only = False
```
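This hunk is the whole refactor in miniature: SQLAlchemy's `Select.selected_columns` is a dict-like `ColumnCollection`, so its `.get()` and `in` checks can stand in for the deleted `get_query_column`/`get_query_id_column` helpers. A quick sketch with an illustrative table:

```python
import sqlalchemy as sa

t = sa.table("t", sa.column("sys__id"), sa.column("value"))
query = sa.select(t)

# dict-like lookups on the SELECT's column collection
assert query.selected_columns.get("sys__id") is not None
assert query.selected_columns.get("missing") is None
assert "sys__id" in query.selected_columns  # used by the new guard clauses
```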
datachain/query/dataset.py
CHANGED

```diff
@@ -438,6 +438,9 @@ class UDFStep(Step, ABC):
     """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+        if "sys__id" not in query.selected_columns:
+            raise RuntimeError("Query must have sys__id column to run UDF")
+
         if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
             return
 
@@ -580,13 +583,10 @@
         """
         Create temporary table with group by partitions.
         """
-
-
-
-
-        assert any(c.name == "sys__id" for c in query.selected_columns), (
-            "Query must have sys__id column to use partitioning."
-        )
+        if self.partition_by is None:
+            raise RuntimeError("Query must have partition_by set to use partitioning")
+        if (id_col := query.selected_columns.get("sys__id")) is None:
+            raise RuntimeError("Query must have sys__id column to use partitioning")
 
         if isinstance(self.partition_by, (list, tuple, GeneratorType)):
             list_partition_by = list(self.partition_by)
@@ -602,7 +602,7 @@
 
         # fill table with partitions
         cols = [
-
+            id_col,
             f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
         ]
         self.catalog.warehouse.db.execute(
@@ -634,21 +634,11 @@
 
         # Apply partitioning if needed.
         if self.partition_by is not None:
-            if
-
-
-
-            columns = [
-                c if isinstance(c, Column) else Column(c.name, c.type)
-                for c in query.subquery().columns
-            ]
-            temp_table = self.catalog.warehouse.create_dataset_rows_table(
-                self.catalog.warehouse.temp_table_name(),
-                columns=columns,
+            if "sys__id" not in query.selected_columns:
+                _query = query = self.catalog.warehouse._regenerate_system_columns(
+                    query,
+                    keep_existing_columns=True,
             )
-            temp_tables.append(temp_table.name)
-            self.catalog.warehouse.copy_table(temp_table, query)
-            _query = query = temp_table.select()
 
             partition_tbl = self.create_partitions_table(query)
             temp_tables.append(partition_tbl.name)
```
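For context on the partition fill above: `dense_rank()` over the partition-by columns assigns every row sharing a key the same partition id, which is what lands in `PARTITION_COLUMN_ID`. A minimal sketch; the `rows` table, `label` column, and `partition_id` name are illustrative, not datachain's schema:

```python
import sqlalchemy as sa

rows = sa.table("rows", sa.column("sys__id"), sa.column("label"))
partition_by = [rows.c.label]

cols = [
    rows.c.sys__id,
    # rows with equal `label` get the same dense_rank value
    sa.func.dense_rank().over(order_by=partition_by).label("partition_id"),
]
print(sa.select(*cols).compile())
```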
datachain/query/dispatch.py
CHANGED

```diff
@@ -22,7 +22,6 @@ from datachain.query.dataset import (
 )
 from datachain.query.queue import get_from_queue, put_into_queue
 from datachain.query.udf import UdfInfo
-from datachain.query.utils import get_query_id_column
 from datachain.utils import batched, flatten, safe_closing
 
 if TYPE_CHECKING:
@@ -55,6 +54,9 @@ def udf_entrypoint() -> int:
     udf_info: UdfInfo = load(stdin.buffer)
 
     query = udf_info["query"]
+    if "sys__id" not in query.selected_columns:
+        raise RuntimeError("sys__id column is required in UDF query")
+
     batching = udf_info["batching"]
     is_generator = udf_info["is_generator"]
 
@@ -65,15 +67,16 @@
     wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
     warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)
 
-    id_col = get_query_id_column(query)
-
     with contextlib.closing(
-        batching(
+        batching(
+            warehouse.dataset_select_paginated,
+            query,
+            id_col=query.selected_columns.sys__id,
+        )
     ) as udf_inputs:
         try:
             UDFDispatcher(udf_info).run_udf(
                 udf_inputs,
-                ids_only=id_col is not None,
                 download_cb=download_cb,
                 processed_cb=processed_cb,
                 generated_cb=generated_cb,
@@ -147,10 +150,10 @@ class UDFDispatcher:
             self.udf_fields,
         )
 
-    def _run_worker(self, ids_only: bool) -> None:
+    def _run_worker(self) -> None:
         try:
             worker = self._create_worker()
-            worker.run(ids_only)
+            worker.run()
         except (Exception, KeyboardInterrupt) as e:
             if self.done_queue:
                 put_into_queue(
@@ -164,7 +167,6 @@
     def run_udf(
         self,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -178,9 +180,7 @@
 
         if n_workers == 1:
             # no need to spawn worker processes if we are running in a single process
-            self.run_udf_single(
-                input_rows, ids_only, download_cb, processed_cb, generated_cb
-            )
+            self.run_udf_single(input_rows, download_cb, processed_cb, generated_cb)
         else:
             if self.buffer_size < n_workers:
                 raise RuntimeError(
@@ -189,13 +189,12 @@
                 )
 
             self.run_udf_parallel(
-                n_workers, input_rows, ids_only, download_cb, processed_cb, generated_cb
+                n_workers, input_rows, download_cb, processed_cb, generated_cb
             )
 
     def run_udf_single(
         self,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -204,18 +203,15 @@
         # Rebuild schemas in single process too for consistency (cheap, idempotent).
         ModelStore.rebuild_all()
 
-        if
+        if not self.is_batching:
             input_rows = flatten(input_rows)
 
         def get_inputs() -> Iterable["RowsOutput"]:
             warehouse = self.catalog.warehouse.clone()
-
-
-
-
-                )
-            else:
-                yield from input_rows
+            for ids in batched(input_rows, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
+                )
 
         prefetch = udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
@@ -249,7 +245,6 @@
         self,
         n_workers: int,
         input_rows: Iterable["RowsOutput"],
-        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
@@ -258,9 +253,7 @@
         self.done_queue = self.ctx.Queue()
 
         pool = [
-            self.ctx.Process(
-                name=f"Worker-UDF-{i}", target=self._run_worker, args=[ids_only]
-            )
+            self.ctx.Process(name=f"Worker-UDF-{i}", target=self._run_worker)
             for i in range(n_workers)
         ]
         for p in pool:
@@ -406,13 +399,13 @@ class UDFWorker:
         self.processed_cb = ProcessedCallback("processed", self.done_queue)
         self.generated_cb = ProcessedCallback("generated", self.done_queue)
 
-    def run(self, ids_only: bool) -> None:
+    def run(self) -> None:
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
             catalog = clone_catalog_with_cache(self.catalog, _cache)
             udf_results = self.udf.run(
                 self.udf_fields,
-                self.get_inputs(ids_only),
+                self.get_inputs(),
                 catalog,
                 self.cache,
                 download_cb=self.download_cb,
@@ -434,13 +427,10 @@
         put_into_queue(self.done_queue, {"status": OK_STATUS})
         yield row
 
-    def get_inputs(self, ids_only: bool) -> Iterable["RowsOutput"]:
+    def get_inputs(self) -> Iterable["RowsOutput"]:
         warehouse = self.catalog.warehouse.clone()
         while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-
-
-
-
-                )
-            else:
-                yield from batch
+            for ids in batched(batch, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
+                )
```
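The dispatch changes drop the `ids_only` flag and settle on one input shape: workers always receive chunks of `sys__id` values and expand each chunk into full rows via `dataset_rows_select_from_ids`. A self-contained sketch of that shape; `batched` here mirrors `datachain.utils.batched`, the fetch function is a stand-in for the warehouse call, and the batch size of 3 is illustrative, not datachain's real default:

```python
from collections.abc import Iterable, Iterator, Sequence
from itertools import islice


def batched(it: Iterable[int], n: int) -> Iterator[tuple[int, ...]]:
    it = iter(it)
    while chunk := tuple(islice(it, n)):
        yield chunk


def select_rows_by_ids(ids: Sequence[int]) -> list[dict]:
    # stand-in for warehouse.dataset_rows_select_from_ids: one SELECT per batch
    return [{"sys__id": i, "value": i * 10} for i in ids]


def get_inputs(id_stream: Iterable[int], batch_size: int = 3) -> Iterator[dict]:
    for ids in batched(id_stream, batch_size):
        yield from select_rows_by_ids(ids)


print(list(get_inputs(range(7))))  # two full batches plus a remainder of one
```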
{datachain-0.34.7.dist-info → datachain-0.35.1.dist-info}/RECORD
CHANGED

```diff
@@ -57,7 +57,7 @@ datachain/data_storage/metastore.py,sha256=uh8oFO9NeYN8tosi5F2QhWpdXR8dzDyfN2rrD
 datachain/data_storage/schema.py,sha256=4FZZFgPTI9e3gUFdlm1smPdES7FHctwXQNdNfY69tj8,9807
 datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
 datachain/data_storage/sqlite.py,sha256=xQZ944neP57K_25HSetIy35IakAcyA0cUKVe-xeIEgQ,31168
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/warehouse.py,sha256=rNz2wFlFA-pyBAuy14RL6lRIFhrNEnX02c9SgGs4v58,34994
 datachain/diff/__init__.py,sha256=pixXOnbOcoxfkBvbaiDNGPhJMEyTiHb9EIFxR7QqY5A,9533
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -79,7 +79,7 @@ datachain/lib/audio.py,sha256=3QWQ7PHuRnen7al8EjgjWuKbRKe4SvrbWELJ1T_Cin0,7545
 datachain/lib/clip.py,sha256=nF8-N6Uz0MbAsPJBY2iXEYa3DPLo80OOer5SRNAtcGM,6149
 datachain/lib/data_model.py,sha256=H-bagx24-cLlC7ngSP6Dby4mB6kSxxV7KDiHxQjzwlg,3798
 datachain/lib/dataset_info.py,sha256=Ym7yYcGpfUmPLrfdxueijCVRP2Go6KbyuLk_fmzYgDU,3273
-datachain/lib/file.py,sha256=
+datachain/lib/file.py,sha256=YO4QUaZVZ0TVW9fahERZ3HJXPNXjB4oYzvLQntQYT9s,47501
 datachain/lib/hf.py,sha256=jmyqRDXdksojUJCiU_2XFSIoMzzDJAZQs9xr-sEwEJc,7281
 datachain/lib/image.py,sha256=xKyVsFKi1Shji7oluvd4Ibr3Atiz-Q0MNJhIsXeGcMI,3197
 datachain/lib/listing.py,sha256=pXRzHCUxX0b1sZrFWPN77bHY69Hrn6rFwr5IzSxuhvI,7060
@@ -130,16 +130,15 @@ datachain/model/ultralytics/bbox.py,sha256=C-aDiBhVa_ML2oERWvksRkyMU1XuYSpb6eItH
 datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigFYNZWUA,3392
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
-datachain/query/batch.py,sha256=
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/batch.py,sha256=ugTlSFqh_kxMcG6vJ5XrEzG9jBXRdb7KRAEEsFWiPew,4190
+datachain/query/dataset.py,sha256=lv5Ta7FjFZWQRUTz9_97oeoT5OvD62unRoNLgEueWUU,67384
+datachain/query/dispatch.py,sha256=B0sxnyN6unU8VFc35eWa_pe_TX6JfHDDbzyIQtp8AoM,15665
 datachain/query/metrics.py,sha256=qOMHiYPTMtVs2zI-mUSy8OPAVwrg4oJtVF85B9tdQyM,810
 datachain/query/params.py,sha256=JkVz6IKUIpF58JZRkUXFT8DAHX2yfaULbhVaGmHKFLc,826
 datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
 datachain/query/schema.py,sha256=Cn1keXjktptAbEDbHlxSzdoCu5H6h_Vzp_DtNpMSr5w,6697
 datachain/query/session.py,sha256=lbwMDvxjZ2BS2rA9qk7MVBRzlsSrwH92yJ_waP3uvDc,6781
 datachain/query/udf.py,sha256=SLLLNLz3QmtaM04ZVTu7K6jo58I-1j5Jf7Lb4ORv4tQ,1385
-datachain/query/utils.py,sha256=UbsyU2QVJCHLnm3dAYOjacXOiwa0-tSOawwMb8SrRdY,1251
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=4voPFVDXAU6BSBHDAvB_LTYiCACA6Zr0IfYnDjrnN6s,16737
 datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
@@ -165,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.34.7.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.34.7.dist-info/METADATA,sha256=
-datachain-0.34.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.34.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.34.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.34.7.dist-info/RECORD,,
+datachain-0.35.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.35.1.dist-info/METADATA,sha256=269z2Y2d1NZiTqvHExCQMAtcEcz2qYEb7RiIvvAZnKw,13606
+datachain-0.35.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.35.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.35.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.35.1.dist-info/RECORD,,
```
datachain/query/utils.py
DELETED

```diff
@@ -1,38 +0,0 @@
-import sqlalchemy as sa
-
-ColT = sa.ColumnClause | sa.Column | sa.ColumnElement | sa.TextClause | sa.Label
-
-
-def column_name(col: ColT) -> str:
-    """Returns column name from column element."""
-    return (
-        col.name
-        if isinstance(col, (sa.ColumnClause, sa.Column, sa.Label))
-        else str(col)
-    )
-
-
-def get_query_column(query: sa.Select, name: str) -> ColT | None:
-    """Returns column element from query by name or None if column not found."""
-    return next((col for col in query.inner_columns if column_name(col) == name), None)
-
-
-def get_query_id_column(query: sa.Select) -> sa.ColumnElement | None:
-    """Returns ID column element from query or None if column not found."""
-    col = get_query_column(query, "sys__id")
-    return col if col is not None and isinstance(col, sa.ColumnElement) else None
-
-
-def select_only_columns(query: sa.Select, *names: str) -> sa.Select:
-    """Returns query selecting defined columns only."""
-    if not names:
-        return query
-
-    cols: list[ColT] = []
-    for name in names:
-        col = get_query_column(query, name)
-        if col is None:
-            raise ValueError(f"Column '{name}' not found in query")
-        cols.append(col)
-
-    return query.with_only_columns(*cols)
```
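The deleted helpers map onto standard SQLAlchemy calls, which the new code now uses inline. A sketch of the equivalents, assuming plain `Select` objects and an illustrative table:

```python
import sqlalchemy as sa

t = sa.table("t", sa.column("sys__id"), sa.column("a"), sa.column("b"))
query = sa.select(t)

# get_query_column(query, "a") / get_query_id_column(query) become:
col = query.selected_columns.get("a")
id_col = query.selected_columns.get("sys__id")

# select_only_columns(query, "sys__id", "a") becomes:
slim = query.with_only_columns(id_col, col)
print([c.name for c in slim.selected_columns])  # ['sys__id', 'a']
```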