datachain-0.30.6-py3-none-any.whl → datachain-0.30.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/data_storage/sqlite.py +18 -15
- datachain/data_storage/warehouse.py +7 -1
- datachain/lib/dc/database.py +2 -2
- datachain/lib/dc/datachain.py +28 -28
- datachain/lib/dc/records.py +2 -4
- datachain/lib/settings.py +188 -85
- datachain/lib/udf.py +3 -20
- datachain/query/batch.py +2 -2
- datachain/query/dataset.py +44 -17
- datachain/query/dispatch.py +6 -0
- datachain/query/udf.py +2 -0
- datachain/utils.py +9 -10
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/METADATA +1 -1
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/RECORD +18 -18
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/WHEEL +0 -0
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
@@ -37,6 +37,7 @@ from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
+from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
 from datachain.dataset import DatasetRecord, StorageURI
 from datachain.error import DataChainError, OutdatedDatabaseSchemaError
 from datachain.namespace import Namespace
@@ -44,7 +45,7 @@ from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
-from datachain.utils import DataChainDir, batched_it
+from datachain.utils import DataChainDir, batched, batched_it
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
@@ -712,19 +713,21 @@ class SQLiteWarehouse(AbstractWarehouse):
     def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
         return (e.model_dump() for e in entries)
 
-    def insert_rows(
-        … (old implementation truncated in this view)
+    def insert_rows(
+        self,
+        table: Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
+        for row_chunk in batched(rows, batch_size):
+            with self.db.transaction() as conn:
+                # transactions speeds up inserts significantly as there is no separate
+                # transaction created for each insert row
+                self.db.executemany(
+                    table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
+                    row_chunk,
+                    conn=conn,
+                )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
@@ -797,7 +800,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             .limit(None)
         )
 
-        for batch in batched_it(ids, …
+        for batch in batched_it(ids, INSERT_BATCH_SIZE):
             batch_ids = [row[0] for row in batch]
             select_q._where_criteria = (col_id.in_(batch_ids),)
             q = table.insert().from_select(list(select_q.selected_columns), select_q)
datachain/data_storage/warehouse.py
CHANGED
@@ -43,6 +43,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
+INSERT_BATCH_SIZE = 10_000  # number of rows to insert at a time
 
 
 class AbstractWarehouse(ABC, Serializable):
@@ -415,7 +416,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Convert File entries so they can be passed on to `insert_rows()`"""
 
     @abstractmethod
-    def insert_rows( …
+    def insert_rows(
+        self,
+        table: sa.Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
         """Does batch inserts of any kind of rows into table"""
 
     def insert_rows_done(self, table: sa.Table) -> None:
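The new insert path accepts any iterable of row dicts and chunks it internally, wrapping each chunk's executemany in a single transaction. A minimal sketch of the same pattern using only the stdlib sqlite3 module (the table and column names here are hypothetical, not taken from datachain):

import itertools
import sqlite3
from collections.abc import Iterable, Iterator

INSERT_BATCH_SIZE = 10_000  # mirrors the new default in warehouse.py


def batched(rows: Iterable[dict], n: int) -> Iterator[tuple[dict, ...]]:
    # stand-in for datachain.utils.batched
    it = iter(rows)
    while chunk := tuple(itertools.islice(it, n)):
        yield chunk


def insert_rows(conn: sqlite3.Connection, rows: Iterable[dict],
                batch_size: int = INSERT_BATCH_SIZE) -> None:
    for chunk in batched(rows, batch_size):
        with conn:  # one transaction per chunk instead of one per row
            conn.executemany("INSERT INTO items (name) VALUES (:name)", chunk)


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (name TEXT)")
insert_rows(conn, ({"name": f"row-{i}"} for i in range(25_000)))
print(conn.execute("SELECT count(*) FROM items").fetchone()[0])  # 25000

Grouping many rows per transaction amortizes the commit cost, which is what the in-diff comment about per-row transactions refers to.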
datachain/lib/dc/database.py
CHANGED
@@ -73,7 +73,7 @@ def to_database(
     table_name: str,
     connection: "ConnectionType",
     *,
-    …
+    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
     on_conflict: Optional[str] = None,
     conflict_columns: Optional[list[str]] = None,
     column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -124,7 +124,7 @@ def to_database(
         table.create(conn, checkfirst=True)
 
         rows_iter = chain._leaf_values()
-        for batch in batched(rows_iter, …
+        for batch in batched(rows_iter, batch_size):
            rows_affected = _process_batch(
                conn,
                table,
datachain/lib/dc/datachain.py
CHANGED
@@ -342,15 +342,15 @@ class DataChain:
 
     def settings(
         self,
-        cache=None,
-        …
-        prefetch: Optional[int] = None,
-        sys: Optional[bool] = None,
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        …
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        sys: Optional[bool] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -359,23 +359,23 @@
 
         Parameters:
             cache : data caching. (default=False)
+            prefetch : number of workers to use for downloading files in advance.
+                This is enabled by default and uses 2 workers.
+                To disable prefetching, set it to 0 or False.
             parallel : number of thread for processors. True is a special value to
                 enable all available CPUs. (default=1)
             workers : number of distributed workers. Only for Studio mode. (default=1)
-            min_task_size : minimum number of tasks. (default=1)
-            prefetch : number of workers to use for downloading files in advance.
-                This is enabled by default and uses 2 workers.
-                To disable prefetching, set it to 0.
             namespace : namespace name.
             project : project name.
-            …
+            min_task_size : minimum number of tasks. (default=1)
+            batch_size : row limit per insert to balance speed and memory usage.
                 (default=2000)
 
         Example:
             ```py
             chain = (
                 chain
-                .settings(cache=True, parallel=8, …
+                .settings(cache=True, parallel=8, batch_size=300)
                 .map(laion=process_webdataset(spec=WDSLaion), params="file")
             )
             ```
@@ -385,14 +385,14 @@
         settings = copy.copy(self._settings)
         settings.add(
             Settings(
-                cache,
-                …
+                cache=cache,
+                prefetch=prefetch,
+                parallel=parallel,
+                workers=workers,
+                namespace=namespace,
+                project=project,
+                min_task_size=min_task_size,
+                batch_size=batch_size,
             )
         )
         return self._evolve(settings=settings, _sys=sys)
@@ -745,7 +745,7 @@
 
         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.…
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -783,7 +783,7 @@
         udf_obj.prefetch = prefetch
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.…
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=udf_obj.output,
@@ -919,7 +919,7 @@
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.…
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 partition_by=processed_partition_by,
                 **self._settings.to_dict(),
             ),
@@ -968,7 +968,7 @@
 
         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.…
+                udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -2314,7 +2314,7 @@
         table_name: str,
         connection: "ConnectionType",
         *,
-        …
+        batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
         on_conflict: Optional[str] = None,
         conflict_columns: Optional[list[str]] = None,
         column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -2336,7 +2336,7 @@
                 library. If a DBAPI2 object, only sqlite3 is supported. The user is
                 responsible for engine disposal and connection closure for the
                 SQLAlchemy connectable; str connections are closed automatically.
-            …
+            batch_size: Number of rows to insert per batch for optimal performance.
                 Larger batches are faster but use more memory. Default: 10,000.
             on_conflict: Strategy for handling duplicate rows (requires table
                 constraints):
@@ -2417,7 +2417,7 @@
             self,
             table_name,
             connection,
-            …
+            batch_size=batch_size,
             on_conflict=on_conflict,
             conflict_columns=conflict_columns,
             column_mapping=column_mapping,
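Together these hunks route the new batch_size setting from .settings() into to_udf_wrapper(); the udf.py and query/dataset.py hunks below show the old batch_rows field it replaces. A usage sketch against the public API (assumes datachain 0.30.7 is installed; the signal names and values are illustrative):

import datachain as dc

chain = (
    dc.read_values(num=list(range(1_000)))
    .settings(cache=True, prefetch=False, batch_size=300)  # rows per insert chunk
    .map(double=lambda num: num * 2, output=int)
)
chain.show(3)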
datachain/lib/dc/records.py
CHANGED
@@ -31,7 +31,7 @@ def read_records(
 
     Parameters:
         to_insert : records (or a single record) to insert. Each record is
-            a dictionary of signals and …
+            a dictionary of signals and their values.
         schema : describes chain signals and their corresponding types
 
     Example:
@@ -45,7 +45,6 @@
     """
     from datachain.query.dataset import adjust_outputs, get_col_types
     from datachain.sql.types import SQLType
-    from datachain.utils import batched
 
     from .datasets import read_dataset
 
@@ -96,7 +95,6 @@
         {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
     )
     records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-    …
-        warehouse.insert_rows(table, chunk)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
     warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.full_name, session=session, settings=settings)
datachain/lib/settings.py
CHANGED
@@ -1,111 +1,214 @@
+from typing import Any, Optional, Union
+
 from datachain.lib.utils import DataChainParamsError
-…
+
+DEFAULT_CACHE = False
+DEFAULT_PREFETCH = 2
+DEFAULT_BATCH_SIZE = 2_000
 
 
 class SettingsError(DataChainParamsError):
-    def __init__(self, msg):
+    def __init__(self, msg: str) -> None:
         super().__init__(f"Dataset settings error: {msg}")
 
 
 class Settings:
-    …
+    """Settings for datachain."""
+
+    _cache: Optional[bool]
+    _prefetch: Optional[int]
+    _parallel: Optional[Union[bool, int]]
+    _workers: Optional[int]
+    _namespace: Optional[str]
+    _project: Optional[str]
+    _min_task_size: Optional[int]
+    _batch_size: Optional[int]
+
+    def __init__(  # noqa: C901, PLR0912
         self,
-        cache=None,
-        …
-    ):
-        …
-        )
-        …
-        )
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> None:
+        if cache is None:
+            self._cache = None
+        else:
+            if not isinstance(cache, bool):
+                raise SettingsError(
+                    "'cache' argument must be bool"
+                    f" while {cache.__class__.__name__} was given"
+                )
+            self._cache = cache
+
+        if prefetch is None or prefetch is True:
+            self._prefetch = None
+        elif prefetch is False:
+            self._prefetch = 0  # disable prefetch (False == 0)
+        else:
+            if not isinstance(prefetch, int):
+                raise SettingsError(
+                    "'prefetch' argument must be int or bool"
+                    f" while {prefetch.__class__.__name__} was given"
+                )
+            if prefetch < 0:
+                raise SettingsError(
+                    "'prefetch' argument must be non-negative integer"
+                    f", {prefetch} was given"
+                )
+            self._prefetch = prefetch
+
+        if parallel is None or parallel is False:
+            self._parallel = None
+        elif parallel is True:
+            self._parallel = True
+        else:
+            if not isinstance(parallel, int):
+                raise SettingsError(
+                    "'parallel' argument must be int or bool"
+                    f" while {parallel.__class__.__name__} was given"
+                )
+            if parallel <= 0:
+                raise SettingsError(
+                    "'parallel' argument must be positive integer"
+                    f", {parallel} was given"
+                )
+            self._parallel = parallel
+
+        if workers is None:
+            self._workers = None
+        else:
+            if not isinstance(workers, int) or isinstance(workers, bool):
+                raise SettingsError(
+                    "'workers' argument must be int"
+                    f" while {workers.__class__.__name__} was given"
+                )
+            if workers <= 0:
+                raise SettingsError(
+                    f"'workers' argument must be positive integer, {workers} was given"
+                )
+            self._workers = workers
+
+        if namespace is None:
+            self._namespace = None
+        else:
+            if not isinstance(namespace, str):
+                raise SettingsError(
+                    "'namespace' argument must be str"
+                    f", {namespace.__class__.__name__} was given"
+                )
+            self._namespace = namespace
+
+        if project is None:
+            self._project = None
+        else:
+            if not isinstance(project, str):
+                raise SettingsError(
+                    "'project' argument must be str"
+                    f", {project.__class__.__name__} was given"
+                )
+            self._project = project
+
+        if min_task_size is None:
+            self._min_task_size = None
+        else:
+            if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
+                raise SettingsError(
+                    "'min_task_size' argument must be int"
+                    f", {min_task_size.__class__.__name__} was given"
+                )
+            if min_task_size <= 0:
+                raise SettingsError(
+                    "'min_task_size' argument must be positive integer"
+                    f", {min_task_size} was given"
+                )
+            self._min_task_size = min_task_size
+
+        if batch_size is None:
+            self._batch_size = None
+        else:
+            if not isinstance(batch_size, int) or isinstance(batch_size, bool):
+                raise SettingsError(
+                    "'batch_size' argument must be int"
+                    f", {batch_size.__class__.__name__} was given"
+                )
+            if batch_size <= 0:
+                raise SettingsError(
+                    "'batch_size' argument must be positive integer"
+                    f", {batch_size} was given"
+                )
+            self._batch_size = batch_size
+
+    @property
+    def cache(self) -> bool:
+        return self._cache if self._cache is not None else DEFAULT_CACHE
+
+    @property
+    def prefetch(self) -> Optional[int]:
+        return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
+
+    @property
+    def parallel(self) -> Optional[Union[bool, int]]:
+        return self._parallel if self._parallel is not None else None
+
+    @property
+    def workers(self) -> Optional[int]:
+        return self._workers if self._workers is not None else None
+
+    @property
+    def namespace(self) -> Optional[str]:
+        return self._namespace if self._namespace is not None else None
 
     @property
-    def …
-        return self.…
+    def project(self) -> Optional[str]:
+        return self._project if self._project is not None else None
 
     @property
-    def …
-        return self.…
+    def min_task_size(self) -> Optional[int]:
+        return self._min_task_size if self._min_task_size is not None else None
 
     @property
-    def …
-        return self.…
+    def batch_size(self) -> int:
+        return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE
 
-    def to_dict(self):
-        res = {}
+    def to_dict(self) -> dict[str, Any]:
+        res: dict[str, Any] = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self.…
+        if self._prefetch is not None:
+            res["prefetch"] = self.prefetch
+        if self._parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
             res["workers"] = self.workers
-        if self.…
+        if self._min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.…
+        if self._namespace is not None:
             res["namespace"] = self.namespace
-        if self.…
+        if self._project is not None:
             res["project"] = self.project
-        if self.…
-            res["…
+        if self._batch_size is not None:
+            res["batch_size"] = self.batch_size
         return res
 
-    def add(self, settings: "Settings"):
-        …
-        if settings.…
-            self.…
-        if settings.…
-            self.…
+    def add(self, settings: "Settings") -> None:
+        if settings._cache is not None:
+            self._cache = settings._cache
+        if settings._prefetch is not None:
+            self._prefetch = settings._prefetch
+        if settings._parallel is not None:
+            self._parallel = settings._parallel
+        if settings._workers is not None:
+            self._workers = settings._workers
+        if settings._namespace is not None:
+            self._namespace = settings._namespace
+        if settings._project is not None:
+            self._project = settings._project
+        if settings._min_task_size is not None:
+            self._min_task_size = settings._min_task_size
+        if settings._batch_size is not None:
+            self._batch_size = settings._batch_size
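The rewritten Settings class validates every argument eagerly and layers overrides with add(), where only non-None fields win; properties fall back to the module defaults. A small sketch of those semantics (assumes the module layout shown in this release):

from datachain.lib.settings import Settings, SettingsError

base = Settings(cache=True, parallel=4)
base.add(Settings(batch_size=500))  # only non-None fields override
print(base.to_dict())               # {'cache': True, 'parallel': 4, 'batch_size': 500}
print(base.prefetch)                # 2, DEFAULT_PREFETCH kicks in when unset

try:
    Settings(batch_size=0)          # zero is rejected: must be a positive integer
except SettingsError as exc:
    print(exc)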
datachain/lib/udf.py
CHANGED
@@ -54,23 +54,11 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]
 
 
-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch_rows(self):
-        return self.udf.batch_rows
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
-    …
+    batch_size: Optional[int] = None
     batch: int = 1
 
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
@@ -83,11 +71,6 @@ class UDFAdapter:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")
 
-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -237,13 +220,13 @@ class UDFBase(AbstractUDF):
 
     def to_udf_wrapper(
         self,
-        …
+        batch_size: Optional[int] = None,
         batch: int = 1,
     ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
-            …
+            batch_size,
             batch,
         )
 
datachain/query/batch.py
CHANGED
@@ -81,8 +81,8 @@ class Batch(BatchingStrategy):
         # select rows in batches
         results = []
 
-        with contextlib.closing(execute(query, page_size=page_size)) as …
-            for row in …
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
+            for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
datachain/query/dataset.py
CHANGED
@@ -55,7 +55,6 @@ from datachain.query.udf import UdfInfo
 from datachain.sql.functions.random import rand
 from datachain.sql.types import SQLType
 from datachain.utils import (
-    batched,
     determine_processes,
     determine_workers,
     filtered_cloudpickle_dumps,
@@ -334,10 +333,10 @@ def process_udf_outputs(
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: "UDFAdapter",
     cb: Callback = DEFAULT_CALLBACK,
+    batch_size: int = INSERT_BATCH_SIZE,
 ) -> None:
     # Optimization: Compute row types once, rather than for every row.
     udf_col_types = get_col_types(warehouse, udf.output)
-    batch_rows = udf.batch_rows or INSERT_BATCH_SIZE
 
     def _insert_rows():
         for udf_output in udf_results:
@@ -349,9 +348,7 @@
                 cb.relative_update()
             yield adjust_outputs(warehouse, row, udf_col_types)
 
-    …
-        warehouse.insert_rows(udf_table, row_chunk)
-    …
+    warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
     warehouse.insert_rows_done(udf_table)
 
 
@@ -388,12 +385,13 @@ class UDFStep(Step, ABC):
     udf: "UDFAdapter"
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
+    is_generator = False
+    # Parameters from Settings
+    cache: bool = False
     parallel: Optional[int] = None
     workers: Union[bool, int] = False
     min_task_size: Optional[int] = None
-    …
-    cache: bool = False
-    batch_rows: Optional[int] = None
+    batch_size: Optional[int] = None
 
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
@@ -450,6 +448,7 @@
             use_cache=self.cache,
             is_generator=self.is_generator,
             min_task_size=self.min_task_size,
+            batch_size=self.batch_size,
         )
         udf_distributor()
         return
@@ -486,6 +485,7 @@
             is_generator=self.is_generator,
             cache=self.cache,
             rows_total=rows_total,
+            batch_size=self.batch_size or INSERT_BATCH_SIZE,
         )
 
         # Run the UDFDispatcher in another process to avoid needing
@@ -534,6 +534,7 @@
                     udf_results,
                     self.udf,
                     cb=generated_cb,
+                    batch_size=self.batch_size or INSERT_BATCH_SIZE,
                 )
             finally:
                 download_cb.close()
@@ -595,7 +596,7 @@
             parallel=self.parallel,
             workers=self.workers,
             min_task_size=self.min_task_size,
-            …
+            batch_size=self.batch_size,
         )
         return self.__class__(self.udf, self.catalog)
 
@@ -641,7 +642,16 @@
 
 @frozen
 class UDFSignal(UDFStep):
+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = False
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None
 
     def create_udf_table(self, query: Select) -> "Table":
         udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -711,7 +721,16 @@
 class RowGenerator(UDFStep):
     """Extend dataset with new rows."""
 
+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = True
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None
 
     def create_udf_table(self, query: Select) -> "Table":
         warehouse = self.catalog.warehouse
@@ -1626,12 +1645,17 @@ class DatasetQuery:
     def add_signals(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-        …
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict
+        prefetch: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """
         Adds one or more signals based on the results from the provided UDF.
@@ -1657,7 +1681,7 @@
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-                …
+                batch_size=batch_size,
             )
         )
         return query
@@ -1672,14 +1696,17 @@
     def generate(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-        …
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict:
+        prefetch: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        cache: bool = False,
-        batch_rows: Optional[int] = None,
     ) -> "Self":
         query = self.clone()
         steps = query.steps
@@ -1692,7 +1719,7 @@
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-                …
+                batch_size=batch_size,
            )
        )
        return query
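With batched() dropped from this module, process_udf_outputs now hands the generator of adjusted rows straight to warehouse.insert_rows with an explicit batch_size, so at most one chunk of rows is materialized at a time. A self-contained sketch of that streaming shape (simplified stand-ins, not the real datachain signatures):

from collections.abc import Iterable, Iterator
from itertools import islice


def insert_rows(rows: Iterable[dict], batch_size: int) -> None:
    it = iter(rows)
    while chunk := list(islice(it, batch_size)):
        print(f"inserting {len(chunk)} rows")  # stand-in for a batched executemany


def process_udf_outputs(udf_results: Iterator[list[dict]], batch_size: int) -> None:
    def _insert_rows() -> Iterator[dict]:
        for udf_output in udf_results:
            yield from udf_output  # rows stream through; nothing is pre-chunked here

    insert_rows(_insert_rows(), batch_size=batch_size)


process_udf_outputs(iter([[{"x": i} for i in range(5)]] * 3), batch_size=4)
# prints: inserting 4 rows / 4 rows / 4 rows / 3 rows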
datachain/query/dispatch.py
CHANGED
@@ -114,6 +114,7 @@ class UDFDispatcher:
         self.is_batching = udf_info["batching"].is_batching
         self.processes = udf_info["processes"]
         self.rows_total = udf_info["rows_total"]
+        self.batch_size = udf_info["batch_size"]
         self.buffer_size = buffer_size
         self.task_queue = None
         self.done_queue = None
@@ -142,6 +143,7 @@
             self.table,
             self.cache,
             self.is_batching,
+            self.batch_size,
             self.udf_fields,
         )
 
@@ -232,6 +234,7 @@
                 udf_results,
                 udf,
                 cb=generated_cb,
+                batch_size=self.batch_size,
             )
 
     def input_batch_size(self, n_workers: int) -> int:
@@ -385,6 +388,7 @@ class UDFWorker:
         table: "Table",
         cache: bool,
         is_batching: bool,
+        batch_size: int,
         udf_fields: Sequence[str],
     ) -> None:
         self.catalog = catalog
@@ -395,6 +399,7 @@
         self.table = table
         self.cache = cache
         self.is_batching = is_batching
+        self.batch_size = batch_size
         self.udf_fields = udf_fields
 
         self.download_cb = DownloadCallback(self.done_queue)
@@ -420,6 +425,7 @@
                 self.notify_and_process(udf_results),
                 self.udf,
                 cb=self.generated_cb,
+                batch_size=self.batch_size,
             )
             put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
 
datachain/query/udf.py
CHANGED
@@ -21,6 +21,7 @@ class UdfInfo(TypedDict):
     is_generator: bool
     cache: bool
     rows_total: int
+    batch_size: int
 
 
 class AbstractUDFDistributor(ABC):
@@ -39,6 +40,7 @@ class AbstractUDFDistributor(ABC):
         use_cache: bool,
         is_generator: bool = False,
         min_task_size: Optional[Union[str, int]] = None,
+        batch_size: Optional[int] = None,
     ) -> None: ...
 
     @abstractmethod
datachain/utils.py
CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self
 
 
-…
+DEFAULT_BATCH_SIZE = 2000
 
 logger = logging.getLogger("datachain")
 
@@ -228,7 +228,7 @@ _T_co = TypeVar("_T_co", covariant=True)
 
 def _dynamic_batched_core(
     iterable: Iterable[_T_co],
-    …
+    batch_size: int,
 ) -> Iterator[list[_T_co]]:
     """Core batching logic that yields lists."""
 
@@ -236,7 +236,7 @@
 
     for item in iterable:
         # Check if adding this item would exceed limits
-        if len(batch) >= …
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
             yield batch
             batch = []
 
@@ -247,23 +247,22 @@
         yield batch
 
 
-def batched(iterable: Iterable[_T_co], …
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
     """
-    Batch data into tuples of length …
+    Batch data into tuples of length batch_size.
     The last batch may be shorter.
     """
-    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, …
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def batched_it(
     iterable: Iterable[_T_co],
-    …
+    batch_size: int = DEFAULT_BATCH_SIZE,
 ) -> Iterator[Iterator[_T_co]]:
     """
-    Batch data into iterators with dynamic sizing
-    based on row count and memory usage.
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
     """
-    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, …
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def flatten(items):
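For the plain row-count path, the reworked batched() matches the classic itertools recipe; a sketch of the contract (the memory-based sizing mentioned in _dynamic_batched_core's docstring is out of scope here):

from collections.abc import Iterable, Iterator
from itertools import islice
from typing import TypeVar

T = TypeVar("T")


def batched(iterable: Iterable[T], batch_size: int) -> Iterator[tuple[T, ...]]:
    """Batch data into tuples of length batch_size; the last may be shorter."""
    it = iter(iterable)
    while chunk := tuple(islice(it, batch_size)):
        yield chunk


assert list(batched(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]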
{datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/RECORD
CHANGED
@@ -19,7 +19,7 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=27750qCSNxIChEzhV02damIFreLMfr7UdiWqMFyk8AA,15361
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=…
+datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
 datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
@@ -52,8 +52,8 @@ datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4
 datachain/data_storage/metastore.py,sha256=aSeTRh43hmrOhULi9YD2VlgCj8B4bjE3jqCOvnb_HQs,53851
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=…
-datachain/data_storage/warehouse.py,sha256=…
+datachain/data_storage/sqlite.py,sha256=1fIeIhmB3O8oQVzP8dDKap0KUIgI0n2TdBQSyv0R8J4,30345
+datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -85,11 +85,11 @@ datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,
 datachain/lib/namespaces.py,sha256=I6gLC4ZzgyatFtHL85MWR4ml7-yuQOzxHE7IQNbt_ac,2107
 datachain/lib/projects.py,sha256=VJgmzHzKjmNPZD1tm0a1RNHmUQwn6WLWCLpKyc4UrSk,2605
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
-datachain/lib/settings.py,sha256=…
+datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
 datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=…
+datachain/lib/udf.py,sha256=08ia5T3gClen5ZQfIgop-swNnys2G-RIZpszqDnbc0w,17570
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -103,15 +103,15 @@ datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sD
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
-datachain/lib/dc/database.py,sha256=…
-datachain/lib/dc/datachain.py,sha256=…
+datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
+datachain/lib/dc/datachain.py,sha256=AtsvBndqMyKrfW4yH8V0Nf__hfR0LN-NpA2munzfiPM,99888
 datachain/lib/dc/datasets.py,sha256=-Bvyyu4XXDXLiWa-bOnsp0Q11RSYXRO0j5DaX8ShaFs,15355
 datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
-datachain/lib/dc/records.py,sha256=…
+datachain/lib/dc/records.py,sha256=IKf5MArify-cI1P4NgbIvrAi0UQ5cvofTI3u6_zKBP8,3069
 datachain/lib/dc/storage.py,sha256=OMJE-9ob9Ku5le8W6O8J1W-XJ0pwHt2PsO-ZCcee1ZA,7950
 datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
@@ -125,15 +125,15 @@ datachain/model/ultralytics/bbox.py,sha256=C-aDiBhVa_ML2oERWvksRkyMU1XuYSpb6eItH
 datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigFYNZWUA,3392
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
-datachain/query/batch.py,sha256=…
-datachain/query/dataset.py,sha256=…
-datachain/query/dispatch.py,sha256=…
+datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
+datachain/query/dataset.py,sha256=1eg5EE4vKI7c_Ng04or6zzKmFcOoEubMCoOaYmYPavE,64499
+datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
 datachain/query/schema.py,sha256=qLpEyvnzKlNCOrThQiTNpUKTUEsVIHT9trt-0UMt6ko,6704
 datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,6809
-datachain/query/udf.py,sha256=…
+datachain/query/udf.py,sha256=jqutTpvkT6eHl96ZEgYiiTMAhI7vmTQA6JH9y4WCibI,1405
 datachain/query/utils.py,sha256=a2PTBZ3qsG6XlUcp9XsoGiQfKkca4Q3m-VzFgiGQPAc,1230
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=pDThxvEEpIKVGfa9rmtz_zeqHwrgzh0Lv-Pd4wzDx5k,15448
@@ -160,9 +160,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.30.…
-datachain-0.30.…
-datachain-0.30.…
-datachain-0.30.…
-datachain-0.30.…
-datachain-0.30.…
+datachain-0.30.7.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.30.7.dist-info/METADATA,sha256=d6ClkSVhY7AFkjh7jgUFEwHpTa7LhpJU75_M8ufegcI,13898
+datachain-0.30.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.30.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.30.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.30.7.dist-info/RECORD,,

Files without changes: {datachain-0.30.6.dist-info → datachain-0.30.7.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt (the +0 -0 entries in the file list above).