datachain 0.2.14__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/data_storage/sqlite.py +21 -0
- datachain/data_storage/warehouse.py +24 -2
- datachain/lib/arrow.py +27 -8
- datachain/lib/convert/flatten.py +10 -5
- datachain/lib/convert/python_to_sql.py +1 -1
- datachain/lib/data_model.py +6 -1
- datachain/lib/dc.py +109 -27
- datachain/lib/meta_formats.py +6 -6
- datachain/lib/settings.py +1 -17
- datachain/lib/udf.py +18 -10
- datachain/query/dataset.py +5 -44
- datachain/sql/types.py +5 -1
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/METADATA +1 -1
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/RECORD +18 -18
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/WHEEL +1 -1
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/LICENSE +0 -0
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
@@ -42,6 +42,7 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine


@@ -705,3 +706,23 @@ class SQLiteWarehouse(AbstractWarehouse):
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
+
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+        columns = [
+            sqlalchemy.Column(c.name, c.type)
+            for c in query.selected_columns
+            if c.name != "sys__id"
+        ]
+        table = self.create_udf_table(columns)
+
+        select_q = query.with_only_columns(
+            *[c for c in query.selected_columns if c.name != "sys__id"]
+        )
+        self.db.execute(
+            table.insert().from_select(list(select_q.selected_columns), select_q)
+        )
+
+        return table
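Note: create_pre_udf_table snapshots every selected column except sys__id from an arbitrary query into a fresh temporary table. A minimal sketch of the same select-then-insert pattern in plain SQLAlchemy; the src/udf_tmp table and column names are invented for illustration and are not part of datachain:

    import sqlalchemy as sa

    engine = sa.create_engine("sqlite:///:memory:")
    meta = sa.MetaData()
    # hypothetical source table standing in for a dataset table
    src = sa.Table(
        "src", meta,
        sa.Column("sys__id", sa.Integer, primary_key=True),
        sa.Column("path", sa.Text),
        sa.Column("size", sa.Integer),
    )
    meta.create_all(engine)

    query = sa.select(src)
    # mirror the non-sys__id columns into a scratch table
    cols = [sa.Column(c.name, c.type) for c in query.selected_columns if c.name != "sys__id"]
    tmp = sa.Table("udf_tmp", meta, sa.Column("sys__id", sa.Integer, primary_key=True), *cols)
    tmp.create(engine)

    select_q = query.with_only_columns(*[c for c in query.selected_columns if c.name != "sys__id"])
    with engine.begin() as conn:
        conn.execute(tmp.insert().from_select(list(select_q.selected_columns), select_q))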
datachain/data_storage/warehouse.py
CHANGED
@@ -2,6 +2,8 @@ import glob
 import json
 import logging
 import posixpath
+import random
+import string
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -24,6 +26,7 @@ from datachain.utils import sql_escape_like
 if TYPE_CHECKING:
     from sqlalchemy.sql._typing import _ColumnsClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine

     from datachain.data_storage import AbstractIDGenerator, schema
@@ -252,6 +255,12 @@ class AbstractWarehouse(ABC, Serializable):
         prefix = self.DATASET_SOURCE_TABLE_PREFIX
         return f"{prefix}{dataset_name}_{version}"

+    def temp_table_name(self) -> str:
+        return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
+
+    def udf_table_name(self) -> str:
+        return self.UDF_TABLE_NAME_PREFIX + _random_string(6)
+
     #
     # Datasets
     #
@@ -869,8 +878,8 @@ class AbstractWarehouse(ABC, Serializable):

     def create_udf_table(
         self,
-        name: str,
         columns: Sequence["sa.Column"] = (),
+        name: Optional[str] = None,
     ) -> "sa.Table":
         """
         Create a temporary table for storing custom signals generated by a UDF.
@@ -878,7 +887,7 @@ class AbstractWarehouse(ABC, Serializable):
         and UDFs are run in other processes when run in parallel.
         """
         tbl = sa.Table(
-            name,
+            name or self.udf_table_name(),
             sa.MetaData(),
             sa.Column("sys__id", Int, primary_key=True),
             *columns,
@@ -886,6 +895,12 @@ class AbstractWarehouse(ABC, Serializable):
         self.db.create_table(tbl, if_not_exists=True)
         return tbl

+    @abstractmethod
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+
     def is_temp_table_name(self, name: str) -> bool:
         """Returns if the given table name refers to a temporary
         or no longer needed table."""
@@ -937,3 +952,10 @@ class AbstractWarehouse(ABC, Serializable):
                 & (tq.c.is_latest == true())
             )
         )
+
+
+def _random_string(length: int) -> str:
+    return "".join(
+        random.choice(string.ascii_letters + string.digits)  # noqa: S311
+        for i in range(length)
+    )
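Note: temporary-table naming now lives on the warehouse itself: temp_table_name() and udf_table_name() append a short random suffix to the respective prefix. A rough sketch of the naming scheme; the prefix values below are assumptions for illustration, since the real class attributes are not shown in this diff:

    import random
    import string

    TMP_TABLE_NAME_PREFIX = "tmp_"   # assumed value, for illustration only
    UDF_TABLE_NAME_PREFIX = "udf_"   # assumed value, for illustration only

    def _random_string(length: int) -> str:
        return "".join(
            random.choice(string.ascii_letters + string.digits) for _ in range(length)
        )

    print(TMP_TABLE_NAME_PREFIX + _random_string(6))  # e.g. tmp_a3Xk9Q
    print(UDF_TABLE_NAME_PREFIX + _random_string(6))  # e.g. udf_Zp04qL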
datachain/lib/arrow.py
CHANGED
@@ -10,13 +10,17 @@ from datachain.lib.file import File, IndexedFile
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    from pydantic import BaseModel
+
     from datachain.lib.dc import DataChain


 class ArrowGenerator(Generator):
     def __init__(
         self,
-
+        input_schema: Optional["pa.Schema"] = None,
+        output_schema: Optional[type["BaseModel"]] = None,
+        source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
     ):
@@ -25,24 +29,36 @@ class ArrowGenerator(Generator):

         Parameters:

-
+        input_schema : Optional pyarrow schema for validation.
+        output_schema : Optional pydantic model for validation.
+        source : Whether to include info about the source file.
         nrows : Optional row limit.
         kwargs: Parameters to pass to pyarrow.dataset.dataset.
         """
         super().__init__()
-        self.
+        self.input_schema = input_schema
+        self.output_schema = output_schema
+        self.source = source
         self.nrows = nrows
         self.kwargs = kwargs

     def process(self, file: File):
         path = file.get_path()
-        ds = dataset(
+        ds = dataset(
+            path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+        )
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches():
+            for record_batch in ds.to_batches(use_threads=False):
                 for record in record_batch.to_pylist():
-
-
+                    vals = list(record.values())
+                    if self.output_schema:
+                        fields = self.output_schema.model_fields
+                        vals = [self.output_schema(**dict(zip(fields, vals)))]
+                    if self.source:
+                        yield [IndexedFile(file=file, index=index), *vals]
+                    else:
+                        yield vals
                     index += 1
                     if self.nrows and index >= self.nrows:
                         return
@@ -76,7 +92,10 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-
+        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        if field.nullable:
+            dtype = Optional[dtype]  # type: ignore[assignment]
+        output[column] = dtype

     return output

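Note: when output_schema is set, ArrowGenerator zips each Arrow record's values onto the pydantic model's fields in declaration order. A small sketch of that mapping outside of datachain; the Row model and record values are invented for illustration:

    from pydantic import BaseModel

    class Row(BaseModel):        # hypothetical output_schema
        name: str
        size: int

    record = {"col0": "cat.jpg", "col1": 2048}   # one row from record_batch.to_pylist()
    vals = list(record.values())
    obj = Row(**dict(zip(Row.model_fields, vals)))  # fields paired with values by position
    print(obj)  # name='cat.jpg' size=2048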
datachain/lib/convert/flatten.py
CHANGED
@@ -41,17 +41,22 @@ def flatten_list(obj_list):
     )


+def _flatten_list_field(value: list):
+    assert isinstance(value, list)
+    if value and ModelStore.is_pydantic(type(value[0])):
+        return [val.model_dump() for val in value]
+    if value and isinstance(value[0], list):
+        return [_flatten_list_field(v) for v in value]
+    return value
+
+
 def _flatten_fields_values(fields, obj: BaseModel):
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
         value = getattr(obj, name)
-
         if isinstance(value, list):
-
-            yield [val.model_dump() for val in value]
-        else:
-            yield value
+            yield _flatten_list_field(value)
         elif isinstance(value, dict):
             yield {
                 key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
datachain/lib/convert/python_to_sql.py
CHANGED
@@ -82,7 +82,7 @@ def python_to_sql(typ):  # noqa: PLR0911
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
         # List in JSON: Union[dict, list[dict]]
-        args_no_nones = [arg for arg in args if arg != type(None)]
+        args_no_nones = [arg for arg in args if arg != type(None)]  # noqa: E721
         if len(args_no_nones) == 2:
             args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
             if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
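Note: the new _flatten_list_field recurses into nested lists and dumps pydantic models at whatever depth they appear, where the old code only handled a flat list of models. A standalone sketch of the behaviour, using a plain isinstance check in place of datachain's ModelStore helper and an invented Point model:

    from pydantic import BaseModel

    class Point(BaseModel):     # hypothetical nested model
        x: int
        y: int

    def flatten_list_field(value: list):
        if value and isinstance(value[0], BaseModel):
            return [v.model_dump() for v in value]
        if value and isinstance(value[0], list):
            return [flatten_list_field(v) for v in value]
        return value

    print(flatten_list_field([[Point(x=1, y=2)], [Point(x=3, y=4)]]))
    # [[{'x': 1, 'y': 2}], [{'x': 3, 'y': 4}]]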
datachain/lib/data_model.py
CHANGED
@@ -47,7 +47,12 @@ def is_chain_type(t: type) -> bool:
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
         return True

-
+    orig = get_origin(t)
+    args = get_args(t)
+    if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])

+    if orig is Union and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0])
+
     return False
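Note: is_chain_type now unwraps Optional signals as well as lists: Optional[T] is typing.Union[T, None], so a get_origin/get_args check catches it. A quick illustration of the typing introspection this relies on:

    from typing import Optional, Union, get_args, get_origin

    t = Optional[list[float]]            # same as Union[list[float], None]
    assert get_origin(t) is Union
    assert type(None) in get_args(t)
    inner = get_args(t)[0]               # list[float]
    assert get_origin(inner) is list and get_args(inner) == (float,)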
datachain/lib/dc.py
CHANGED
@@ -33,6 +33,7 @@ from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
     Aggregator,
+    BatchMapper,
     Generator,
     Mapper,
     UDFBase,
@@ -192,6 +193,8 @@ class DataChain(DatasetQuery):
     ```
     """

+    max_row_count: Optional[int] = None
+
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
         "name": "",
@@ -237,7 +240,6 @@ class DataChain(DatasetQuery):
     def settings(
         self,
         cache=None,
-        batch=None,
         parallel=None,
         workers=None,
         min_task_size=None,
@@ -250,7 +252,6 @@ class DataChain(DatasetQuery):

        Parameters:
            cache : data caching (default=False)
-           batch : size of the batch (default=1000)
            parallel : number of thread for processors. True is a special value to
                enable all available CPUs (default=1)
            workers : number of distributed workers. Only for Studio mode. (default=1)
@@ -268,7 +269,7 @@ class DataChain(DatasetQuery):
         chain = self.clone()
         if sys is not None:
             chain._sys = sys
-        chain._settings.add(Settings(cache,
+        chain._settings.add(Settings(cache, parallel, workers, min_task_size))
         return chain

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
@@ -344,7 +345,7 @@ class DataChain(DatasetQuery):
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
+        print_schema: Optional[bool] = False,
         meta_type: Optional[str] = "json",
         nrows=None,
         **kwargs,
@@ -359,7 +360,7 @@ class DataChain(DatasetQuery):
            schema_from : path to sample to infer spec (if schema not provided)
            object_name : generated object column name
            model_name : optional generated model name
-
+           print_schema : print auto-generated schema
            jmespath : optional JMESPATH expression to reduce JSON
            nrows : optional row limit for jsonl and JSON arrays

@@ -392,7 +393,7 @@ class DataChain(DatasetQuery):
             meta_type=meta_type,
             spec=spec,
             model_name=model_name,
-
+            print_schema=print_schema,
             jmespath=jmespath,
             nrows=nrows,
         )
@@ -409,7 +410,7 @@ class DataChain(DatasetQuery):
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
+        print_schema: Optional[bool] = False,
         meta_type: Optional[str] = "jsonl",
         nrows=None,
         **kwargs,
@@ -424,7 +425,7 @@ class DataChain(DatasetQuery):
            schema_from : path to sample to infer spec (if schema not provided)
            object_name : generated object column name
            model_name : optional generated model name
-
+           print_schema : print auto-generated schema
            jmespath : optional JMESPATH expression to reduce JSON
            nrows : optional row limit for jsonl and JSON arrays

@@ -452,7 +453,7 @@ class DataChain(DatasetQuery):
             meta_type=meta_type,
             spec=spec,
             model_name=model_name,
-
+            print_schema=print_schema,
             jmespath=jmespath,
             nrows=nrows,
         )
@@ -488,7 +489,7 @@ class DataChain(DatasetQuery):
             **{object_name: datasets},  # type: ignore[arg-type]
         )

-    def
+    def print_json_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
     ) -> "DataChain":
         """Print JSON data model and save it. It returns the chain itself.
@@ -513,7 +514,7 @@ class DataChain(DatasetQuery):
             output=str,
         )

-    def
+    def print_jsonl_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
     ) -> "DataChain":
         """Print JSON data model and save it. It returns the chain itself.
@@ -598,14 +599,16 @@ class DataChain(DatasetQuery):

        Using func and output as a map:
        ```py
-       chain = chain.map(
+       chain = chain.map(
+           lambda name: name.split("."), output={"stem": str, "ext": str}
+       )
        chain.save("new_dataset")
        ```
        """
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)

         chain = self.add_signals(
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             **self._settings.to_dict(),
         )

@@ -618,7 +621,7 @@ class DataChain(DatasetQuery):
         output: OutputType = None,
         **signal_map,
     ) -> "Self":
-        """Apply a function to each row to create new rows (with potentially new
+        r"""Apply a function to each row to create new rows (with potentially new
        signals). The function needs to return a new objects for each of the new rows.
        It returns a chain itself with new signals.

@@ -628,11 +631,20 @@ class DataChain(DatasetQuery):
        one key differences: It produces a sequence of rows for each input row (like
        extracting multiple file records from a single tar file or bounding boxes from a
        single image file).
+
+       Example:
+       ```py
+       chain = chain.gen(
+           line=lambda file: [l for l in file.read().split("\n")],
+           output=str,
+       )
+       chain.save("new_dataset")
+       ```
        """
         udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
         chain = DatasetQuery.generate(
             self,
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             **self._settings.to_dict(),
         )

@@ -652,23 +664,68 @@ class DataChain(DatasetQuery):

        Input-output relationship: N:M

-       This method bears similarity to `gen()` and map()
-       parameters, yet differs in two crucial aspects:
+       This method bears similarity to `gen()` and `map()`, employing a comparable set
+       of parameters, yet differs in two crucial aspects:
        1. The `partition_by` parameter: This specifies the column name or a list of
           column names that determine the grouping criteria for aggregation.
        2. Group-based UDF function input: Instead of individual rows, the function
           receives a list all rows within each group defined by `partition_by`.
+
+       Example:
+       ```py
+       chain = chain.agg(
+           total=lambda category, amount: [sum(amount)],
+           output=float,
+           partition_by="category",
+       )
+       chain.save("new_dataset")
+       ```
        """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         chain = DatasetQuery.generate(
             self,
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             partition_by=partition_by,
             **self._settings.to_dict(),
         )

         return chain.reset_schema(udf_obj.output).reset_settings(self._settings)

+    def batch_map(
+        self,
+        func: Optional[Callable] = None,
+        params: Union[None, str, Sequence[str]] = None,
+        output: OutputType = None,
+        batch: int = 1000,
+        **signal_map,
+    ) -> "Self":
+        """This is a batch version of `map()`.
+
+        Input-output relationship: N:N
+
+        It accepts the same parameters plus an
+        additional parameter:
+
+            batch : Size of each batch passed to `func`. Defaults to 1000.
+
+        Example:
+        ```py
+        chain = chain.batch_map(
+            sqrt=lambda size: np.sqrt(size),
+            output=float
+        )
+        chain.save("new_dataset")
+        ```
+        """
+        udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
+        chain = DatasetQuery.add_signals(
+            self,
+            udf_obj.to_udf_wrapper(batch),
+            **self._settings.to_dict(),
+        )
+
+        return chain.add_schema(udf_obj.output).reset_settings(self._settings)
+
     def _udf_to_obj(
         self,
         target_class: type[UDFBase],
@@ -1176,6 +1233,7 @@ class DataChain(DatasetQuery):
         output: OutputType = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
     ) -> "DataChain":
@@ -1187,8 +1245,9 @@ class DataChain(DatasetQuery):
                case types will be inferred.
            object_name : Generated object column name.
            model_name : Generated model name.
-
+           source : Whether to include info about the source file.
            nrows : Optional row limit.
+           kwargs : Parameters to pass to pyarrow.dataset.dataset.

        Example:
            Reading a json lines file:
@@ -1215,18 +1274,24 @@ class DataChain(DatasetQuery):
         except ValueError as e:
             raise DatasetPrepareError(self.name, e) from e

+        if isinstance(output, dict):
+            model_name = model_name or object_name or ""
+            model = DataChain._dict_to_data_model(model_name, output)
+        else:
+            model = output  # type: ignore[assignment]
+
         if object_name:
-
-            model_name = model_name or object_name
-            output = DataChain._dict_to_data_model(model_name, output)
-            output = {object_name: output}  # type: ignore[dict-item]
+            output = {object_name: model}  # type: ignore[dict-item]
         elif isinstance(output, type(BaseModel)):
             output = {
                 name: info.annotation  # type: ignore[misc]
                 for name, info in output.model_fields.items()
             }
-
-
+        if source:
+            output = {"source": IndexedFile} | output  # type: ignore[assignment,operator]
+        return self.gen(
+            ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
+        )

     @staticmethod
     def _dict_to_data_model(
@@ -1245,10 +1310,10 @@ class DataChain(DatasetQuery):
         path,
         delimiter: str = ",",
         header: bool = True,
-        column_names: Optional[list[str]] = None,
         output: OutputType = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -1264,6 +1329,7 @@ class DataChain(DatasetQuery):
                case types will be inferred.
            object_name : Created object column name.
            model_name : Generated model name.
+           source : Whether to include info about the source file.
            nrows : Optional row limit.

        Example:
@@ -1282,6 +1348,7 @@ class DataChain(DatasetQuery):

         chain = DataChain.from_storage(path, **kwargs)

+        column_names = None
         if not header:
             if not output:
                 msg = "error parsing csv - provide output if no header"
@@ -1303,6 +1370,7 @@ class DataChain(DatasetQuery):
             output=output,
             object_name=object_name,
             model_name=model_name,
+            source=source,
             nrows=nrows,
             format=format,
         )
@@ -1315,6 +1383,7 @@ class DataChain(DatasetQuery):
         output: Optional[dict[str, DataType]] = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -1327,6 +1396,7 @@ class DataChain(DatasetQuery):
            output : Dictionary defining column names and their corresponding types.
            object_name : Created object column name.
            model_name : Generated model name.
+           source : Whether to include info about the source file.
            nrows : Optional row limit.

        Example:
@@ -1345,6 +1415,7 @@ class DataChain(DatasetQuery):
             output=output,
             object_name=object_name,
             model_name=model_name,
+            source=source,
             nrows=None,
             format="parquet",
             partitioning=partitioning,
@@ -1531,7 +1602,18 @@ class DataChain(DatasetQuery):
     @detach
     def limit(self, n: int) -> "Self":
         """Return the first n rows of the chain."""
-
+        n = max(n, 0)
+
+        if self.max_row_count is None:
+            self.max_row_count = n
+            return super().limit(n)
+
+        limit = min(n, self.max_row_count)
+        if limit == self.max_row_count:
+            return self
+
+        self.max_row_count = limit
+        return super().limit(self.max_row_count)

     @detach
     def offset(self, offset: int) -> "Self":
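Note: the reworked limit() records the smallest limit applied so far in max_row_count, so chained limits can only shrink the result. A rough model of that clamping behaviour in plain Python (not the actual DatasetQuery machinery):

    class Chain:
        def __init__(self):
            self.max_row_count = None

        def limit(self, n: int):
            n = max(n, 0)
            if self.max_row_count is None or n < self.max_row_count:
                self.max_row_count = n     # remember the tightest limit seen
            return self

    c = Chain()
    c.limit(100).limit(10).limit(50)
    print(c.max_row_count)  # 10 -- a later, larger limit cannot widen the result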
datachain/lib/meta_formats.py
CHANGED
@@ -101,7 +101,7 @@ def read_meta(  # noqa: C901
     schema_from=None,
     meta_type="json",
     jmespath=None,
-
+    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
@@ -129,7 +129,7 @@ def read_meta(  # noqa: C901
     model_output = captured_output.getvalue()
     captured_output.close()

-    if
+    if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
@@ -153,13 +153,13 @@ def read_meta(  # noqa: C901
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
-        def validator(json_object: dict) -> spec:
+        def validator(json_object: dict, nrow=0) -> spec:
             json_string = json.dumps(json_object)
             try:
                 data_instance = data_model.model_validate_json(json_string)
                 yield data_instance
             except ValidationError as e:
-                print(f"Validation error occurred in file {file.name}:", e)
+                print(f"Validation error occurred in row {nrow} file {file.name}:", e)

         if meta_type == "csv":
             with (
@@ -185,7 +185,7 @@ def read_meta(  # noqa: C901
                 nrow = nrow + 1
                 if nrows is not None and nrow > nrows:
                     return
-                yield from validator(json_dict)
+                yield from validator(json_dict, nrow)

         if meta_type == "jsonl":
             try:
@@ -198,7 +198,7 @@ def read_meta(  # noqa: C901
                     return
                 json_object = process_json(data_string, jmespath)
                 data_string = fd.readline()
-                yield from validator(json_object)
+                yield from validator(json_object, nrow)
             except OSError as e:
                 print(f"An unexpected file error occurred in file {file.name}: {e}")

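Note: passing the row counter into the validator means validation errors now point at the offending row. A self-contained sketch of the same idea; the Item model and sample rows are invented for illustration and stand in for the generated data model:

    from pydantic import BaseModel, ValidationError

    class Item(BaseModel):           # stand-in for the auto-generated data model
        name: str
        size: int

    def validator(json_object: dict, nrow: int = 0):
        try:
            yield Item.model_validate(json_object)
        except ValidationError as e:
            print(f"Validation error occurred in row {nrow}:", e)

    rows = [{"name": "a.txt", "size": 1}, {"name": "b.txt", "size": "big"}]
    for nrow, obj in enumerate(rows, start=1):
        for item in validator(obj, nrow):
            print(item)   # row 1 parses; row 2 reports a validation error with its row number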
datachain/lib/settings.py
CHANGED
@@ -7,11 +7,8 @@ class SettingsError(DataChainParamsError):


 class Settings:
-    def __init__(
-        self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
-    ):
+    def __init__(self, cache=None, parallel=None, workers=None, min_task_size=None):
         self._cache = cache
-        self._batch = batch
         self.parallel = parallel
         self._workers = workers
         self.min_task_size = min_task_size
@@ -22,12 +19,6 @@ class Settings:
                 f" while {cache.__class__.__name__} was given"
             )

-        if not isinstance(batch, int) and batch is not None:
-            raise SettingsError(
-                "'batch' argument must be int or None"
-                f" while {batch.__class__.__name__} was given"
-            )
-
         if not isinstance(parallel, int) and parallel is not None:
             raise SettingsError(
                 "'parallel' argument must be int or None"
@@ -54,10 +45,6 @@ class Settings:
     def cache(self):
         return self._cache if self._cache is not None else False

-    @property
-    def batch(self):
-        return self._batch if self._batch is not None else 1
-
     @property
     def workers(self):
         return self._workers if self._workers is not None else False
@@ -66,8 +53,6 @@ class Settings:
         res = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self._batch is not None:
-            res["batch"] = self.batch
         if self.parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
@@ -78,7 +63,6 @@ class Settings:

     def add(self, settings: "Settings"):
         self._cache = settings._cache or self._cache
-        self._batch = settings._batch or self._batch
         self.parallel = settings.parallel or self.parallel
         self._workers = settings._workers or self._workers
         self.min_task_size = settings.min_task_size or self.min_task_size
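Note: batch size is no longer a chain-level setting; per the dc.py changes above it moves to the call that needs it, batch_map(batch=...). A minimal sketch of the slimmed-down Settings, assuming datachain 0.2.15 is installed:

    from datachain.lib.settings import Settings

    s = Settings(cache=True, parallel=4)
    print(s.to_dict())   # {'cache': True, 'parallel': 4} -- no 'batch' key anymore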
datachain/lib/udf.py
CHANGED
@@ -225,11 +225,10 @@ class UDFBase(AbstractUDF):
     def __call__(self, *rows, cache, download_cb):
         if self.is_input_grouped:
             objs = self._parse_grouped_rows(rows[0], cache, download_cb)
+        elif self.is_input_batched:
+            objs = zip(*self._parse_rows(rows[0], cache, download_cb))
         else:
-            objs = self._parse_rows(rows, cache, download_cb)
-
-        if not self.is_input_batched:
-            objs = objs[0]
+            objs = self._parse_rows([rows], cache, download_cb)[0]

         result_objs = self.process_safe(objs)

@@ -259,17 +258,24 @@ class UDFBase(AbstractUDF):

         if not self.is_output_batched:
             res = list(res)
-            assert
-
-            )
+            assert (
+                len(res) == 1
+            ), f"{self.name} returns {len(res)} rows while it's not batched"
             if isinstance(res[0], tuple):
                 res = res[0]
+        elif (
+            self.is_input_batched
+            and self.is_output_batched
+            and not self.is_input_grouped
+        ):
+            res = list(res)
+            assert len(res) == len(
+                rows[0]
+            ), f"{self.name} returns {len(res)} rows while len(rows[0]) expected"

         return res

     def _parse_rows(self, rows, cache, download_cb):
-        if not self.is_input_batched:
-            rows = [rows]
         objs = []
         for row in rows:
             obj_row = self.params.row_to_objs(row)
@@ -330,7 +336,9 @@ class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""


-class BatchMapper(
+class BatchMapper(UDFBase):
+    """Inherit from this class to pass to `DataChain.batch_map()`."""
+
     is_input_batched = True
     is_output_batched = True

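Note: for batched input, __call__ now transposes the parsed rows with zip(*...), so a batched UDF (such as a BatchMapper) sees one sequence per signal rather than one tuple per row, and the new assertion checks the output has one result per input row. The transposition in isolation, with made-up row values:

    # parsed rows arrive row-wise: one tuple of signal values per row
    parsed_rows = [("a.txt", 1), ("b.txt", 2), ("c.txt", 3)]

    # zip(*rows) turns them column-wise: one sequence per signal
    names, sizes = zip(*parsed_rows)
    print(names)  # ('a.txt', 'b.txt', 'c.txt')
    print(sizes)  # (1, 2, 3)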
datachain/query/dataset.py
CHANGED
@@ -262,9 +262,7 @@ class DatasetDiffOperation(Step):
         temp_tables.extend(self.dq.temp_table_names)

         # creating temp table that will hold subtract results
-        temp_table_name = self.catalog.warehouse.
-            6
-        )
+        temp_table_name = self.catalog.warehouse.temp_table_name()
         temp_tables.append(temp_table_name)

         columns = [
@@ -448,9 +446,6 @@ class UDFStep(Step, ABC):
         to select
         """

-    def udf_table_name(self) -> str:
-        return self.catalog.warehouse.UDF_TABLE_NAME_PREFIX + _random_string(6)
-
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
         use_partitioning = self.partition_by is not None
         batching = self.udf.properties.get_batching(use_partitioning)
@@ -574,9 +569,7 @@ class UDFStep(Step, ABC):
             list_partition_by = [self.partition_by]

         # create table with partitions
-        tbl = self.catalog.warehouse.create_udf_table(
-            self.udf_table_name(), partition_columns()
-        )
+        tbl = self.catalog.warehouse.create_udf_table(partition_columns())

         # fill table with partitions
         cols = [
@@ -638,37 +631,12 @@ class UDFSignal(UDFStep):
             for (col_name, col_type) in self.udf.output.items()
         ]

-        return self.catalog.warehouse.create_udf_table(
-            self.udf_table_name(), udf_output_columns
-        )
-
-    def create_pre_udf_table(self, query: Select) -> "Table":
-        columns = [
-            sqlalchemy.Column(c.name, c.type)
-            for c in query.selected_columns
-            if c.name != "sys__id"
-        ]
-        table = self.catalog.warehouse.create_udf_table(self.udf_table_name(), columns)
-        select_q = query.with_only_columns(
-            *[c for c in query.selected_columns if c.name != "sys__id"]
-        )
-
-        # if there is order by clause we need row_number to preserve order
-        # if there is no order by clause we still need row_number to generate
-        # unique ids as uniqueness is important for this table
-        select_q = select_q.add_columns(
-            f.row_number().over(order_by=select_q._order_by_clauses).label("sys__id")
-        )
-
-        self.catalog.warehouse.db.execute(
-            table.insert().from_select(list(select_q.selected_columns), select_q)
-        )
-        return table
+        return self.catalog.warehouse.create_udf_table(udf_output_columns)

     def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
         if os.getenv("DATACHAIN_DISABLE_QUERY_CACHE", "") not in ("", "0"):
             return query, []
-        table = self.create_pre_udf_table(query)
+        table = self.catalog.warehouse.create_pre_udf_table(query)
         q: Select = sqlalchemy.select(*table.c)
         if query._order_by_clauses:
             # we are adding ordering only if it's explicitly added by user in
@@ -732,7 +700,7 @@ class RowGenerator(UDFStep):
     def create_udf_table(self, query: Select) -> "Table":
         warehouse = self.catalog.warehouse

-        table_name = self.udf_table_name()
+        table_name = self.catalog.warehouse.udf_table_name()
         columns: tuple[Column, ...] = tuple(
             Column(name, typ) for name, typ in self.udf.output.items()
         )
@@ -1802,10 +1770,3 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:

     _send_result(dataset_query)
     return dataset_query
-
-
-def _random_string(length: int) -> str:
-    return "".join(
-        random.choice(string.ascii_letters + string.digits)  # noqa: S311
-        for i in range(length)
-    )
datachain/sql/types.py
CHANGED
@@ -12,6 +12,7 @@ for sqlite we can use `sqlite.register_converter`
 ( https://docs.python.org/3/library/sqlite3.html#sqlite3.register_converter )
 """

+import json
 from datetime import datetime
 from types import MappingProxyType
 from typing import Any, Union
@@ -247,7 +248,10 @@ class Array(SQLType):
         return type_defaults(dialect).array()

     def on_read_convert(self, value, dialect):
-
+        r = read_converter(dialect).array(value, self.item_type, dialect)
+        if isinstance(self.item_type, JSON):
+            r = [json.loads(item) if isinstance(item, str) else item for item in r]
+        return r


 class JSON(SQLType):
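Note: Array columns whose item type is JSON now get their elements decoded on read; any element still stored as a JSON string is parsed into a Python object. The decoding step in isolation, with made-up values:

    import json

    raw = ['{"label": "cat"}', {"label": "dog"}]   # mixed: one undecoded string, one dict
    decoded = [json.loads(item) if isinstance(item, str) else item for item in raw]
    print(decoded)  # [{'label': 'cat'}, {'label': 'dog'}]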
{datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/RECORD
CHANGED
@@ -35,38 +35,38 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
 datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=w0d_cZ2u9LpQYFFXll22mnxHaxPOoJdHlsKAZmONQpA,25605
+datachain/data_storage/warehouse.py,sha256=WGHWBuBmNmK-qHwhvMfAwtXZ-fQKwk8w1dadN_4dugA,33293
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=9C5AVH6tLo9hwzav-1tLLnmWP-3_SReYCOfcOC54pu0,4437
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
-datachain/lib/data_model.py,sha256=
+datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=alJwK7z5JoUmGc1Kj74dGtlH2MJ0jeSyS2dnInemnnA,56386
 datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
-datachain/lib/settings.py,sha256=
+datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=
-datachain/lib/convert/python_to_sql.py,sha256=
+datachain/lib/convert/flatten.py,sha256=YMoC00BqEy3zSpvCp6Q0DfxihuPmgjUJj1g2cesWGPs,1790
+datachain/lib/convert/python_to_sql.py,sha256=4gplGlr_Kg-Z40OpJUzJiarDWj7pwbUOk-dPOYYCJ9Q,2629
 datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=PJFVasYhCU0XvF7OrbxlAHLdm_PnhIQBp3TUDVHNHVY,60054
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -77,7 +77,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
 datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
-datachain/sql/types.py,sha256=
+datachain/sql/types.py,sha256=SShudhdIpdfTKDxWDDqOajYRkTCkIgQbilA94g4i-4E,10389
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
 datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.15.dist-info/METADATA,sha256=kKdEsDFle6KQ55q9RlWsAd6DUTgAg40A8L5YWE9fbMg,14577
+datachain-0.2.15.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.2.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.15.dist-info/RECORD,,
{datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/LICENSE
File without changes
{datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/entry_points.txt
File without changes
{datachain-0.2.14.dist-info → datachain-0.2.15.dist-info}/top_level.txt
File without changes