datachain 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +0 -81
- datachain/cli.py +0 -37
- datachain/lib/arrow.py +4 -4
- datachain/lib/clip.py +14 -3
- datachain/lib/convert/python_to_sql.py +9 -0
- datachain/lib/data_model.py +10 -1
- datachain/lib/dc.py +95 -30
- datachain/lib/hf.py +166 -0
- datachain/lib/image.py +9 -1
- datachain/lib/pytorch.py +1 -2
- datachain/lib/signal_schema.py +124 -20
- datachain/lib/text.py +4 -0
- datachain/lib/udf.py +14 -20
- datachain/query/dataset.py +10 -3
- datachain/query/session.py +5 -3
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/METADATA +8 -3
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/RECORD +21 -20
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/WHEEL +1 -1
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/LICENSE +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
 
-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-
-        return dst
-
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:

datachain/cli.py
CHANGED
@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )
 
-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
             new_name=args.new_name,
             labels=args.labels,
         )
-    elif args.command == "merge-datasets":
-        catalog.merge_datasets(
-            catalog.get_dataset(args.src),
-            catalog.get_dataset(args.dst),
-            args.src_version,
-            dst_version=args.dst_version,
-        )
     elif args.command == "ls":
         ls(
             args.sources,

datachain/lib/arrow.py
CHANGED
@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = None
     return output
 
 
-def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
 
@@ -122,11 +122,11 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
     if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
         return str
     if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
     if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
         return dict
    if isinstance(col_type, pa.lib.DictionaryType):
-        return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
     raise TypeError(f"{col_type!r} datatypes not supported")

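The private `_arrow_type_mapper` becomes the public `arrow_type_mapper`, so the new `datachain/lib/hf.py` module can reuse it. A minimal usage sketch, limited to the branches visible above (assumes pyarrow is installed):

```py
import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper

print(arrow_type_mapper(pa.string()))            # str, via the is_string branch
print(arrow_type_mapper(pa.list_(pa.string())))  # list[str], built recursively
```
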
datachain/lib/clip.py
CHANGED
@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@ def clip_similarity_scores(
         prob : Compute softmax probabilities.
         image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
             if only one of images or text provided.
+        device : Device to use. Defaults is None - use model's device.
 
 
     Example:
@@ -130,17 +132,26 @@ def clip_similarity_scores(
         ```
     """
 
+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
     with torch.no_grad():
         if images is not None:
             encoder = _get_encoder(model, "image")
             image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
             )
             image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
         if text is not None:
             encoder = _get_encoder(model, "text")
-            text_features = convert_text(
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
             text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
         if images is not None and text is not None:

datachain/lib/convert/python_to_sql.py
CHANGED
@@ -73,6 +73,9 @@ def python_to_sql(typ):  # noqa: PLR0911
     if len(args) == 2 and (type(None) in args):
         return python_to_sql(args[0])
 
+    if _is_union_str_literal(orig, args):
+        return String
+
     if _is_json_inside_union(orig, args):
         return JSON
 
@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
     if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
         return True
     return False
+
+
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)

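The `device` argument added above lets callers pin CLIP scoring to a specific device instead of relying on the model's own placement. A hedged usage sketch; the open_clip model choice and the stand-in image are illustrative and not part of this diff:

```py
import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores

# Illustrative CLIP setup; any model/preprocess/tokenizer trio accepted by
# clip_similarity_scores should work the same way.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

img = Image.new("RGB", (224, 224))  # stand-in image
scores = clip_similarity_scores(
    images=img,
    text=["a dog", "a cat"],
    model=model,
    preprocess=preprocess,
    tokenizer=tokenizer,
    device="cpu",  # new in 0.3.8; when omitted, the model's own device is used
)
print(scores)
```
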
datachain/lib/data_model.py
CHANGED
@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin
 
-from pydantic import BaseModel
+from pydantic import BaseModel, create_model
 
 from datachain.lib.model_store import ModelStore
 
@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])
 
     return False
+
+
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]

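`dict_to_data_model` is the former `DataChain._dict_to_data_model` promoted to a module-level helper, so it can also be used by `hf.py` and `dc.py` below. A small sketch of what it produces:

```py
from datachain.lib.data_model import dict_to_data_model

# Build a DataModel subclass with two required fields from a plain mapping.
Row = dict_to_data_model("Row", {"name": str, "score": float})
print(Row(name="example", score=0.9))
```
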
datachain/lib/dc.py
CHANGED
@@ -18,14 +18,13 @@ from typing import (
 
 import pandas as pd
 import sqlalchemy
-from pydantic import BaseModel
+from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
-from datachain import DataModel
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataType
+from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.file import File, IndexedFile, get_file
@@ -55,6 +54,8 @@ from datachain.utils import inside_notebook
 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec, Self
 
+    from datachain.lib.hf import HFDatasetType
+
     P = ParamSpec("P")
 
 C = Column
@@ -77,12 +78,12 @@ def resolve_columns(
     @wraps(method)
     def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
         resolved_args = self.signals_schema.resolve(
-            *[arg for arg in args if not isinstance(arg, GenericFunction)]
+            *[arg for arg in args if not isinstance(arg, GenericFunction)]  # type: ignore[arg-type]
         ).db_signals()
 
         for idx, arg in enumerate(args):
             if isinstance(arg, GenericFunction):
-                resolved_args.insert(idx, arg)
+                resolved_args.insert(idx, arg)  # type: ignore[arg-type]
 
         return method(self, *resolved_args, **kwargs)
 
@@ -208,23 +209,28 @@ class DataChain(DatasetQuery):
         "size": 0,
     }
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, settings: Optional[dict] = None, **kwargs):
         """This method needs to be redefined as a part of Dataset and DataChain
         decoupling.
         """
-        super().__init__(
+        super().__init__(  # type: ignore[misc]
             *args,
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
-
-
+        if settings:
+            self._settings = Settings(**settings)
+        else:
+            self._settings = Settings()
+        self._setup: dict = {}
 
         self.signals_schema = SignalSchema({"sys": Sys})
         if self.feature_schema:
             self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
         else:
-            self.signals_schema |= SignalSchema.from_column_types(
+            self.signals_schema |= SignalSchema.from_column_types(
+                self.column_types or {}
+            )
 
         self._sys = False
@@ -309,6 +315,7 @@ class DataChain(DatasetQuery):
         *,
         type: Literal["binary", "text", "image"] = "binary",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         recursive: Optional[bool] = True,
         object_name: str = "file",
@@ -336,6 +343,7 @@ class DataChain(DatasetQuery):
             cls(
                 path,
                 session=session,
+                settings=settings,
                 recursive=recursive,
                 update=update,
                 in_memory=in_memory,
@@ -489,6 +497,7 @@ class DataChain(DatasetQuery):
     def datasets(
         cls,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "dataset",
     ) -> "DataChain":
@@ -513,6 +522,7 @@ class DataChain(DatasetQuery):
 
         return cls.from_values(
             session=session,
+            settings=settings,
             in_memory=in_memory,
             output={object_name: DatasetInfo},
             **{object_name: datasets},  # type: ignore[arg-type]
@@ -895,7 +905,7 @@ class DataChain(DatasetQuery):
             if isinstance(value, Column):
                 # renaming existing column
                 for signal in schema.db_signals(name=value.name, as_columns=True):
-                    mutated[signal.name.replace(value.name, name, 1)] = signal
+                    mutated[signal.name.replace(value.name, name, 1)] = signal  # type: ignore[union-attr]
             else:
                 # adding new signal
                 mutated[name] = value
@@ -1086,7 +1096,7 @@ class DataChain(DatasetQuery):
         )
 
         signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns = signals_schema.resolve(*on).db_signals()
+        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
 
         right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
@@ -1105,7 +1115,9 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )
 
-            right_on_columns = right_signals_schema.resolve(
+            right_on_columns: list[str] = right_signals_schema.resolve(
+                *right_on
+            ).db_signals()  # type: ignore[assignment]
 
             if len(right_on_columns) != len(on_columns):
                 on_str = ", ".join(right_on_columns)
@@ -1169,7 +1181,7 @@ class DataChain(DatasetQuery):
                 "'on' cannot be empty",
             )
         else:
-            signals = self.signals_schema.resolve(*on).db_signals()
+            signals = self.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
         return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
@@ -1177,6 +1189,7 @@ class DataChain(DatasetQuery):
         cls,
         ds_name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         output: OutputType = None,
         object_name: str = "",
@@ -1195,10 +1208,13 @@ class DataChain(DatasetQuery):
             yield from tuples
 
         chain = DataChain.from_records(
-            DataChain.DEFAULT_FILE_RECORD,
+            DataChain.DEFAULT_FILE_RECORD,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
         )
         if object_name:
-            output = {object_name:
+            output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
         return chain.gen(_func_fr, output=output)
 
     @classmethod
@@ -1207,6 +1223,7 @@ class DataChain(DatasetQuery):
         df: "pd.DataFrame",
         name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "",
     ) -> "DataChain":
@@ -1236,7 +1253,12 @@ class DataChain(DatasetQuery):
         )
 
         return cls.from_values(
-            name,
+            name,
+            session,
+            settings=settings,
+            object_name=object_name,
+            in_memory=in_memory,
+            **fr_map,
         )
 
     def to_pandas(self, flatten=False) -> "pd.DataFrame":
@@ -1306,6 +1328,59 @@ class DataChain(DatasetQuery):
         if len(df) == limit:
             print(f"\n[Limited by {len(df)} rows]")
 
+    @classmethod
+    def from_hf(
+        cls,
+        dataset: Union[str, "HFDatasetType"],
+        *args,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
+        object_name: str = "",
+        model_name: str = "",
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from huggingface hub dataset.
+
+        Parameters:
+            dataset : Path or name of the dataset to read from Hugging Face Hub,
+                or an instance of `datasets.Dataset`-like object.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.
+            object_name : Generated object column name.
+            model_name : Generated model name.
+            kwargs : Parameters to pass to datasets.load_dataset.
+
+        Example:
+            Load from Hugging Face Hub:
+            ```py
+            DataChain.from_hf("beans", split="train")
+            ```
+
+            Generate chain from loaded dataset:
+            ```py
+            from datasets import load_dataset
+            ds = load_dataset("beans", split="train")
+            DataChain.from_hf(ds)
+            ```
+        """
+        from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
+
+        output: dict[str, DataType] = {}
+        ds_dict = stream_splits(dataset, *args, **kwargs)
+        if len(ds_dict) > 1:
+            output = {"split": str}
+
+        model_name = model_name or object_name or ""
+        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        model = dict_to_data_model(model_name, output)
+        if object_name:
+            output = {object_name: model}
+
+        chain = DataChain.from_values(
+            split=list(ds_dict.keys()), session=session, settings=settings
+        )
+        return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+
     def parse_tabular(
         self,
         output: OutputType = None,
@@ -1367,7 +1442,7 @@ class DataChain(DatasetQuery):
 
         if isinstance(output, dict):
             model_name = model_name or object_name or ""
-            model =
+            model = dict_to_data_model(model_name, output)
         else:
             model = output  # type: ignore[assignment]
 
@@ -1384,17 +1459,6 @@ class DataChain(DatasetQuery):
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )
 
-    @staticmethod
-    def _dict_to_data_model(
-        name: str, data_dict: dict[str, DataType]
-    ) -> type[BaseModel]:
-        fields = {name: (anno, ...) for name, anno in data_dict.items()}
-        return create_model(
-            name,
-            __base__=(DataModel,),  # type: ignore[call-overload]
-            **fields,
-        )  # type: ignore[call-overload]
-
     @classmethod
     def from_csv(
         cls,
@@ -1543,6 +1607,7 @@ class DataChain(DatasetQuery):
         cls,
         to_insert: Optional[Union[dict, list[dict]]],
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         schema: Optional[dict[str, DataType]] = None,
     ) -> "DataChain":
@@ -1597,7 +1662,7 @@ class DataChain(DatasetQuery):
         insert_q = dr.get_table().insert()
         for record in to_insert:
             db.execute(insert_q.values(**record))
-        return DataChain(name=dsr.name)
+        return DataChain(name=dsr.name, settings=settings)
 
     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""

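Most factory methods above gain a `settings` dict that is forwarded to the chain's `Settings`. A hedged sketch; the keys shown (`cache`, `parallel`) are assumed from `datachain/lib/settings.py`, and the bucket path is hypothetical:

```py
from datachain.lib.dc import DataChain

chain = DataChain.from_storage(
    "s3://example-bucket/images/",            # hypothetical source
    type="image",
    settings={"cache": True, "parallel": 4},  # applied via Settings(**settings)
)
```
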
datachain/lib/hf.py
ADDED
@@ -0,0 +1,166 @@
+try:
+    from datasets import (
+        Array2D,
+        Array3D,
+        Array4D,
+        Array5D,
+        Audio,
+        ClassLabel,
+        Dataset,
+        DatasetDict,
+        Image,
+        IterableDataset,
+        IterableDatasetDict,
+        Sequence,
+        Value,
+        load_dataset,
+    )
+    from datasets.features.features import string_to_arrow
+    from datasets.features.image import image_to_bytes
+
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for huggingface datasets:\n"
+        "To install run:\n\n"
+        "  pip install 'datachain[hf]'\n"
+    ) from exc
+
+from io import BytesIO
+from typing import TYPE_CHECKING, Any, Union
+
+import PIL
+from tqdm import tqdm
+
+from datachain.lib.arrow import arrow_type_mapper
+from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
+from datachain.lib.udf import Generator
+
+if TYPE_CHECKING:
+    from pydantic import BaseModel
+
+
+HFDatasetType = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]
+
+
+class HFClassLabel(DataModel):
+    string: str
+    integer: int
+
+    def read(self):
+        return self.integer
+
+
+class HFImage(DataModel):
+    img: bytes
+
+    def read(self):
+        return PIL.Image.open(BytesIO(self.img))
+
+
+class HFAudio(DataModel):
+    path: str
+    array: list[float]
+    sampling_rate: int
+
+
+class HFGenerator(Generator):
+    def __init__(
+        self,
+        ds: Union[str, HFDatasetType],
+        output_schema: type["BaseModel"],
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        self.ds = ds
+        self.output_schema = output_schema
+        self.args = args
+        self.kwargs = kwargs
+
+    def setup(self):
+        self.ds_dict = stream_splits(self.ds, *self.args, **self.kwargs)
+
+    def process(self, split: str = ""):
+        desc = "Parsed Hugging Face dataset"
+        ds = self.ds_dict[split]
+        if split:
+            desc += f" split '{split}'"
+        with tqdm(desc=desc, unit=" rows") as pbar:
+            for row in ds:
+                output_dict = {}
+                if split:
+                    output_dict["split"] = split
+                for name, feat in ds.features.items():
+                    anno = self.output_schema.model_fields[name].annotation
+                    output_dict[name] = _convert_feature(row[name], feat, anno)
+                yield self.output_schema(**output_dict)
+                pbar.update(1)
+
+
+def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
+    if isinstance(ds, str):
+        ds = load_dataset(ds, *args, streaming=True, **kwargs)
+    if isinstance(ds, (DatasetDict, IterableDatasetDict)):
+        return ds
+    return {"": ds}
+
+
+def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
+    if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
+        return val
+    if isinstance(feat, ClassLabel):
+        return HFClassLabel(string=feat.names[val], integer=val)
+    if isinstance(feat, Sequence):
+        if isinstance(feat.feature, dict):
+            sdict = {}
+            for sname in val:
+                sfeat = feat.feature[sname]
+                sanno = anno.model_fields[sname].annotation
+                sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
+            return anno(**sdict)
+        return val
+    if isinstance(feat, Image):
+        return HFImage(img=image_to_bytes(val))
+    if isinstance(feat, Audio):
+        return HFAudio(**val)
+
+
+def get_output_schema(
+    ds: Union[Dataset, IterableDataset], model_name: str = ""
+) -> dict[str, DataType]:
+    fields_dict = {}
+    for name, val in ds.features.items():
+        fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
+    return fields_dict  # type: ignore[return-value]
+
+
+def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
+    if isinstance(val, Value):
+        return arrow_type_mapper(val.pa_type)
+    if isinstance(val, ClassLabel):
+        return HFClassLabel
+    if isinstance(val, Sequence):
+        if isinstance(val.feature, dict):
+            sequence_dict = {}
+            for sname, sval in val.feature.items():
+                dtype = _feature_to_chain_type(sname, sval)
+                sequence_dict[sname] = list[dtype]  # type: ignore[valid-type]
+            return dict_to_data_model(name, sequence_dict)  # type: ignore[arg-type]
+        return list[_feature_to_chain_type(name, val.feature)]  # type: ignore[arg-type,misc,return-value]
+    if isinstance(val, Array2D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[dtype]]  # type: ignore[valid-type]
+    if isinstance(val, Array3D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[dtype]]]  # type: ignore[valid-type]
+    if isinstance(val, Array4D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[list[dtype]]]]  # type: ignore[valid-type]
+    if isinstance(val, Array5D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[list[list[dtype]]]]]  # type: ignore[valid-type]
+    if isinstance(val, Image):
+        return HFImage
+    if isinstance(val, Audio):
+        return HFAudio
+    raise TypeError(f"Unknown huggingface datasets type {type(val)}")

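This module plugs into `DataChain.from_hf` (added in `dc.py` above): splits are streamed and dataset features are mapped to `DataModel` fields (`ClassLabel` to `HFClassLabel`, `Image` to `HFImage`, `Audio` to `HFAudio`). A minimal sketch, assuming the `datachain[hf]` extra is installed:

```py
from datachain.lib.dc import DataChain

# Streams the "beans" dataset from the Hugging Face Hub and materializes it
# as a chain; rows are nested under the "row" object column.
chain = DataChain.from_hf("beans", split="train", object_name="row")
chain.show(3)
```
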
datachain/lib/image.py
CHANGED
@@ -10,6 +10,7 @@ def convert_image(
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[Image.Image, torch.Tensor]:
     """
     Resize, transform, and otherwise convert an image.
@@ -20,6 +21,7 @@ def convert_image(
         size (tuple[int, int]): Size in (width, height) pixels for resizing.
         transform (Callable): Torchvision transform or huggingface processor to apply.
         encoder (Callable): Encode image using model.
+        device (str or torch.device): Device to use.
     """
     if mode:
         img = img.convert(mode)
@@ -35,6 +37,8 @@ def convert_image(
             img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
         except ImportError:
             pass
+        if device:
+            img = img.to(device)  # type: ignore[attr-defined]
         if encoder:
             img = img.unsqueeze(0)  # type: ignore[attr-defined]
     if encoder:
@@ -48,6 +52,7 @@ def convert_images(
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[list[Image.Image], torch.Tensor]:
     """
     Resize, transform, and otherwise convert one or more images.
@@ -58,11 +63,14 @@ def convert_images(
         size (tuple[int, int]): Size in (width, height) pixels for resizing.
         transform (Callable): Torchvision transform or huggingface processor to apply.
         encoder (Callable): Encode image using model.
+        device (str or torch.device): Device to use.
     """
     if isinstance(images, Image.Image):
         images = [images]
 
-    converted = [
+    converted = [
+        convert_image(img, mode, size, transform, device=device) for img in images
+    ]
 
     if isinstance(converted[0], torch.Tensor):
         converted = torch.stack(converted)  # type: ignore[assignment,arg-type]

datachain/lib/pytorch.py
CHANGED
@@ -10,7 +10,6 @@ from torchvision.transforms import v2
 
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
-from datachain.lib.file import File
 from datachain.lib.text import convert_text
 
 if TYPE_CHECKING:
@@ -97,7 +96,7 @@ class PytorchDataset(IterableDataset):
         for row_features in ds.collect():
             row = []
             for fr in row_features:
-                if
+                if hasattr(fr, "read"):
                     row.append(fr.read())  # type: ignore[unreachable]
                 else:
                     row.append(fr)

datachain/lib/signal_schema.py
CHANGED
@@ -1,4 +1,5 @@
 import copy
+import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,6 +43,8 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
+    "Literal": Literal,
+    "Union": Union,
 }
 
 
@@ -49,6 +52,10 @@ class SignalSchemaError(DataChainParamsError):
     pass
 
 
+class SignalSchemaWarning(RuntimeWarning):
+    pass
+
+
 class SignalResolvingError(SignalSchemaError):
     def __init__(self, path: Optional[list[str]], msg: str):
         name = " '" + ".".join(path) + "'" if path else ""
@@ -69,6 +76,28 @@ class SignalResolvingTypeError(SignalResolvingError):
         )
 
 
+def create_feature_model(
+    name: str, fields: dict[str, Union[type, tuple[type, Any]]]
+) -> type[BaseModel]:
+    """
+    This gets or returns a dynamic feature model for use in restoring a model
+    from the custom_types stored within a serialized SignalSchema. This is useful
+    when using a custom feature model where the original definition is not available.
+    This happens in Studio and if a custom model is used in a dataset, then that dataset
+    is used in a DataChain in a separate script where that model is not declared.
+    """
+    name = name.replace("@", "_")
+    return create_model(
+        name,
+        __base__=DataModel,  # type: ignore[call-overload]
+        # These are tuples for each field of: annotation, default (if any)
+        **{
+            field_name: anno if isinstance(anno, tuple) else (anno, None)
+            for field_name, anno in fields.items()
+        },
+    )
+
+
 @dataclass
 class SignalSchema:
     values: dict[str, DataType]
@@ -117,40 +146,115 @@ class SignalSchema:
         )
         return SignalSchema(signals)
 
-
-
+    @staticmethod
+    def _get_name_original_type(fr_type: type) -> tuple[str, type]:
+        """Returns the name of and the original type for the given type,
+        based on whether the type is Optional or not."""
+        orig = get_origin(fr_type)
+        args = get_args(fr_type)
+        # Check if fr_type is Optional
+        if orig == Union and len(args) == 2 and (type(None) in args):
+            fr_type = args[0]
+            orig = get_origin(fr_type)
+        if orig in (Literal, LiteralEx):
+            # Literal has no __name__ in Python 3.9
+            type_name = "Literal"
+        elif orig == Union:
+            # Union also has no __name__ in Python 3.9
+            type_name = "Union"
+        else:
+            type_name = str(fr_type.__name__)  # type: ignore[union-attr]
+        return type_name, fr_type
+
+    @staticmethod
+    def serialize_custom_model_fields(
+        name: str, fr: type, custom_types: dict[str, Any]
+    ) -> str:
+        """This serializes any custom type information to the provided custom_types
+        dict, and returns the name of the type provided."""
+        if hasattr(fr, "__origin__") or not issubclass(fr, BaseModel):
+            # Don't store non-feature types.
+            return name
+        version_name = ModelStore.get_name(fr)
+        if version_name in custom_types:
+            # This type is already stored in custom_types.
+            return version_name
+        fields = {}
+        for field_name, info in fr.model_fields.items():
+            field_type = info.annotation
+            # All fields should be typed.
+            assert field_type
+            field_type_name, field_type = SignalSchema._get_name_original_type(
+                field_type
+            )
+            # Serialize this type to custom_types if it is a custom type as well.
+            fields[field_name] = SignalSchema.serialize_custom_model_fields(
+                field_type_name, field_type, custom_types
+            )
+        custom_types[version_name] = fields
+        return version_name
+
+    def serialize(self) -> dict[str, Any]:
+        signals: dict[str, Any] = {}
+        custom_types: dict[str, Any] = {}
         for name, fr_type in self.values.items():
             if (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 ModelStore.register(fr)
                 signals[name] = ModelStore.get_name(fr)
+                type_name, fr_type = SignalSchema._get_name_original_type(fr)
             else:
-
-
-
-
-
-                signals[name] = str(fr_type.__name__)  # type: ignore[union-attr]
+                type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
+                signals[name] = type_name
+            self.serialize_custom_model_fields(type_name, fr_type, custom_types)
+        if custom_types:
+            signals["_custom_types"] = custom_types
         return signals
 
     @staticmethod
-    def
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
+        """Convert a string-based type back into a python type."""
+        fr = NAMES_TO_TYPES.get(type_name)
+        if fr:
+            return fr  # type: ignore[return-value]
+
+        model_name, version = ModelStore.parse_name_version(type_name)
+        fr = ModelStore.get(model_name, version)
+        if fr:
+            return fr
+
+        if type_name in custom_types:
+            fields = custom_types[type_name]
+            fields = {
+                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
+                for field_name, field_type_str in fields.items()
+            }
+            return create_feature_model(type_name, fields)
+        return None
+
+    @staticmethod
+    def deserialize(schema: dict[str, Any]) -> "SignalSchema":
         if not isinstance(schema, dict):
             raise SignalSchemaError(f"cannot deserialize signal schema: {schema}")
 
         signals: dict[str, DataType] = {}
+        custom_types: dict[str, Any] = schema.get("_custom_types", {})
         for signal, type_name in schema.items():
+            if signal == "_custom_types":
+                # This entry is used as a lookup for custom types,
+                # and is not an actual field.
+                continue
             try:
-                fr =
-                if
-
-
-
-
-
-
-
-
+                fr = SignalSchema._resolve_type(type_name, custom_types)
+                if fr is None:
+                    # Skip if the type is not found, so all data can be displayed.
+                    warnings.warn(
+                        f"In signal '{signal}': "
+                        f"unknown type '{type_name}'."
+                        f" Try to add it with `ModelStore.register({type_name})`.",
+                        SignalSchemaWarning,
+                        stacklevel=2,
+                    )
+                    continue
             except TypeError as err:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"

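The serialized schema now carries a `_custom_types` table so custom models can be rebuilt (via `create_feature_model`) even when their original class is not importable. A hedged round-trip sketch; the exact serialized names and version suffixes are indicative only:

```py
from datachain.lib.data_model import DataModel
from datachain.lib.signal_schema import SignalSchema


class Review(DataModel):
    text: str
    stars: int


schema = SignalSchema({"review": Review})
serialized = schema.serialize()
# Roughly: {"review": "Review@v1",
#           "_custom_types": {"Review@v1": {"text": "str", "stars": "int"}}}
print(serialized)

# Types are resolved from NAMES_TO_TYPES, the ModelStore, or _custom_types;
# unknown types only raise a SignalSchemaWarning and are skipped.
restored = SignalSchema.deserialize(serialized)
```
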
datachain/lib/text.py
CHANGED
@@ -9,6 +9,7 @@ def convert_text(
     tokenizer: Optional[Callable] = None,
     tokenizer_kwargs: Optional[dict[str, Any]] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[str, list[str], torch.Tensor]:
     """
     Tokenize and otherwise transform text.
@@ -18,6 +19,7 @@ def convert_text(
         tokenizer (Callable): Tokenizer to use to tokenize objects.
         tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
         encoder (Callable): Encode text using model.
+        device (str or torch.device): Device to use.
     """
     if not tokenizer:
         return text
@@ -32,6 +34,8 @@ def convert_text(
 
     tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
     tokens = torch.tensor(tokens)
+    if device:
+        tokens = tokens.to(device)
 
     if not encoder:
         return tokens

datachain/lib/udf.py
CHANGED
@@ -242,26 +242,8 @@ class UDFBase(AbstractUDF):
         if not self.is_output_batched:
             result_objs = [result_objs]
 
-
-
-            for tuple_ in result_objs:
-                flat = []
-                for obj in tuple_:
-                    if isinstance(obj, BaseModel):
-                        flat.extend(flatten(obj))
-                    else:
-                        flat.append(obj)
-                res.append(tuple(flat))
-        else:
-            # Generator expression is required, otherwise the value will be materialized
-            res = (
-                flatten(obj)
-                if isinstance(obj, BaseModel)
-                else obj
-                if isinstance(obj, tuple)
-                else (obj,)
-                for obj in result_objs
-            )
+        # Generator expression is required, otherwise the value will be materialized
+        res = (self._flatten_row(row) for row in result_objs)
 
         if not self.is_output_batched:
             res = list(res)
@@ -282,6 +264,18 @@ class UDFBase(AbstractUDF):
 
         return res
 
+    def _flatten_row(self, row):
+        if len(self.output.values) > 1 and not isinstance(row, BaseModel):
+            flat = []
+            for obj in row:
+                flat.extend(self._obj_to_list(obj))
+            return tuple(flat)
+        return row if isinstance(row, tuple) else tuple(self._obj_to_list(row))
+
+    @staticmethod
+    def _obj_to_list(obj):
+        return flatten(obj) if isinstance(obj, BaseModel) else [obj]
+
     def _parse_rows(self, rows, cache, download_cb):
         objs = []
         for row in rows:

datachain/query/dataset.py
CHANGED
@@ -24,6 +24,7 @@ from typing import (
 )
 
 import attrs
+import psutil
 import sqlalchemy
 import sqlalchemy as sa
 from attrs import frozen
@@ -383,7 +384,7 @@ def process_udf_outputs(
     udf_table: "Table",
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: UDFBase,
-    batch_size=INSERT_BATCH_SIZE,
+    batch_size: int = INSERT_BATCH_SIZE,
     cb: Callback = DEFAULT_CALLBACK,
 ) -> None:
     rows: list[UDFResult] = []
@@ -396,7 +397,9 @@
         for row in udf_output:
             cb.relative_update()
             rows.append(adjust_outputs(warehouse, row, udf_col_types))
-            if len(rows) >= batch_size
+            if len(rows) >= batch_size or (
+                len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
+            ):
                 for row_chunk in batched(rows, batch_size):
                     warehouse.insert_rows(udf_table, row_chunk)
                 rows.clear()
@@ -1775,6 +1778,10 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
     save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")
 
+    is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
+        dataset_query.session.get_temp_prefix()
+    )
+
     if save_as:
         if dataset_query.attached:
             dataset_name = dataset_query.name
@@ -1801,7 +1808,7 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
             )
         else:
             dataset_query = dataset_query.save(save_as)
-    elif save and not dataset_query.attached:
+    elif save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
 

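The UDF output loop now flushes early under memory pressure, not only when the batch is full. A standalone sketch of the same condition (the batch-size constant is illustrative, not the value defined in datachain):

```py
import psutil

INSERT_BATCH_SIZE = 10_000  # illustrative; the real constant is defined in datachain


def should_flush(n_buffered: int, batch_size: int = INSERT_BATCH_SIZE) -> bool:
    """Flush when the buffer is full, or every 10 rows once RAM usage exceeds 80%."""
    return n_buffered >= batch_size or (
        n_buffered % 10 == 0 and psutil.virtual_memory().percent > 80
    )
```
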
datachain/query/session.py
CHANGED
@@ -74,11 +74,13 @@ class Session:
         self.catalog.id_generator.close_on_exit()
 
     def generate_temp_dataset_name(self) -> str:
-
-
+        return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
+
+    def get_temp_prefix(self) -> str:
+        return f"{self.DATASET_PREFIX}{self.name}_"
 
     def _cleanup_temp_datasets(self) -> None:
-        prefix =
+        prefix = self.get_temp_prefix()
         try:
             for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
                 self.catalog.remove_dataset(dataset.name, force=True)

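Factoring the prefix into `get_temp_prefix()` is what lets `query_wrapper` (see `query/dataset.py` above) detect session temp datasets by name. A sketch of the naming scheme, with assumed values for the class constants and session name:

```py
from uuid import uuid4

DATASET_PREFIX = "session_"  # assumed value of Session.DATASET_PREFIX
TEMP_TABLE_UUID_LEN = 6      # assumed value of Session.TEMP_TABLE_UUID_LEN
name = "my_session"          # hypothetical session name

prefix = f"{DATASET_PREFIX}{name}_"                      # get_temp_prefix()
temp_name = prefix + uuid4().hex[:TEMP_TABLE_UUID_LEN]   # generate_temp_dataset_name()
print(temp_name)  # e.g. session_my_session_1a2b3c
```
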
{datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.7
+Version: 0.3.8
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -41,10 +41,11 @@ Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
 Requires-Dist: Pillow <11,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
+Requires-Dist: psutil
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.11.
+Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -64,11 +65,14 @@ Requires-Dist: accelerate ; extra == 'examples'
 Requires-Dist: unstructured[pdf] ; extra == 'examples'
 Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
+Provides-Extra: hf
+Requires-Dist: numba >=0.60.0 ; extra == 'hf'
+Requires-Dist: datasets[audio,vision] ; extra == 'hf'
 Provides-Extra: remote
 Requires-Dist: lz4 ; extra == 'remote'
 Requires-Dist: requests >=2.22.0 ; extra == 'remote'
 Provides-Extra: tests
-Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
+Requires-Dist: datachain[hf,remote,torch,vector] ; extra == 'tests'
 Requires-Dist: pytest <9,>=8 ; extra == 'tests'
 Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
@@ -83,6 +87,7 @@ Requires-Dist: hypothesis ; extra == 'tests'
 Requires-Dist: open-clip-torch ; extra == 'tests'
 Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
 Requires-Dist: requests-mock ; extra == 'tests'
+Requires-Dist: scipy ; extra == 'tests'
 Provides-Extra: torch
 Requires-Dist: torch >=2.1.0 ; extra == 'torch'
 Requires-Dist: torchvision ; extra == 'torch'

{datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
 datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=otR2eN0JL-JhZ9SOTPcPwt_-_TiT-vHifx2h4YzD6Tg,32052
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=6S4AnDos4sGYGhy4wNSyV2pKPQNXvo819cd3Dl8Htgg,78271
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -38,21 +38,22 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
 datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
 datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
-datachain/lib/clip.py,sha256=
-datachain/lib/data_model.py,sha256=
+datachain/lib/arrow.py,sha256=W8bIxMIe_b3dqMFYKGWmfbC_7Xe0gV3UiJjQ2i4EYLA,4925
+datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
+datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=wdMzFLglOhwWKHwh4qcLA0ezMrjuRJq2il2WnkHjyag,62490
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
-datachain/lib/
+datachain/lib/hf.py,sha256=mYaHFPS4CW2-stRZHBMWW-NKN4dhrnhjZobBgRocnvo,5317
+datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
 datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
 datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
-datachain/lib/pytorch.py,sha256=
+datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
-datachain/lib/text.py,sha256=
-datachain/lib/udf.py,sha256=
+datachain/lib/signal_schema.py,sha256=rW1R6nIzdtmqWzpXk7aNAfrQD58_gbvkvEGyNTQ4WNM,20099
+datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
+datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,20 +61,20 @@ datachain/lib/webdataset.py,sha256=SsjCKLSKEkHRRfeTHQhjoGqNPqIWw_SCWQcUwgUWWP0,8
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
-datachain/lib/convert/python_to_sql.py,sha256=
+datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
 datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=G6xA3ItIGUJTXhizdAb6S3L1zFwTf8I0w0jHa1A6F4A,61103
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
-datachain/query/session.py,sha256=
+datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
 datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -94,9 +95,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.8.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.8.dist-info/METADATA,sha256=ivteXQrJgp8dKgIO2pdwUj6Qdg96rbI3Gq0kx5fyxtk,16903
+datachain-0.3.8.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
+datachain-0.3.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.8.dist-info/RECORD,,

{datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/LICENSE
File without changes
{datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/entry_points.txt
File without changes
{datachain-0.3.7.dist-info → datachain-0.3.8.dist-info}/top_level.txt
File without changes