datachain 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +18 -29
- datachain/client/fsspec.py +9 -8
- datachain/data_storage/sqlite.py +19 -0
- datachain/data_storage/warehouse.py +19 -3
- datachain/dataset.py +1 -1
- datachain/lib/arrow.py +51 -16
- datachain/lib/dc.py +7 -2
- datachain/lib/file.py +76 -2
- datachain/lib/hf.py +23 -6
- datachain/lib/listing.py +7 -5
- datachain/lib/listing_info.py +2 -2
- datachain/lib/signal_schema.py +11 -2
- datachain/lib/tar.py +33 -0
- datachain/lib/webdataset.py +3 -59
- datachain/query/dataset.py +40 -25
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/METADATA +1 -1
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/RECORD +21 -20
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/WHEEL +1 -1
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/LICENSE +0 -0
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -621,10 +621,6 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast

-    def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
-        config = config or self.client_config
-        return Client.parse_url(uri, self.cache, **config)
-
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
@@ -651,17 +647,16 @@ class Catalog:
         partial_path: Optional[str]

         client_config = client_config or self.client_config
-
+        uri, path = Client.parse_url(source)
+        client = Client.get_client(source, self.cache, **client_config)
         stem = os.path.basename(os.path.normpath(path))
         prefix = (
             posixpath.dirname(path)
             if glob.has_magic(stem) or client.fs.isfile(source)
             else path
         )
-        storage_dataset_name = Storage.dataset_name(
-
-        )
-        source_metastore = self.metastore.clone(client.uri)
+        storage_dataset_name = Storage.dataset_name(uri, posixpath.join(prefix, ""))
+        source_metastore = self.metastore.clone(uri)

         columns = [
             Column("path", String),
@@ -675,15 +670,13 @@ class Catalog:
         ]

         if skip_indexing:
-            source_metastore.create_storage_if_not_registered(
-            storage = source_metastore.get_storage(
-            source_metastore.init_partial_id(
-            partial_id = source_metastore.get_next_partial_id(
+            source_metastore.create_storage_if_not_registered(uri)
+            storage = source_metastore.get_storage(uri)
+            source_metastore.init_partial_id(uri)
+            partial_id = source_metastore.get_next_partial_id(uri)

-            source_metastore = self.metastore.clone(
-
-            )
-            source_metastore.init(client.uri)
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
+            source_metastore.init(uri)

             source_warehouse = self.warehouse.clone()
             dataset = self.create_dataset(
@@ -701,20 +694,16 @@ class Catalog:
             in_progress,
             partial_id,
             partial_path,
-        ) = source_metastore.register_storage_for_indexing(
-            client.uri, force_update, prefix
-        )
+        ) = source_metastore.register_storage_for_indexing(uri, force_update, prefix)
         if in_progress:
             raise PendingIndexingError(f"Pending indexing operation: uri={storage.uri}")

         if not need_index:
             assert partial_id is not None
             assert partial_path is not None
-            source_metastore = self.metastore.clone(
-                uri=client.uri, partial_id=partial_id
-            )
+            source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)
             source_warehouse = self.warehouse.clone()
-            dataset = self.get_dataset(Storage.dataset_name(
+            dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
             lst = Listing(storage, source_metastore, source_warehouse, client, dataset)
             logger.debug(
                 "Using cached listing %s. Valid till: %s",
@@ -731,11 +720,11 @@ class Catalog:

             return lst, path

-        source_metastore.init_partial_id(
-        partial_id = source_metastore.get_next_partial_id(
+        source_metastore.init_partial_id(uri)
+        partial_id = source_metastore.get_next_partial_id(uri)

-        source_metastore.init(
-        source_metastore = self.metastore.clone(uri=
+        source_metastore.init(uri)
+        source_metastore = self.metastore.clone(uri=uri, partial_id=partial_id)

         source_warehouse = self.warehouse.clone()

@@ -1370,7 +1359,7 @@ class Catalog:

    def signed_url(self, source: str, path: str, client_config=None) -> str:
        client_config = client_config or self.client_config
-        client
+        client = Client.get_client(source, self.cache, **client_config)
        return client.url(path)

    def export_dataset_table(

datachain/client/fsspec.py
CHANGED
@@ -116,15 +116,16 @@ class Client(ABC):
         return DATA_SOURCE_URI_PATTERN.match(name) is not None

     @staticmethod
-    def parse_url(
-        source
-
-
-
+    def parse_url(source: str) -> tuple[StorageURI, str]:
+        cls = Client.get_implementation(source)
+        storage_name, rel_path = cls.split_url(source)
+        return cls.get_uri(storage_name), rel_path
+
+    @staticmethod
+    def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url,
-
-        return client, rel_path
+        storage_url, _ = cls.split_url(source)
+        return cls.from_name(storage_url, cache, kwargs)

     @classmethod
     def create_fs(cls, **kwargs) -> "AbstractFileSystem":

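For context, a rough sketch of the new split calling pattern (the bucket name, object path, and the `catalog.cache` handle below are illustrative placeholders, not taken from this diff):

    from datachain.client import Client

    # Parse only: returns (StorageURI, relative path) without constructing a client.
    uri, rel_path = Client.parse_url("s3://example-bucket/images/cat.jpg")

    # Build the client separately; a DataChain cache instance is now an explicit argument.
    client = Client.get_client("s3://example-bucket/images/cat.jpg", catalog.cache)
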
datachain/data_storage/sqlite.py
CHANGED
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
+    from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine

     from datachain.lib.file import File
@@ -788,6 +790,23 @@ class SQLiteWarehouse(AbstractWarehouse):
             if progress_cb:
                 progress_cb(len(batch_ids))

+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+        return sqlalchemy.join(
+            left,
+            right,
+            onclause,
+            isouter=not inner,
+        )
+
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.

datachain/data_storage/warehouse.py
CHANGED
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like

 if TYPE_CHECKING:
-    from sqlalchemy.sql._typing import
-
+    from sqlalchemy.sql._typing import (
+        _ColumnsClauseArgument,
+        _FromClauseArgument,
+        _OnClauseArgument,
+    )
+    from sqlalchemy.sql.selectable import Join, Select
     from sqlalchemy.types import TypeEngine

     from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
         Copy the results of a query into a table.
         """

+    @abstractmethod
+    def join(
+        self,
+        left: "_FromClauseArgument",
+        right: "_FromClauseArgument",
+        onclause: "_OnClauseArgument",
+        inner: bool = True,
+    ) -> "Join":
+        """
+        Join two tables together.
+        """
+
     @abstractmethod
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -922,7 +938,7 @@ class AbstractWarehouse(ABC, Serializable):
         are cleaned up as soon as they are no longer needed.
         """
         with tqdm(desc="Cleanup", unit=" tables") as pbar:
-            for name in names:
+            for name in set(names):
                 self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
                 pbar.update(1)

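The new `join` hook maps the `inner` flag onto SQLAlchemy's `isouter` argument, so `inner=False` produces a LEFT OUTER JOIN. A self-contained SQLAlchemy Core sketch of that mapping (table and column names here are made up for illustration):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    left = sa.Table("left_t", metadata, sa.Column("id", sa.Integer))
    right = sa.Table("right_t", metadata, sa.Column("id", sa.Integer))

    on = left.c.id == right.c.id
    inner_join = sa.join(left, right, on, isouter=False)  # inner=True
    outer_join = sa.join(left, right, on, isouter=True)   # inner=False -> LEFT OUTER JOIN
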
datachain/dataset.py
CHANGED
@@ -112,7 +112,7 @@ class DatasetDependency:

         if is_listing_dataset(dataset_name):
             dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name = listing_uri_from_name(dataset_name)
+            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))

         return cls(
             id,

datachain/lib/arrow.py
CHANGED
@@ -13,8 +13,10 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    from datasets.features.features import Features
     from pydantic import BaseModel

+    from datachain.lib.data_model import DataType
     from datachain.lib.dc import DataChain


@@ -46,7 +48,10 @@ class ArrowGenerator(Generator):
         self.kwargs = kwargs

     def process(self, file: File):
-        if
+        if file._caching_enabled:
+            path = file.get_local_path(download=True)
+            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+        elif self.nrows:
             path = _nrows_file(file, self.nrows)
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         else:
@@ -54,6 +59,7 @@ class ArrowGenerator(Generator):
             ds = dataset(
                 path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
             )
+        hf_schema = _get_hf_schema(ds.schema)
         index = 0
         with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
             for record_batch in ds.to_batches():
@@ -62,9 +68,17 @@ class ArrowGenerator(Generator):
                 if self.output_schema:
                     fields = self.output_schema.model_fields
                     vals_dict = {}
-                    for (field, field_info), val in
-
-
+                    for i, ((field, field_info), val) in enumerate(
+                        zip(fields.items(), vals)
+                    ):
+                        anno = field_info.annotation
+                        if hf_schema:
+                            from datachain.lib.hf import convert_feature
+
+                            feat = list(hf_schema[0].values())[i]
+                            vals_dict[field] = convert_feature(val, feat, anno)
+                        elif ModelStore.is_pydantic(anno):
+                            vals_dict[field] = anno(**val)  # type: ignore[misc]
                         else:
                             vals_dict[field] = val
                     vals = [self.output_schema(**vals_dict)]
@@ -91,26 +105,36 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
             "Error generating output from Arrow schema - "
             f"Schema has {len(schema)} columns but got {len(col_names)} column names."
         )
-
+    if not col_names:
+        col_names = schema.names
+    columns = _convert_col_names(col_names)  # type: ignore[arg-type]
+    hf_schema = _get_hf_schema(schema)
+    if hf_schema:
+        return {
+            column: hf_type for hf_type, column in zip(hf_schema[1].values(), columns)
+        }
     output = {}
-    for
-        if col_names:
-            column = col_names[i]
-        else:
-            column = field.name
-        column = column.lower()
-        column = re.sub("[^0-9a-z_]+", "", column)
-        if not column:
-            column = f"c{default_column}"
-            default_column += 1
+    for field, column in zip(schema, columns):
         dtype = arrow_type_mapper(field.type, column)  # type: ignore[assignment]
         if field.nullable and not ModelStore.is_pydantic(dtype):
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
-
     return output


+def _convert_col_names(col_names: Sequence[str]) -> list[str]:
+    default_column = 0
+    converted_col_names = []
+    for column in col_names:
+        column = column.lower()
+        column = re.sub("[^0-9a-z_]+", "", column)
+        if not column:
+            column = f"c{default_column}"
+            default_column += 1
+        converted_col_names.append(column)
+    return converted_col_names
+
+
 def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa: PLR0911
     """Convert pyarrow types to basic types."""
     from datetime import datetime
@@ -156,3 +180,14 @@ def _nrows_file(file: File, nrows: int) -> str:
         writer.write(line)
         writer.write("\n")
     return tf.name
+
+
+def _get_hf_schema(
+    schema: "pa.Schema",
+) -> Optional[tuple["Features", dict[str, "DataType"]]]:
+    if schema.metadata and b"huggingface" in schema.metadata:
+        from datachain.lib.hf import get_output_schema, schema_from_arrow
+
+        features = schema_from_arrow(schema)
+        return features, get_output_schema(features)
+    return None

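The new `_get_hf_schema` helper relies on the `datasets` library embedding its feature definitions in Arrow schema metadata under the `b"huggingface"` key. A small pyarrow sketch of that detection (the metadata payload below is a hypothetical stand-in, not a real Hugging Face export):

    import pyarrow as pa

    schema = pa.schema([pa.field("text", pa.string())]).with_metadata(
        {b"huggingface": b'{"info": {"features": {"text": {"dtype": "string"}}}}'}
    )
    # Same check the helper performs before importing datachain.lib.hf.
    print(schema.metadata and b"huggingface" in schema.metadata)
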
datachain/lib/dc.py
CHANGED
@@ -408,7 +408,11 @@ class DataChain(DatasetQuery):
                 in_memory=in_memory,
             )
             .gen(
-                list_bucket(
+                list_bucket(
+                    list_uri,
+                    session.catalog.cache,
+                    client_config=session.catalog.client_config,
+                ),
                 output={f"{object_name}": File},
             )
             .save(list_dataset_name, listing=True)
@@ -1523,7 +1527,8 @@ class DataChain(DatasetQuery):
         output = {"split": str}

         model_name = model_name or object_name or ""
-
+        hf_features = next(iter(ds_dict.values())).features
+        output = output | get_output_schema(hf_features, model_name)
         model = dict_to_data_model(model_name, output)
         if object_name:
             output = {object_name: model}

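The second hunk feeds the features of the first split into `get_output_schema()` so the chain's columns mirror the Hugging Face feature types. Assuming this code sits inside the `from_hf` constructor (which the surrounding variables suggest but the diff does not show), usage would look roughly like:

    from datachain.lib.dc import DataChain

    # Illustrative only: dataset name and split are placeholders.
    chain = DataChain.from_hf("beans", split="train")
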
datachain/lib/file.py
CHANGED
@@ -1,5 +1,6 @@
 import io
 import json
+import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
@@ -15,6 +16,9 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from PIL import Image
 from pydantic import Field, field_validator

+if TYPE_CHECKING:
+    from typing_extensions import Self
+
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
@@ -25,6 +29,8 @@ from datachain.utils import TIME_ZERO
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

+logger = logging.getLogger("datachain")
+
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

@@ -251,14 +257,18 @@ class File(DataModel):
         dump = self.model_dump()
         return UniqueId(*(dump[k] for k in self._unique_id_keys))

-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self, download: bool = False) -> Optional[str]:
         """Returns path to a file in a local cache.
         Return None if file is not cached. Throws an exception if cache is not setup."""
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-
+        uid = self.get_uid()
+        if download:
+            client = self._catalog.get_client(self.source)
+            client.download(uid, callback=self._download_cb)
+        return self._catalog.cache.get_path(uid)

     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -313,6 +323,70 @@ class File(DataModel):
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs

+    def resolve(self) -> "Self":
+        """
+        Resolve a File object by checking its existence and updating its metadata.
+
+        Returns:
+            File: The resolved File object with updated metadata.
+        """
+        if self._catalog is None:
+            raise RuntimeError("Cannot resolve file: catalog is not set")
+
+        try:
+            client = self._catalog.get_client(self.source)
+        except NotImplementedError as e:
+            raise RuntimeError(
+                f"Unsupported protocol for file source: {self.source}"
+            ) from e
+
+        try:
+            info = client.fs.info(client.get_full_path(self.path))
+            converted_info = client.info_to_file(info, self.source)
+            return type(self)(
+                path=self.path,
+                source=self.source,
+                size=converted_info.size,
+                etag=converted_info.etag,
+                version=converted_info.version,
+                is_latest=converted_info.is_latest,
+                last_modified=converted_info.last_modified,
+                location=self.location,
+            )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logger.warning("File system error when resolving %s: %s", self.path, str(e))
+
+        return type(self)(
+            path=self.path,
+            source=self.source,
+            size=0,
+            etag="",
+            version="",
+            is_latest=True,
+            last_modified=TIME_ZERO,
+            location=self.location,
+        )
+
+
+def resolve(file: File) -> File:
+    """
+    Resolve a File object by checking its existence and updating its metadata.
+
+    This function is a wrapper around the File.resolve() method, designed to be
+    used as a mapper in DataChain operations.
+
+    Args:
+        file (File): The File object to resolve.
+
+    Returns:
+        File: The resolved File object with updated metadata.
+
+    Raises:
+        RuntimeError: If the file's catalog is not set or if
+            the file source protocol is unsupported.
+    """
+    return file.resolve()
+

 class TextFile(File):
     """`DataModel` for reading text files."""

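The module-level `resolve()` wrapper is meant to be dropped straight into a chain as a mapper. A rough usage sketch, assuming the usual `from_storage`/`map` entry points (the storage URI is a placeholder):

    from datachain.lib.dc import DataChain
    from datachain.lib.file import resolve

    chain = (
        DataChain.from_storage("s3://example-bucket/data/")
        .map(file=resolve)  # re-checks each File against storage; missing objects
    )                       # come back with zeroed size/etag per the new fallback path
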
datachain/lib/hf.py
CHANGED
@@ -15,7 +15,7 @@ try:
         Value,
         load_dataset,
     )
-    from datasets.features.features import string_to_arrow
+    from datasets.features.features import Features, string_to_arrow
     from datasets.features.image import image_to_bytes

 except ImportError as exc:
@@ -36,6 +36,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    import pyarrow as pa
     from pydantic import BaseModel


@@ -71,6 +72,15 @@ class HFGenerator(Generator):
         *args,
         **kwargs,
     ):
+        """
+        Generator for chain from huggingface datasets.
+
+        Parameters:
+
+        ds : Path or name of the dataset to read from Hugging Face Hub,
+            or an instance of `datasets.Dataset`-like object.
+        output_schema : Pydantic model for validation.
+        """
         super().__init__()
         self.ds = ds
         self.output_schema = output_schema
@@ -92,7 +102,7 @@ class HFGenerator(Generator):
                 output_dict["split"] = split
                 for name, feat in ds.features.items():
                     anno = self.output_schema.model_fields[name].annotation
-                    output_dict[name] =
+                    output_dict[name] = convert_feature(row[name], feat, anno)
                 yield self.output_schema(**output_dict)
                 pbar.update(1)

@@ -106,7 +116,7 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
     return {"": ds}


-def
+def convert_feature(val: Any, feat: Any, anno: Any) -> Any:  # noqa: PLR0911
     if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
         return val
     if isinstance(feat, ClassLabel):
@@ -117,20 +127,23 @@ def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
         for sname in val:
             sfeat = feat.feature[sname]
             sanno = anno.model_fields[sname].annotation
-            sdict[sname] = [
+            sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
         return anno(**sdict)
     return val
     if isinstance(feat, Image):
+        if isinstance(val, dict):
+            return HFImage(img=val["bytes"])
         return HFImage(img=image_to_bytes(val))
     if isinstance(feat, Audio):
         return HFAudio(**val)


 def get_output_schema(
-
+    features: Features, model_name: str = "", stream: bool = True
 ) -> dict[str, DataType]:
+    """Generate UDF output schema from huggingface datasets features."""
     fields_dict = {}
-    for name, val in
+    for name, val in features.items():
         fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
     return fields_dict  # type: ignore[return-value]

@@ -165,3 +178,7 @@ def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
     if isinstance(val, Audio):
         return HFAudio
     raise TypeError(f"Unknown huggingface datasets type {type(val)}")
+
+
+def schema_from_arrow(schema: "pa.Schema"):
+    return Features.from_arrow_schema(schema)

datachain/lib/listing.py
CHANGED
@@ -20,7 +20,7 @@ LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name


-def list_bucket(uri: str, client_config=None) -> Callable:
+def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
     from bucket where each File represents one bucket entry.
@@ -28,7 +28,8 @@ def list_bucket(uri: str, client_config=None) -> Callable:

     def list_func() -> Iterator[File]:
         config = client_config or {}
-        client
+        client = Client.get_client(uri, cache, **config)  # type: ignore[arg-type]
+        _, path = Client.parse_url(uri)
         for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
             yield from entries

@@ -76,16 +77,17 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
-    client
+    client = Client.get_client(uri, cache, **client_config)
+    storage_uri, path = Client.parse_url(uri)

     # clean path without globs
     lst_uri_path = (
         posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
     )

-    lst_uri = f"{
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
-        f"{LISTING_PREFIX}{
+        f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )

     return ds_name, lst_uri, path

datachain/lib/listing_info.py
CHANGED
@@ -13,8 +13,8 @@ class ListingInfo(DatasetInfo):

     @property
     def storage_uri(self) -> str:
-
-        return
+        uri, _ = Client.parse_url(self.uri)
+        return uri

     @property
     def expires(self) -> Optional[datetime]:

datachain/lib/signal_schema.py
CHANGED
@@ -386,11 +386,20 @@ class SignalSchema:
             else:
                 json, pos = unflatten_to_json_pos(fr, row, pos)  # type: ignore[union-attr]
                 obj = fr(**json)
-
-                    obj._set_stream(catalog, caching_enabled=cache)
+                SignalSchema._set_file_stream(obj, catalog, cache)
                 res.append(obj)
         return res

+    @staticmethod
+    def _set_file_stream(
+        obj: BaseModel, catalog: "Catalog", cache: bool = False
+    ) -> None:
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache)
+        for field, finfo in obj.model_fields.items():
+            if ModelStore.is_pydantic(finfo.annotation):
+                SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
+
     def db_signals(
         self, name: Optional[str] = None, as_columns=False
     ) -> Union[list[str], list[Column]]:

datachain/lib/tar.py
ADDED
@@ -0,0 +1,33 @@
+import hashlib
+import tarfile
+from collections.abc import Iterator
+
+from datachain.lib.file import File, TarVFile
+
+
+def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
+    etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
+    etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
+    return File(
+        source=parent.source,
+        path=f"{new_parent}/{info.name}",
+        version=parent.version,
+        size=info.size,
+        etag=etag,
+        location=[
+            {
+                "vtype": TarVFile.get_vtype(),
+                "parent": parent.model_dump_custom(),
+                "size": info.size,
+                "offset": info.offset_data,
+            }
+        ],
+    )
+
+
+def process_tar(file: File) -> Iterator[File]:
+    with file.open() as fd:
+        with tarfile.open(fileobj=fd) as tar:
+            for entry in tar.getmembers():
+                yield build_tar_member(file, entry)

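`process_tar` is a generator UDF: one input `File` (a tar archive) yields one `File` per member, with each member's `location` pointing back into the parent archive via `TarVFile`. A hedged usage sketch (the bucket path is a placeholder):

    from datachain.lib.dc import DataChain
    from datachain.lib.tar import process_tar

    chain = (
        DataChain.from_storage("s3://example-bucket/shards/")
        .gen(file=process_tar)  # expands every archive into per-member File rows
    )
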
datachain/lib/webdataset.py
CHANGED
@@ -1,4 +1,3 @@
-import hashlib
 import json
 import tarfile
 import warnings
@@ -17,7 +16,8 @@ from typing import (
 from pydantic import Field

 from datachain.lib.data_model import DataModel
-from datachain.lib.file import File
+from datachain.lib.file import File
+from datachain.lib.tar import build_tar_member
 from datachain.lib.utils import DataChainError

 # The `json` method of the Pydantic `BaseModel` class has been deprecated
@@ -176,34 +176,11 @@ class Builder:
             self._tar_stream, self._core_extensions, self.state.stem
         )

-        file = self.
+        file = build_tar_member(self._tar_stream, self.state.core_file)
         wds = self._wds_class(**self.state.data | {"file": file})
         self.state = BuilderState()
         return wds

-    def build_file_record(self):
-        new_parent = self._tar_stream.get_full_name()
-        core_file = self.state.core_file
-        etag_string = "-".join(
-            [self._tar_stream.etag, core_file.name, str(core_file.mtime)]
-        )
-        etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
-        return File(
-            source=self._tar_stream.source,
-            path=f"{new_parent}/{core_file.name}",
-            version=self._tar_stream.version,
-            size=core_file.size,
-            etag=etag,
-            location=[
-                {
-                    "vtype": TarVFile.get_vtype(),
-                    "parent": self._tar_stream.model_dump_custom(),
-                    "size": core_file.size,
-                    "offset": core_file.offset_data,
-                }
-            ],
-        )
-
     def _get_type(self, ext):
         field = self._wds_class.model_fields.get(ext, None)
         if field is None:
@@ -217,39 +194,6 @@ class Builder:
         return anno


-class TarStream(File):
-    @staticmethod
-    def to_text(data):
-        return data.decode("utf-8")
-
-    _DATA_CONVERTERS: ClassVar[dict[type, Any]] = {
-        str: lambda data: TarStream.to_text(data),
-        int: lambda data: int(TarStream.to_text(data)),
-        float: lambda data: float(TarStream.to_text(data)),
-        bytes: lambda data: data,
-        dict: lambda data: json.loads(TarStream.to_text(data)),
-    }
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self._tar = None
-
-    def open(self):
-        self._tar = tarfile.open(fileobj=super().open())  # noqa: SIM115
-        return self
-
-    def getmembers(self) -> list[tarfile.TarInfo]:
-        return self._tar.getmembers()
-
-    def read_member(self, member: tarfile.TarInfo, type):
-        fd = self._tar.extractfile(member)
-        data = fd.read()
-        converter = self._DATA_CONVERTERS.get(type, None)
-        if not converter:
-            raise ValueError("")
-        return converter(data)
-
-
 def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)

datachain/query/dataset.py
CHANGED
@@ -33,10 +33,10 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
 from sqlalchemy.sql.expression import label
 from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
-from tqdm import tqdm

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
 from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
+from datachain.client import Client
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -194,7 +194,7 @@ class IndexingStep(StartingStep):

     def apply(self):
         self.catalog.index([self.path], **self.kwargs)
-        uri, path = self.
+        uri, path = Client.parse_url(self.path)
         _partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
             uri, path
         )
@@ -216,11 +216,6 @@ class IndexingStep(StartingStep):

         return step_result(q, dataset_rows.c, dependencies=[storage.uri])

-    def parse_path(self):
-        client_config = self.kwargs.get("client_config") or {}
-        client, path = self.catalog.parse_url(self.path, **client_config)
-        return client.uri, path
-

 def generator_then_call(generator, func: Callable):
     """
@@ -903,12 +898,36 @@ class SQLUnion(Step):

 @frozen
 class SQLJoin(Step):
+    catalog: "Catalog"
     query1: "DatasetQuery"
     query2: "DatasetQuery"
     predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
     inner: bool
     rname: str

+    def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
+        query = dq.apply_steps().select()
+        temp_tables.extend(dq.temp_table_names)
+
+        if not any(isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps):
+            return query.subquery(dq.table.name)
+
+        warehouse = self.catalog.warehouse
+
+        columns = [
+            c if isinstance(c, Column) else Column(c.name, c.type)
+            for c in query.subquery().columns
+        ]
+        temp_table = warehouse.create_dataset_rows_table(
+            warehouse.temp_table_name(),
+            columns=columns,
+        )
+        temp_tables.append(temp_table.name)
+
+        warehouse.copy_table(temp_table, query)
+
+        return temp_table.select().subquery(dq.table.name)
+
     def validate_expression(self, exp: "ClauseElement", q1, q2):
         """
         Checking if columns used in expression actually exist in left / right
@@ -941,10 +960,8 @@ class SQLJoin(Step):
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:
-        q1 = self.
-
-        q2 = self.query2.apply_steps().select().subquery(self.query2.table.name)
-        temp_tables.extend(self.query2.temp_table_names)
+        q1 = self.get_query(self.query1, temp_tables)
+        q2 = self.get_query(self.query2, temp_tables)

         q1_columns = list(q1.c)
         q1_column_names = {c.name for c in q1_columns}
@@ -955,7 +972,12 @@ class SQLJoin(Step):
                 continue

             if c.name in q1_column_names:
-
+                new_name = self.rname.format(name=c.name)
+                new_name_idx = 0
+                while new_name in q1_column_names:
+                    new_name_idx += 1
+                    new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
+                c = c.label(new_name)
             q2_columns.append(c)

         res_columns = q1_columns + q2_columns
@@ -983,16 +1005,14 @@ class SQLJoin(Step):
         self.validate_expression(join_expression, q1, q2)

         def q(*columns):
-            join_query =
+            join_query = self.catalog.warehouse.join(
                 q1,
                 q2,
                 join_expression,
-
+                inner=self.inner,
             )
-
-
-            subquery = res.subquery()
-            return sqlalchemy.select(*subquery.c).select_from(subquery)
+            return sqlalchemy.select(*columns).select_from(join_query)
+            # return sqlalchemy.select(*subquery.c).select_from(subquery)

         return step_result(
             q,
@@ -1515,7 +1535,7 @@ class DatasetQuery:
             if isinstance(predicates, (str, ColumnClause, ColumnElement))
             else tuple(predicates)
         )
-        new_query.steps = [SQLJoin(left, right, predicates, inner, rname)]
+        new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
         return new_query

     @detach
@@ -1691,12 +1711,7 @@ class DatasetQuery:

         dr = self.catalog.warehouse.dataset_rows(dataset)

-
-            self.catalog.warehouse.copy_table(
-                dr.get_table(),
-                query.select(),
-                progress_cb=pbar.update,
-            )
+        self.catalog.warehouse.copy_table(dr.get_table(), query.select())

         self.catalog.metastore.update_dataset_status(
             dataset, DatasetStatus.COMPLETE, version=version

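The new right-column renaming loop keeps re-applying the `rname` template with a numeric suffix until the label no longer collides with a left-side column. A standalone sketch of that logic (the helper name and the "{name}_right" default are illustrative, not part of the datachain API):

    def unique_right_name(name: str, taken: set[str], rname: str = "{name}_right") -> str:
        new_name = rname.format(name=name)
        idx = 0
        while new_name in taken:
            idx += 1
            new_name = rname.format(name=f"{name}_{idx}")
        return new_name

    print(unique_right_name("id", {"id", "id_right"}))  # -> "id_1_right"
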
{datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
 datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
@@ -17,13 +17,13 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=0i4EJIwdx_UNZlbSsUeohWjgVg4B5xoGxTYZKwXS22U,13459
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
 datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
@@ -35,30 +35,31 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=3OehNpYb4WJYt4RhPxZrQn9UL1yiHX7Fp1W53o-Y1NA,28788
+datachain/data_storage/warehouse.py,sha256=g_yWXpw5iC-VYi8gH0ctDlwO3Mo6AT-32j3Nw6TFgqw,32857
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
-datachain/lib/hf.py,sha256=
+datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
+datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
+datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=
-datachain/lib/listing_info.py,sha256=
+datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
+datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
+datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=
+datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
@@ -69,7 +70,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=tBmAlcz6orJbKWkcvGVE4wom-EWInFaXHJYMSpVZnhA,58892
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.16.dist-info/METADATA,sha256=EjMy4f4OVbwVttlWRzzXRLr-uAEAGNMPMmge96_CI2o,17073
+datachain-0.3.16.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+datachain-0.3.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.16.dist-info/RECORD,,

{datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/LICENSE: file without changes
{datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/entry_points.txt: file without changes
{datachain-0.3.14.dist-info → datachain-0.3.16.dist-info}/top_level.txt: file without changes