datachain 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/asyn.py +4 -9
- datachain/catalog/catalog.py +2 -2
- datachain/client/azure.py +1 -13
- datachain/client/fsspec.py +7 -7
- datachain/client/gcs.py +2 -13
- datachain/client/hf.py +0 -10
- datachain/client/local.py +3 -12
- datachain/client/s3.py +9 -19
- datachain/data_storage/sqlite.py +10 -1
- datachain/data_storage/warehouse.py +11 -17
- datachain/lib/listing.py +1 -2
- datachain/lib/model_store.py +2 -2
- datachain/lib/pytorch.py +32 -26
- datachain/lib/signal_schema.py +146 -58
- datachain/listing.py +6 -8
- datachain/node.py +0 -43
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/METADATA +1 -1
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/RECORD +22 -22
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/LICENSE +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/WHEEL +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/top_level.txt +0 -0
datachain/asyn.py
CHANGED
@@ -1,14 +1,8 @@
 import asyncio
-from collections.abc import Awaitable, Coroutine, Iterable
+from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from heapq import heappop, heappush
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import Any, Callable, Generic, Optional, TypeVar

 from fsspec.asyn import get_loop

@@ -16,6 +10,7 @@ ASYNC_WORKERS = 20

 InputT = TypeVar("InputT", contravariant=True)  # noqa: PLC0105
 ResultT = TypeVar("ResultT", covariant=True)  # noqa: PLC0105
+T = TypeVar("T")


 class AsyncMapper(Generic[InputT, ResultT]):
@@ -226,7 +221,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
             self._push_result(self._next_yield, None)


-def iter_over_async(ait, loop):
+def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
     """Wrap an asynchronous iterator into a synchronous one"""
     ait = ait.__aiter__()
datachain/catalog/catalog.py
CHANGED
@@ -1390,12 +1390,12 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: int) -> DatasetStats:
+    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
         dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version)
+        dataset_version = dataset.get_version(version or dataset.latest_version)
         return DatasetStats(
             num_objects=dataset_version.num_objects,
             size=dataset_version.size,
datachain/client/azure.py
CHANGED
@@ -4,7 +4,6 @@ from adlfs import AzureBlobFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -14,17 +13,6 @@ class AzureClient(Client):
     PREFIX = "az://"
     protocol = "az"

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        version_id = v.get("version_id")
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", "").strip('"'),
-            version=version_id or "",
-            is_latest=version_id is None or bool(v.get("is_current_version")),
-            last_modified=v["last_modified"],
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         version_id = v.get("version_id")
         return File(
@@ -57,7 +45,7 @@ class AzureClient(Client):
                     continue
                 info = (await self.fs._details([b]))[0]
                 entries.append(
-                    self.convert_info(info, self.rel_path(info["name"]))
+                    self.info_to_file(info, self.rel_path(info["name"]))
                 )
             if entries:
                 await result_queue.put(entries)
datachain/client/fsspec.py
CHANGED
@@ -29,7 +29,7 @@ from tqdm import tqdm
 from datachain.cache import DataChainCache, UniqueId
 from datachain.client.fileslice import FileSlice, FileWrapper
 from datachain.error import ClientError as DataChainClientError
-from datachain.node import Entry
+from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
 from datachain.nodes_thread_pool import NodeChunk
 from datachain.storage import StorageURI
@@ -45,7 +45,7 @@ DELIMITER = "/"  # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")

-ResultQueue = asyncio.Queue[Optional[Sequence[Entry]]]
+ResultQueue = asyncio.Queue[Optional[Sequence[File]]]


 def _is_win_local_path(uri: str) -> bool:
@@ -188,7 +188,7 @@ class Client(ABC):

     async def get_current_etag(self, uid: UniqueId) -> str:
         info = await self.fs._info(self.get_full_path(uid.path))
-        return self.convert_info(info, "").etag
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return await self.fs._size(path)
@@ -198,7 +198,7 @@ class Client(ABC):

     async def scandir(
         self, start_prefix: str, method: str = "default"
-    ) -> AsyncIterator[Sequence[Entry]]:
+    ) -> AsyncIterator[Sequence[File]]:
         try:
             impl = getattr(self, f"_fetch_{method}")
         except AttributeError:
@@ -264,7 +264,7 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)

-    async def _fetch_dir(self, prefix, pbar, result_queue) -> set[str]:
+    async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []
@@ -277,7 +277,7 @@ class Client(ABC):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, subprefix))
+                files.append(self.info_to_file(info, subprefix))
         if files:
             await result_queue.put(files)
         found_count = len(subdirs) + len(files)
@@ -303,7 +303,7 @@ class Client(ABC):
         return f"{self.PREFIX}{self.name}/{rel_path}"

     @abstractmethod
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry: ...
+    def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...

     def fetch_nodes(
         self,
datachain/client/gcs.py
CHANGED
@@ -10,7 +10,6 @@ from gcsfs import GCSFileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -108,19 +107,9 @@ class GCSClient(Client):
         finally:
             await page_queue.put(None)

-    def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
+    def _entry_from_dict(self, d: dict[str, Any]) -> File:
         info = self.fs._process_object(self.name, d)
-        return self.convert_info(info, self.rel_path(info["name"]))
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("etag", ""),
-            version=v.get("generation", ""),
-            is_latest=not v.get("timeDeleted"),
-            last_modified=self.parse_timestamp(v["updated"]),
-            size=v.get("size", ""),
-        )
+        return self.info_to_file(info, self.rel_path(info["name"]))

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
datachain/client/hf.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Any, cast
 from huggingface_hub import HfFileSystem

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import Client

@@ -22,15 +21,6 @@ class HfClient(Client):

         return cast(HfFileSystem, super().create_fs(**kwargs))

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            size=v["size"],
-            version=v["last_commit"].oid,
-            etag=v.get("blob_id", ""),
-            last_modified=v["last_commit"].date,
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             path=path,
datachain/client/local.py
CHANGED
@@ -7,8 +7,8 @@ from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

+from datachain.cache import UniqueId
 from datachain.lib.file import File
-from datachain.node import Entry
 from datachain.storage import StorageURI

 from .fsspec import Client
@@ -114,9 +114,9 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )

-    async def get_current_etag(self, uid) -> str:
+    async def get_current_etag(self, uid: UniqueId) -> str:
         info = self.fs.info(self.get_full_path(uid.path))
-        return self.convert_info(info, "").etag
+        return self.info_to_file(info, "").etag

     async def get_size(self, path: str) -> int:
         return self.fs.size(path)
@@ -136,15 +136,6 @@ class FileClient(Client):
             full_path += "/"
         return full_path

-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v["mtime"].hex(),
-            is_latest=True,
-            last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
-            size=v.get("size", ""),
-        )
-
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
             source=self.uri,
datachain/client/s3.py
CHANGED
@@ -1,12 +1,11 @@
 import asyncio
-from typing import Any, cast
+from typing import Any, Optional, cast

 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
 from tqdm import tqdm

 from datachain.lib.file import File
-from datachain.node import Entry

 from .fsspec import DELIMITER, Client, ResultQueue

@@ -111,8 +110,9 @@ class ClientS3(Client):
     ) -> None:
         await self._fetch_flat(start_prefix, result_queue)

-    def _entry_from_boto(self, v, bucket, versions=False):
-        return Entry.from_file(
+    def _entry_from_boto(self, v, bucket, versions=False) -> File:
+        return File(
+            source=self.uri,
             path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
@@ -125,8 +125,8 @@ class ClientS3(Client):
         self,
         prefix,
         pbar,
-        result_queue,
-    ):
+        result_queue: ResultQueue,
+    ) -> set[str]:
         if prefix:
             prefix = prefix.lstrip(DELIMITER) + DELIMITER
         files = []
@@ -141,7 +141,7 @@ class ClientS3(Client):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, subprefix))
+                files.append(self.info_to_file(info, subprefix))
             pbar.update()
             found = True
         if not found:
@@ -152,18 +152,8 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver):
-        return ver if ver != "null" else ""
-
-    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
-        return Entry.from_file(
-            path=path,
-            etag=v.get("ETag", "").strip('"'),
-            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
-            is_latest=v.get("IsLatest", True),
-            last_modified=v.get("LastModified", ""),
-            size=v["size"],
-        )
+    def clean_s3_version(ver: Optional[str]) -> str:
+        return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
datachain/data_storage/sqlite.py
CHANGED
@@ -43,6 +43,8 @@ if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ColumnElement
     from sqlalchemy.types import TypeEngine

+    from datachain.lib.file import File
+

 logger = logging.getLogger("datachain")

@@ -58,6 +60,10 @@ quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote


+def _get_in_memory_uri():
+    return "file::memory:?cache=shared"
+
+
 def get_retry_sleep_sec(retry_count: int) -> int:
     return RETRY_START_SEC * (RETRY_FACTOR**retry_count)

@@ -119,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         if db_file == ":memory:":
             # Enable multithreaded usage of the same in-memory db
             db = sqlite3.connect(
-                "file::memory:?cache=shared", uri=True, detect_types=DETECT_TYPES
+                _get_in_memory_uri(), uri=True, detect_types=DETECT_TYPES
             )
         else:
             db = sqlite3.connect(
@@ -704,6 +710,9 @@ class SQLiteWarehouse(AbstractWarehouse):

         self.db.execute(insert_query)

+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        return (e.model_dump() for e in entries)
+
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
         rows = list(rows)
         if not rows:
datachain/data_storage/warehouse.py
CHANGED

@@ -20,7 +20,7 @@ from datachain.client import Client
 from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord
-from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
+from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.storage import StorageURI
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
     from datachain.data_storage import AbstractIDGenerator, schema
     from datachain.data_storage.db_engine import DatabaseEngine
     from datachain.data_storage.schema import DataTable
+    from datachain.lib.file import File

 try:
     import numpy as np
@@ -401,25 +402,18 @@ class AbstractWarehouse(ABC, Serializable):
         expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
             sa.func.count(table.c.sys__id),
         )
-
-
-
-
+        size_columns = [
+            c for c in table.columns if c.name == "size" or c.name.endswith("__size")
+        ]
+        if size_columns:
+            expressions = (*expressions, sa.func.sum(sum(size_columns)))
         query = select(*expressions)
         ((nrows, *rest),) = self.db.execute(query)
-        return nrows, rest[0] if rest else
-
-    def prepare_entries(
-        self, uri: str, entries: Iterable[Entry]
-    ) -> list[dict[str, Any]]:
-        """
-        Prepares bucket listing entry (row) for inserting into database
-        """
-
-        def _prepare_entry(entry: Entry):
-            return attrs.asdict(entry) | {"source": uri}
+        return nrows, rest[0] if rest else 0

-
+    @abstractmethod
+    def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
+        """Convert File entries so they can be passed on to `insert_rows()`"""

     @abstractmethod
     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
datachain/lib/listing.py
CHANGED
@@ -30,8 +30,7 @@ def list_bucket(uri: str, client_config=None) -> Callable:
         config = client_config or {}
         client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
         for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
-            for entry in entries:
-                yield entry.to_file(client.uri)
+            yield from entries

     return list_func

datachain/lib/model_store.py
CHANGED
@@ -1,6 +1,6 @@
 import inspect
 import logging
-from typing import ClassVar, Optional
+from typing import Any, ClassVar, Optional

 from pydantic import BaseModel

@@ -69,7 +69,7 @@ class ModelStore:
             del cls.store[fr.__name__][version]

     @staticmethod
-    def is_pydantic(val):
+    def is_pydantic(val: Any) -> bool:
         return (
             not hasattr(val, "__origin__")
             and inspect.isclass(val)
datachain/lib/pytorch.py
CHANGED
@@ -7,6 +7,7 @@ from torch import float32
 from torch.distributed import get_rank, get_world_size
 from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
+from tqdm import tqdm

 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
@@ -93,33 +94,38 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        for
-
-        for
-
-
-
-
-
-
-
-
+        desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
+        with tqdm(desc=desc, unit=" rows") as pbar:
+            for row_features in ds.collect():
+                row = []
+                for fr in row_features:
+                    if hasattr(fr, "read"):
+                        row.append(fr.read())  # type: ignore[unreachable]
+                    else:
+                        row.append(fr)
+                # Apply transforms
+                if self.transform:
+                    try:
+                        if isinstance(self.transform, v2.Transform):
+                            row = self.transform(row)
+                        for i, val in enumerate(row):
+                            if isinstance(val, Image.Image):
+                                row[i] = self.transform(val)
+                    except ValueError:
+                        logger.warning(
+                            "Skipping transform due to unsupported data types."
+                        )
+                        self.transform = None
+                if self.tokenizer:
                     for i, val in enumerate(row):
-            if isinstance(val,
-
-
-
-
-
-
-                isinstance(val, list) and isinstance(val[0], str)
-            ):
-                row[i] = convert_text(
-                    val, self.tokenizer, self.tokenizer_kwargs
-                ).squeeze(0)  # type: ignore[union-attr]
-            yield row
+                        if isinstance(val, str) or (
+                            isinstance(val, list) and isinstance(val[0], str)
+                        ):
+                            row[i] = convert_text(
+                                val, self.tokenizer, self.tokenizer_kwargs
+                            ).squeeze(0)  # type: ignore[union-attr]
+                yield row
+                pbar.update(1)

     @staticmethod
     def get_rank_and_workers() -> tuple[int, int]:
datachain/lib/signal_schema.py
CHANGED
@@ -4,11 +4,14 @@ from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
 from inspect import isclass
-from typing import (
+from typing import (  # noqa: UP035
     TYPE_CHECKING,
     Annotated,
     Any,
     Callable,
+    Dict,
+    Final,
+    List,
     Literal,
     Optional,
     Union,
@@ -42,8 +45,13 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
-    "
+    "Final": Final,
     "Union": Union,
+    "Optional": Optional,
+    "List": list,
+    "Dict": dict,
+    "Literal": Any,
+    "Any": Any,
 }


@@ -146,35 +154,11 @@ class SignalSchema:
         return SignalSchema(signals)

     @staticmethod
-    def
-
-        based on whether the type is Optional or not."""
-        orig = get_origin(fr_type)
-        args = get_args(fr_type)
-        # Check if fr_type is Optional
-        if orig == Union and len(args) == 2 and (type(None) in args):
-            fr_type = args[0]
-            orig = get_origin(fr_type)
-        if orig in (Literal, LiteralEx):
-            # Literal has no __name__ in Python 3.9
-            type_name = "Literal"
-        elif orig == Union:
-            # Union also has no __name__ in Python 3.9
-            type_name = "Union"
-        else:
-            type_name = str(fr_type.__name__)  # type: ignore[union-attr]
-        return type_name, fr_type
-
-    @staticmethod
-    def serialize_custom_model_fields(
-        name: str, fr: type, custom_types: dict[str, Any]
+    def _serialize_custom_model_fields(
+        version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
     ) -> str:
         """This serializes any custom type information to the provided custom_types
-        dict, and returns the name of the type
-        if hasattr(fr, "__origin__") or not issubclass(fr, BaseModel):
-            # Don't store non-feature types.
-            return name
-        version_name = ModelStore.get_name(fr)
+        dict, and returns the name of the type serialized."""
         if version_name in custom_types:
             # This type is already stored in custom_types.
             return version_name
@@ -183,37 +167,102 @@ class SignalSchema:
             field_type = info.annotation
             # All fields should be typed.
             assert field_type
-
-                field_type
-            )
-            # Serialize this type to custom_types if it is a custom type as well.
-            fields[field_name] = SignalSchema.serialize_custom_model_fields(
-                field_type_name, field_type, custom_types
-            )
+            fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
         custom_types[version_name] = fields
         return version_name

+    @staticmethod
+    def _serialize_type(fr: type, custom_types: dict[str, Any]) -> str:
+        """Serialize a given type to a string, including automatic ModelStore
+        registration, and save this type and subtypes to custom_types as well."""
+        subtypes: list[Any] = []
+        type_name = SignalSchema._type_to_str(fr, subtypes)
+        # Iterate over all subtypes (includes the input type).
+        for st in subtypes:
+            if st is None or not ModelStore.is_pydantic(st):
+                continue
+            # Register and save feature types.
+            ModelStore.register(st)
+            st_version_name = ModelStore.get_name(st)
+            if st is fr:
+                # If the main type is Pydantic, then use the ModelStore version name.
+                type_name = st_version_name
+            # Save this type to custom_types.
+            SignalSchema._serialize_custom_model_fields(
+                st_version_name, st, custom_types
+            )
+        return type_name
+
     def serialize(self) -> dict[str, Any]:
         signals: dict[str, Any] = {}
         custom_types: dict[str, Any] = {}
         for name, fr_type in self.values.items():
-
-            ModelStore.register(fr)
-            signals[name] = ModelStore.get_name(fr)
-            type_name, fr_type = SignalSchema._get_name_original_type(fr)
-        else:
-            type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
-            signals[name] = type_name
-        self.serialize_custom_model_fields(type_name, fr_type, custom_types)
+            signals[name] = self._serialize_type(fr_type, custom_types)
         if custom_types:
             signals["_custom_types"] = custom_types
         return signals

     @staticmethod
-    def
+    def _split_subtypes(type_name: str) -> list[str]:
+        """This splits a list of subtypes, including proper square bracket handling."""
+        start = 0
+        depth = 0
+        subtypes = []
+        for i, c in enumerate(type_name):
+            if c == "[":
+                depth += 1
+            elif c == "]":
+                if depth == 0:
+                    raise TypeError(
+                        "Extra closing square bracket when parsing subtype list"
+                    )
+                depth -= 1
+            elif c == "," and depth == 0:
+                subtypes.append(type_name[start:i].strip())
+                start = i + 1
+        if depth > 0:
+            raise TypeError("Unclosed square bracket when parsing subtype list")
+        subtypes.append(type_name[start:].strip())
+        return subtypes
+
+    @staticmethod
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:  # noqa: PLR0911
         """Convert a string-based type back into a python type."""
+        type_name = type_name.strip()
+        if not type_name:
+            raise TypeError("Type cannot be empty")
+        if type_name == "NoneType":
+            return None
+
+        bracket_idx = type_name.find("[")
+        subtypes: Optional[tuple[Optional[type], ...]] = None
+        if bracket_idx > -1:
+            if bracket_idx == 0:
+                raise TypeError("Type cannot start with '['")
+            close_bracket_idx = type_name.rfind("]")
+            if close_bracket_idx == -1:
+                raise TypeError("Unclosed square bracket when parsing type")
+            if close_bracket_idx < bracket_idx:
+                raise TypeError("Square brackets are out of order when parsing type")
+            if close_bracket_idx == bracket_idx + 1:
+                raise TypeError("Empty square brackets when parsing type")
+            subtype_names = SignalSchema._split_subtypes(
+                type_name[bracket_idx + 1 : close_bracket_idx]
+            )
+            # Types like Union require the parameters to be a tuple of types.
+            subtypes = tuple(
+                SignalSchema._resolve_type(st, custom_types) for st in subtype_names
+            )
+            type_name = type_name[:bracket_idx].strip()
+
         fr = NAMES_TO_TYPES.get(type_name)
         if fr:
+            if subtypes:
+                if len(subtypes) == 1:
+                    # Types like Optional require there to be only one argument.
+                    return fr[subtypes[0]]  # type: ignore[index]
+                # Other types like Union require the parameters to be a tuple of types.
+                return fr[subtypes]  # type: ignore[index]
             return fr  # type: ignore[return-value]

         model_name, version = ModelStore.parse_name_version(type_name)
@@ -228,7 +277,14 @@ class SignalSchema:
                 for field_name, field_type_str in fields.items()
             }
             return create_feature_model(type_name, fields)
-
+        # This can occur if a third-party or custom type is used, which is not available
+        # when deserializing.
+        warnings.warn(
+            f"Could not resolve type: '{type_name}'.",
+            SignalSchemaWarning,
+            stacklevel=2,
+        )
+        return Any  # type: ignore[return-value]

     @staticmethod
     def deserialize(schema: dict[str, Any]) -> "SignalSchema":
@@ -242,9 +298,14 @@ class SignalSchema:
                 # This entry is used as a lookup for custom types,
                 # and is not an actual field.
                 continue
+            if not isinstance(type_name, str):
+                raise SignalSchemaError(
+                    f"cannot deserialize '{type_name}': "
+                    "serialized types must be a string"
+                )
             try:
                 fr = SignalSchema._resolve_type(type_name, custom_types)
-                if fr is
+                if fr is Any:
                     # Skip if the type is not found, so all data can be displayed.
                     warnings.warn(
                         f"In signal '{signal}': "
@@ -258,7 +319,7 @@ class SignalSchema:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
-            signals[signal] = fr
+            signals[signal] = fr  # type: ignore[assignment]

         return SignalSchema(signals)

@@ -509,31 +570,58 @@ class SignalSchema:
         return self.values.pop(name)

     @staticmethod
-    def _type_to_str(type_):  # noqa: PLR0911
+    def _type_to_str(type_: Optional[type], subtypes: Optional[list] = None) -> str:  # noqa: PLR0911
+        """Convert a type to a string-based representation."""
+        if type_ is None:
+            return "NoneType"
+
         origin = get_origin(type_)

         if origin == Union:
             args = get_args(type_)
-            formatted_types = ", ".join(
+            formatted_types = ", ".join(
+                SignalSchema._type_to_str(arg, subtypes) for arg in args
+            )
             return f"Union[{formatted_types}]"
         if origin == Optional:
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"Optional[{type_str}]"
-        if origin
+        if origin in (list, List):  # noqa: UP006
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0], subtypes)
             return f"list[{type_str}]"
-        if origin
+        if origin in (dict, Dict):  # noqa: UP006
             args = get_args(type_)
-            type_str =
-
+            type_str = (
+                SignalSchema._type_to_str(args[0], subtypes) if len(args) > 0 else ""
+            )
+            vals = (
+                f", {SignalSchema._type_to_str(args[1], subtypes)}"
+                if len(args) > 1
+                else ""
+            )
             return f"dict[{type_str}{vals}]"
         if origin == Annotated:
             args = get_args(type_)
-            return SignalSchema._type_to_str(args[0])
-        if origin in (Literal, LiteralEx):
+            return SignalSchema._type_to_str(args[0], subtypes)
+        if origin in (Literal, LiteralEx) or type_ in (Literal, LiteralEx):
             return "Literal"
+        if Any in (origin, type_):
+            return "Any"
+        if Final in (origin, type_):
+            return "Final"
+        if subtypes is not None:
+            # Include this type in the list of all subtypes, if requested.
+            subtypes.append(type_)
+        if not hasattr(type_, "__name__"):
+            # This can happen for some third-party or custom types, mostly on Python 3.9
+            warnings.warn(
+                f"Unable to determine name of type '{type_}'.",
+                SignalSchemaWarning,
+                stacklevel=2,
+            )
+            return "Any"
         return type_.__name__

     @staticmethod
datachain/listing.py
CHANGED
@@ -9,7 +9,8 @@ from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm

-from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.lib.file import File
+from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number

@@ -80,16 +81,13 @@ class Listing:
         finally:
             fetch_listing.insert_entries_done()

-    def insert_entry(self, entry: Entry) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, [entry]),
-        )
+    def insert_entry(self, entry: File) -> None:
+        self.insert_entries([entry])

-    def insert_entries(self, entries: Iterable[Entry]) -> None:
+    def insert_entries(self, entries: Iterable[File]) -> None:
         self.warehouse.insert_rows(
             self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(self.client.uri, entries),
+            self.warehouse.prepare_entries(entries),
         )

     def insert_entries_done(self) -> None:
datachain/node.py
CHANGED
@@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Optional
 import attrs

 from datachain.cache import UniqueId
-from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str

@@ -139,48 +138,6 @@ class Node:
         return split[0]


-@attrs.define
-class Entry:
-    path: str = ""
-    etag: str = ""
-    version: str = ""
-    is_latest: bool = True
-    last_modified: Optional[datetime] = None
-    size: int = 0
-    location: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str, **kwargs) -> "Entry":
-        return cls(path=path, **kwargs)
-
-    @property
-    def full_path(self) -> str:
-        return self.path
-
-    @property
-    def name(self):
-        return self.path.rsplit("/", 1)[-1]
-
-    @property
-    def parent(self):
-        split = self.path.rsplit("/", 1)
-        if len(split) <= 1:
-            return ""
-        return split[0]
-
-    def to_file(self, source: str) -> File:
-        return File(
-            source=source,
-            path=self.path,
-            size=self.size,
-            version=self.version,
-            etag=self.etag,
-            is_latest=self.is_latest,
-            last_modified=self.last_modified,
-            location=self.location,
-        )
-
-
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
{datachain-0.3.13.dist-info → datachain-0.3.14.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
-datachain/asyn.py,sha256=
+datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
 datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
@@ -8,8 +8,8 @@ datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=
-datachain/node.py,sha256=
+datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
+datachain/node.py,sha256=2pF3Y9oYzElfiUBcw2LIv7LNNt--V4E-K021zjv0b0I,4748
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
@@ -17,17 +17,17 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=7yl_WMGS6CfOc_G2MCbVVkdAfAlcZb2gC_PvXzBnoJ0,69344
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
-datachain/client/azure.py,sha256=
+datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
-datachain/client/gcs.py,sha256=
-datachain/client/hf.py,sha256=
-datachain/client/local.py,sha256=
-datachain/client/s3.py,sha256=
+datachain/client/fsspec.py,sha256=S93K9bS76MGcLYgWKVZiPVivbMElJ9Fq1w67I8BCR-g,13311
+datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
+datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
+datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
+datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
@@ -35,8 +35,8 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
+datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
@@ -46,13 +46,13 @@ datachain/lib/dc.py,sha256=C-sfWRinV8pDK2P6UHLbScOahTlTiVQpoxUUdVllF2k,68710
 datachain/lib/file.py,sha256=rXmyzUFgnLQ4J3CyOCcg-guhzAz4x9Ug595FbNn4Y2E,11398
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=
+datachain/lib/listing.py,sha256=mt-dsYfYFMPHN3zXnkohBHuueY-4tiNGPkcDYkKB0lY,3887
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
 datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
-datachain/lib/model_store.py,sha256=
-datachain/lib/pytorch.py,sha256=
+datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
+datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=vb4yCC90_pEngiu9Irc02kCPyqBxkrFDL4TKr7UMY5U,23808
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -96,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.14.dist-info/METADATA,sha256=bItmxEsx2MEsJ78Mu1yjO-PX-RkDuWHMESoPuGiJgxw,17073
+datachain-0.3.14.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.14.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes