datachain 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +0 -4
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +10 -2
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +12 -7
- datachain/data_storage/sqlite.py +3 -0
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/cached_stream.py +3 -85
- datachain/lib/dc.py +382 -179
- datachain/lib/feature.py +46 -91
- datachain/lib/feature_registry.py +4 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +30 -44
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +10 -38
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +42 -119
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/sql/sqlite/base.py +34 -2
- datachain/sql/sqlite/vector.py +13 -5
- datachain/utils.py +1 -122
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/METADATA +11 -4
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/RECORD +47 -47
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/WHEEL +1 -1
- datachain/_version.py +0 -16
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/LICENSE +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/top_level.txt +0 -0
datachain/lib/cached_stream.py
CHANGED
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import shutil
|
|
3
|
-
import tempfile
|
|
4
1
|
from abc import ABC
|
|
5
2
|
from contextlib import AbstractContextManager
|
|
6
3
|
|
|
@@ -8,9 +5,7 @@ from datachain.cache import UniqueId
|
|
|
8
5
|
|
|
9
6
|
|
|
10
7
|
class AbstractCachedStream(AbstractContextManager, ABC):
|
|
11
|
-
def __init__(self, stream, size, catalog, uid: UniqueId):
|
|
12
|
-
self.stream = stream
|
|
13
|
-
self.size = size
|
|
8
|
+
def __init__(self, catalog, uid: UniqueId):
|
|
14
9
|
self.catalog = catalog
|
|
15
10
|
self.uid = uid
|
|
16
11
|
self.mode = "rb"
|
|
@@ -19,86 +14,9 @@ class AbstractCachedStream(AbstractContextManager, ABC):
|
|
|
19
14
|
self.mode = mode
|
|
20
15
|
|
|
21
16
|
|
|
22
|
-
class ProgressiveCacheStream(AbstractCachedStream):
|
|
23
|
-
BUF_SIZE = 4096
|
|
24
|
-
|
|
25
|
-
def __init__(self, stream, size, catalog, uid: UniqueId):
|
|
26
|
-
super().__init__(stream, size, catalog, uid)
|
|
27
|
-
|
|
28
|
-
self.target_path = self.catalog.cache.path_from_checksum(self.uid.get_hash())
|
|
29
|
-
self.cached_file = None
|
|
30
|
-
|
|
31
|
-
self.temp_file = None
|
|
32
|
-
self.temp_file_pos = 0
|
|
33
|
-
|
|
34
|
-
def __enter__(self):
|
|
35
|
-
if os.path.exists(self.target_path):
|
|
36
|
-
self.cached_file = open(self.target_path, mode=self.mode)
|
|
37
|
-
return self.cached_file
|
|
38
|
-
|
|
39
|
-
tmp_dir = self.catalog.cache.tmp_dir
|
|
40
|
-
if not os.path.exists(tmp_dir):
|
|
41
|
-
os.makedirs(tmp_dir)
|
|
42
|
-
self.temp_file = tempfile.NamedTemporaryFile(
|
|
43
|
-
prefix=str(self.uid.get_hash()), dir=tmp_dir, delete=False
|
|
44
|
-
)
|
|
45
|
-
return self
|
|
46
|
-
|
|
47
|
-
def __exit__(self, *args):
|
|
48
|
-
self.close()
|
|
49
|
-
|
|
50
|
-
def read(self, size=-1):
|
|
51
|
-
buf = self.stream.read(size)
|
|
52
|
-
pos = self.stream.tell()
|
|
53
|
-
|
|
54
|
-
if pos >= self.temp_file_pos:
|
|
55
|
-
self._cache_catch_up(pos, buf)
|
|
56
|
-
|
|
57
|
-
return buf
|
|
58
|
-
|
|
59
|
-
def close(self):
|
|
60
|
-
if self.cached_file:
|
|
61
|
-
self.cached_file.close()
|
|
62
|
-
|
|
63
|
-
if self.temp_file:
|
|
64
|
-
if self.temp_file_pos < self.size:
|
|
65
|
-
self._cache_catch_up(self.size)
|
|
66
|
-
|
|
67
|
-
self.temp_file.close()
|
|
68
|
-
if not os.path.exists(self.target_path):
|
|
69
|
-
os.makedirs(os.path.dirname(self.target_path), exist_ok=True)
|
|
70
|
-
shutil.move(self.temp_file.name, self.target_path)
|
|
71
|
-
|
|
72
|
-
self.stream.close()
|
|
73
|
-
|
|
74
|
-
def _cache_catch_up(self, pos_target, latest_buf=None):
|
|
75
|
-
pos_to_restore = self.stream.tell()
|
|
76
|
-
try:
|
|
77
|
-
remainder = pos_target - self.temp_file_pos
|
|
78
|
-
self.stream.seek(self.temp_file_pos)
|
|
79
|
-
while remainder > 0:
|
|
80
|
-
chunk_size = min(self.BUF_SIZE, remainder)
|
|
81
|
-
buf = self.stream.read(chunk_size)
|
|
82
|
-
self._cache_update(buf)
|
|
83
|
-
remainder -= len(buf)
|
|
84
|
-
finally:
|
|
85
|
-
self.stream.seek(pos_to_restore)
|
|
86
|
-
|
|
87
|
-
def _cache_update(self, buf):
|
|
88
|
-
length = len(buf)
|
|
89
|
-
self.temp_file.write(buf)
|
|
90
|
-
self.temp_file_pos += length
|
|
91
|
-
|
|
92
|
-
def seek(self, offset, whence=0):
|
|
93
|
-
return self.stream.seek(offset, whence)
|
|
94
|
-
|
|
95
|
-
def tell(self):
|
|
96
|
-
return self.stream.tell()
|
|
97
|
-
|
|
98
|
-
|
|
99
17
|
class PreCachedStream(AbstractCachedStream):
|
|
100
|
-
def __init__(self, stream, size, catalog, uid: UniqueId):
|
|
101
|
-
super().__init__(stream, size, catalog, uid)
|
|
18
|
+
def __init__(self, catalog, uid: UniqueId):
|
|
19
|
+
super().__init__(catalog, uid)
|
|
102
20
|
self.client = self.catalog.get_client(self.uid.storage)
|
|
103
21
|
self.cached_file = None
|
|
104
22
|
|