datachain 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (49)
  1. datachain/__init__.py +0 -4
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +10 -2
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +12 -7
  13. datachain/data_storage/sqlite.py +3 -0
  14. datachain/data_storage/warehouse.py +31 -30
  15. datachain/dataset.py +1 -3
  16. datachain/lib/arrow.py +85 -0
  17. datachain/lib/cached_stream.py +3 -85
  18. datachain/lib/dc.py +382 -179
  19. datachain/lib/feature.py +46 -91
  20. datachain/lib/feature_registry.py +4 -1
  21. datachain/lib/feature_utils.py +2 -2
  22. datachain/lib/file.py +30 -44
  23. datachain/lib/image.py +9 -2
  24. datachain/lib/meta_formats.py +66 -34
  25. datachain/lib/settings.py +5 -5
  26. datachain/lib/signal_schema.py +103 -105
  27. datachain/lib/udf.py +10 -38
  28. datachain/lib/udf_signature.py +11 -6
  29. datachain/lib/webdataset_laion.py +5 -22
  30. datachain/listing.py +8 -8
  31. datachain/node.py +1 -1
  32. datachain/progress.py +1 -1
  33. datachain/query/builtins.py +1 -1
  34. datachain/query/dataset.py +42 -119
  35. datachain/query/dispatch.py +1 -1
  36. datachain/query/metrics.py +19 -0
  37. datachain/query/schema.py +13 -3
  38. datachain/sql/__init__.py +1 -1
  39. datachain/sql/sqlite/base.py +34 -2
  40. datachain/sql/sqlite/vector.py +13 -5
  41. datachain/utils.py +1 -122
  42. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/METADATA +11 -4
  43. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/RECORD +47 -47
  44. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/WHEEL +1 -1
  45. datachain/_version.py +0 -16
  46. datachain/lib/parquet.py +0 -32
  47. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/LICENSE +0 -0
  48. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/top_level.txt +0 -0
datachain/lib/cached_stream.py
@@ -1,6 +1,3 @@
1
- import os
2
- import shutil
3
- import tempfile
4
1
  from abc import ABC
5
2
  from contextlib import AbstractContextManager
6
3
 
@@ -8,9 +5,7 @@ from datachain.cache import UniqueId
8
5
 
9
6
 
10
7
  class AbstractCachedStream(AbstractContextManager, ABC):
11
- def __init__(self, stream, size, catalog, uid: UniqueId):
12
- self.stream = stream
13
- self.size = size
8
+ def __init__(self, catalog, uid: UniqueId):
14
9
  self.catalog = catalog
15
10
  self.uid = uid
16
11
  self.mode = "rb"
@@ -19,86 +14,9 @@ class AbstractCachedStream(AbstractContextManager, ABC):
19
14
  self.mode = mode
20
15
 
21
16
 
22
- class ProgressiveCacheStream(AbstractCachedStream):
23
- BUF_SIZE = 4096
24
-
25
- def __init__(self, stream, size, catalog, uid: UniqueId):
26
- super().__init__(stream, size, catalog, uid)
27
-
28
- self.target_path = self.catalog.cache.path_from_checksum(self.uid.get_hash())
29
- self.cached_file = None
30
-
31
- self.temp_file = None
32
- self.temp_file_pos = 0
33
-
34
- def __enter__(self):
35
- if os.path.exists(self.target_path):
36
- self.cached_file = open(self.target_path, mode=self.mode)
37
- return self.cached_file
38
-
39
- tmp_dir = self.catalog.cache.tmp_dir
40
- if not os.path.exists(tmp_dir):
41
- os.makedirs(tmp_dir)
42
- self.temp_file = tempfile.NamedTemporaryFile(
43
- prefix=str(self.uid.get_hash()), dir=tmp_dir, delete=False
44
- )
45
- return self
46
-
47
- def __exit__(self, *args):
48
- self.close()
49
-
50
- def read(self, size=-1):
51
- buf = self.stream.read(size)
52
- pos = self.stream.tell()
53
-
54
- if pos >= self.temp_file_pos:
55
- self._cache_catch_up(pos, buf)
56
-
57
- return buf
58
-
59
- def close(self):
60
- if self.cached_file:
61
- self.cached_file.close()
62
-
63
- if self.temp_file:
64
- if self.temp_file_pos < self.size:
65
- self._cache_catch_up(self.size)
66
-
67
- self.temp_file.close()
68
- if not os.path.exists(self.target_path):
69
- os.makedirs(os.path.dirname(self.target_path), exist_ok=True)
70
- shutil.move(self.temp_file.name, self.target_path)
71
-
72
- self.stream.close()
73
-
74
- def _cache_catch_up(self, pos_target, latest_buf=None):
75
- pos_to_restore = self.stream.tell()
76
- try:
77
- remainder = pos_target - self.temp_file_pos
78
- self.stream.seek(self.temp_file_pos)
79
- while remainder > 0:
80
- chunk_size = min(self.BUF_SIZE, remainder)
81
- buf = self.stream.read(chunk_size)
82
- self._cache_update(buf)
83
- remainder -= len(buf)
84
- finally:
85
- self.stream.seek(pos_to_restore)
86
-
87
- def _cache_update(self, buf):
88
- length = len(buf)
89
- self.temp_file.write(buf)
90
- self.temp_file_pos += length
91
-
92
- def seek(self, offset, whence=0):
93
- return self.stream.seek(offset, whence)
94
-
95
- def tell(self):
96
- return self.stream.tell()
97
-
98
-
99
17
  class PreCachedStream(AbstractCachedStream):
100
- def __init__(self, stream, size, catalog, uid: UniqueId):
101
- super().__init__(stream, size, catalog, uid)
18
+ def __init__(self, catalog, uid: UniqueId):
19
+ super().__init__(catalog, uid)
102
20
  self.client = self.catalog.get_client(self.uid.storage)
103
21
  self.cached_file = None
104
22