datachain 0.7.9__py3-none-any.whl → 0.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of datachain has been flagged as potentially problematic.
- datachain/client/fsspec.py +4 -2
- datachain/client/local.py +9 -4
- datachain/func/__init__.py +4 -1
- datachain/func/numeric.py +46 -0
- datachain/func/string.py +46 -0
- datachain/lib/convert/flatten.py +7 -5
- datachain/lib/convert/unflatten.py +2 -2
- datachain/lib/convert/values_to_tuples.py +1 -1
- datachain/lib/utils.py +1 -1
- datachain/query/dataset.py +1 -1
- datachain/sql/functions/numeric.py +12 -0
- datachain/sql/functions/string.py +12 -0
- datachain/sql/sqlite/base.py +40 -0
- datachain-0.7.10.dist-info/METADATA +207 -0
- {datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/RECORD +19 -19
- datachain-0.7.9.dist-info/METADATA +0 -488
- {datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/LICENSE +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/WHEEL +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED

@@ -172,7 +172,7 @@ class Client(ABC):
         return url == cls.PREFIX

     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI

         return StorageURI(f"{cls.PREFIX}{name}")
@@ -278,7 +278,9 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)

-    async def _fetch_dir(
+    async def _fetch_dir(
+        self, prefix: str, pbar, result_queue: ResultQueue
+    ) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []
datachain/client/local.py
CHANGED

@@ -12,6 +12,7 @@ from datachain.lib.file import File
 from .fsspec import Client

 if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
     from datachain.dataset import StorageURI


@@ -21,7 +22,11 @@ class FileClient(Client):
     protocol = "file"

     def __init__(
-        self,
+        self,
+        name: str,
+        fs_kwargs: dict[str, Any],
+        cache: "DataChainCache",
+        use_symlinks: bool = False,
     ) -> None:
         super().__init__(name, fs_kwargs, cache)
         self.use_symlinks = use_symlinks
@@ -30,7 +35,7 @@ class FileClient(Client):
         raise TypeError("Signed urls are not implemented for local file system")

     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI

         return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
@@ -77,7 +82,7 @@ class FileClient(Client):
         return bucket, path

     @classmethod
-    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
+    def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)

@@ -85,7 +90,7 @@ class FileClient(Client):
     def from_source(
         cls,
         uri: str,
-        cache,
+        cache: "DataChainCache",
         use_symlinks: bool = False,
         **kwargs,
     ) -> "FileClient":
datachain/func/__init__.py
CHANGED

@@ -17,8 +17,9 @@ from .aggregate import (
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
 from .conditional import greatest, least
-from .numeric import bit_and, bit_or, bit_xor, int_hash_64
+from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
+from .string import byte_hamming_distance
 from .window import window

 __all__ = [
@@ -26,8 +27,10 @@ __all__ = [
     "array",
     "avg",
     "bit_and",
+    "bit_hamming_distance",
     "bit_or",
     "bit_xor",
+    "byte_hamming_distance",
     "case",
     "collect",
     "concat",
datachain/func/numeric.py
CHANGED

@@ -160,3 +160,49 @@ def int_hash_64(col: Union[ColT, int]) -> Func:
     return Func(
         "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int
     )
+
+
+def bit_hamming_distance(*args: Union[ColT, int]) -> Func:
+    """
+    Computes the Hamming distance between the bit representations of two integer values.
+
+    The Hamming distance is the number of positions at which the corresponding bits
+    are different. This function returns the dissimilarity between the integers,
+    where 0 indicates identical integers and values closer to the number of bits
+    in the integer indicate higher dissimilarity.
+
+    Args:
+        args (str | int): Two integers to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If an int is provided, it is assumed to be an integer literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.bit_hamming_distance("embed1", 123456),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, int):
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("bit_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "bit_hamming_distance",
+        inner=numeric.bit_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
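For reference, the distance this function computes reduces to XOR-and-popcount. A minimal pure-Python sketch of the same semantics (an illustration only; DataChain evaluates it in the database via the SQLite UDF shown further below):

    def bit_hamming(a: int, b: int) -> int:
        # XOR keeps exactly the bits where the operands differ,
        # so counting the remaining set bits gives the Hamming distance.
        return bin(a ^ b).count("1")

    assert bit_hamming(0b1010, 0b0011) == 2  # bits 0 and 3 differ
    assert bit_hamming(123456, 123456) == 0  # identical integers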
datachain/func/string.py
CHANGED

@@ -152,3 +152,49 @@ def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
         args = None

     return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str)
+
+
+def byte_hamming_distance(*args: Union[str, Func]) -> Func:
+    """
+    Computes the Hamming distance between two strings.
+
+    The Hamming distance is the number of positions at which the corresponding
+    characters are different. This function returns the dissimilarity between
+    the strings, where 0 indicates identical strings and values closer to the length
+    of the strings indicate higher dissimilarity.
+
+    Args:
+        args (str | literal): Two strings to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If a Literal is provided, it is assumed to be a string literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.byte_hamming_distance("file.phash", literal("hello")),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if get_origin(arg) is literal:
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("byte_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "byte_hamming_distance",
+        inner=string.byte_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
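The string variant counts positionwise character mismatches, with any length difference counted in full. A pure-Python sketch of those semantics (illustration only; the database-side UDF appears in datachain/sql/sqlite/base.py below):

    def byte_hamming(a: str, b: str) -> int:
        # Length difference counts fully; overlapping positions are compared pairwise.
        return abs(len(a) - len(b)) + sum(c1 != c2 for c1, c2 in zip(a, b))

    assert byte_hamming("karolin", "kathrin") == 3  # r/t, o/h, l/r differ
    assert byte_hamming("hello", "hell") == 1       # one trailing character unmatched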
datachain/lib/convert/flatten.py
CHANGED

@@ -1,19 +1,21 @@
+from collections.abc import Generator
+
 from pydantic import BaseModel

 from datachain.lib.model_store import ModelStore


-def flatten(obj: BaseModel):
+def flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))


-def flatten_list(obj_list):
+def flatten_list(obj_list: list[BaseModel]) -> tuple:
     return tuple(
         val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj)
     )


-def _flatten_list_field(value: list):
+def _flatten_list_field(value: list) -> list:
     assert isinstance(value, list)
     if value and ModelStore.is_pydantic(type(value[0])):
         return [val.model_dump() for val in value]
@@ -22,7 +24,7 @@ def _flatten_list_field(value: list):
     return value


-def _flatten_fields_values(fields, obj: BaseModel):
+def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator:
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
@@ -40,5 +42,5 @@ def _flatten_fields_values(fields, obj: BaseModel):
         yield value


-def _flatten(obj):
+def _flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))

datachain/lib/convert/unflatten.py
CHANGED

@@ -9,12 +9,12 @@ from pydantic import BaseModel
 from datachain.query.schema import DEFAULT_DELIMITER


-def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
+def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos: int = 0) -> dict:
     return unflatten_to_json_pos(model, row, pos)[0]


 def unflatten_to_json_pos(
-    model: type[BaseModel], row: Sequence[Any], pos=0
+    model: type[BaseModel], row: Sequence[Any], pos: int = 0
 ) -> tuple[dict, int]:
     res = {}
     for name, f_info in model.model_fields.items():

datachain/lib/convert/values_to_tuples.py
CHANGED

@@ -11,7 +11,7 @@ from datachain.lib.utils import DataChainParamsError


 class ValuesToTupleError(DataChainParamsError):
-    def __init__(self, ds_name, msg):
+    def __init__(self, ds_name: str, msg: str):
         if ds_name:
             ds_name = f"' {ds_name}'"
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
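The flatten helpers annotated above walk a pydantic model's declared fields and yield their values as a flat tuple. A toy illustration of the idea (the `Point` model here is hypothetical, and the library's real implementation also handles nested models and lists):

    from pydantic import BaseModel

    class Point(BaseModel):
        x: int
        y: int

    def flatten(obj: BaseModel) -> tuple:
        # Yield each declared field's value in declaration order.
        return tuple(getattr(obj, name) for name in type(obj).model_fields)

    print(flatten(Point(x=1, y=2)))  # -> (1, 2)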
datachain/lib/utils.py
CHANGED
datachain/query/dataset.py
CHANGED

@@ -215,7 +215,7 @@ class DatasetDiffOperation(Step):
     Should return select query that calculates desired diff between dataset queries
     """

-    def apply(self, query_generator, temp_tables: list[str]):
+    def apply(self, query_generator, temp_tables: list[str]) -> "StepResult":
         source_query = query_generator.exclude(("sys__id",))
         target_query = self.dq.apply_steps().select()
         temp_tables.extend(self.dq.temp_table_names)

datachain/sql/functions/numeric.py
CHANGED

@@ -35,9 +35,21 @@ class int_hash_64(GenericFunction):  # noqa: N801
     inherit_cache = True


+class bit_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two integers.
+    """
+
+    type = Int64()
+    package = "numeric"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(bit_and)
 compiler_not_implemented(bit_or)
 compiler_not_implemented(bit_xor)
 compiler_not_implemented(bit_rshift)
 compiler_not_implemented(bit_lshift)
 compiler_not_implemented(int_hash_64)
+compiler_not_implemented(bit_hamming_distance)

datachain/sql/functions/string.py
CHANGED

@@ -48,7 +48,19 @@ class replace(GenericFunction):  # noqa: N801
     inherit_cache = True


+class byte_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two strings.
+    """
+
+    type = Int64()
+    package = "string"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(length)
 compiler_not_implemented(split)
 compiler_not_implemented(regexp_replace)
 compiler_not_implemented(replace)
+compiler_not_implemented(byte_hamming_distance)
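Both additions follow SQLAlchemy's GenericFunction pattern: a subclass fixes the SQL name, package, and return type, and each dialect later registers a concrete compilation (as the sqlite hunks below do). A hedged, standalone sketch of that pattern, with hypothetical names rather than datachain's:

    from sqlalchemy import Integer, func, select
    from sqlalchemy.sql.functions import GenericFunction

    class hamming_distance(GenericFunction):  # noqa: N801
        type = Integer()           # return type carried through expressions
        package = "demo"           # namespace under the func accessor
        name = "hamming_distance"
        inherit_cache = True

    # The subclass is now reachable through the func namespace,
    # and compiles to a call with that name:
    print(select(func.demo.hamming_distance(10, 3)))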
datachain/sql/sqlite/base.py
CHANGED

@@ -90,6 +90,7 @@ def setup():
     compiles(string.split, "sqlite")(compile_string_split)
     compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace)
     compiles(string.replace, "sqlite")(compile_string_replace)
+    compiles(string.byte_hamming_distance, "sqlite")(compile_byte_hamming_distance)
     compiles(conditional.greatest, "sqlite")(compile_greatest)
     compiles(conditional.least, "sqlite")(compile_least)
     compiles(Values, "sqlite")(compile_values)
@@ -104,6 +105,7 @@ def setup():
     compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift)
     compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift)
     compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64)
+    compiles(numeric.bit_hamming_distance, "sqlite")(compile_bit_hamming_distance)

     if load_usearch_extension(sqlite3.connect(":memory:")):
         compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -191,6 +193,26 @@ def sqlite_int_hash_64(x: int) -> int:
     return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64)


+def sqlite_bit_hamming_distance(a: int, b: int) -> int:
+    """Calculate the Hamming distance between two integers."""
+    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
+    if hasattr(diff, "bit_count"):
+        return diff.bit_count()
+    return bin(diff).count("1")
+
+
+def sqlite_byte_hamming_distance(a: str, b: str) -> int:
+    """Calculate the Hamming distance between two strings."""
+    diff = 0
+    if len(a) < len(b):
+        diff = len(b) - len(a)
+        b = b[: len(a)]
+    elif len(b) < len(a):
+        diff = len(a) - len(b)
+        a = a[: len(b)]
+    return diff + sum(c1 != c2 for c1, c2 in zip(a, b))
+
+
 def register_user_defined_sql_functions() -> None:
     # Register optional functions if we have the necessary dependencies
     # and otherwise register functions that will raise an exception with
@@ -225,6 +247,9 @@ def register_user_defined_sql_functions() -> None:
             "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True
         )
         conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True)
+        conn.create_function(
+            "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True
+        )

     _registered_function_creators["numeric_functions"] = create_numeric_functions

@@ -237,6 +262,9 @@ def register_user_defined_sql_functions() -> None:
         conn.create_function(
             "regexp_replace", 3, sqlite_regexp_replace, deterministic=True
         )
+        conn.create_function(
+            "byte_hamming_distance", 2, sqlite_byte_hamming_distance, deterministic=True
+        )

     _registered_function_creators["string_functions"] = create_string_functions

@@ -383,6 +411,18 @@ def compile_int_hash_64(element, compiler, **kwargs):
     return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs)


+def compile_bit_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.bit_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
+def compile_byte_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.byte_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
 def py_json_array_length(arr):
     return len(orjson.loads(arr))

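On the SQLite side these are ordinary user-defined functions: sqlite3's create_function exposes a Python callable under a SQL name. A self-contained sketch of the mechanism (standalone; not datachain's registration path):

    import sqlite3

    def bit_hamming(a: int, b: int) -> int:
        # Same idea as sqlite_bit_hamming_distance above: XOR, then popcount.
        return bin(a ^ b).count("1")

    conn = sqlite3.connect(":memory:")
    conn.create_function("bit_hamming_distance", 2, bit_hamming, deterministic=True)
    print(conn.execute("SELECT bit_hamming_distance(10, 3)").fetchone()[0])  # 2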
datachain-0.7.10.dist-info/METADATA
ADDED

@@ -0,0 +1,207 @@
+Metadata-Version: 2.1
+Name: datachain
+Version: 0.7.10
+Summary: Wrangle unstructured AI data at scale
+Author-email: Dmitry Petrov <support@dvc.org>
+License: Apache-2.0
+Project-URL: Documentation, https://datachain.dvc.ai
+Project-URL: Issues, https://github.com/iterative/datachain/issues
+Project-URL: Source, https://github.com/iterative/datachain
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Development Status :: 2 - Pre-Alpha
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: pyyaml
+Requires-Dist: tomlkit
+Requires-Dist: tqdm
+Requires-Dist: numpy<3,>=1
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: pyarrow
+Requires-Dist: typing-extensions
+Requires-Dist: python-dateutil>=2
+Requires-Dist: attrs>=21.3.0
+Requires-Dist: s3fs>=2024.2.0
+Requires-Dist: gcsfs>=2024.2.0
+Requires-Dist: adlfs>=2024.2.0
+Requires-Dist: dvc-data<4,>=3.10
+Requires-Dist: dvc-objects<6,>=4
+Requires-Dist: shtab<2,>=1.3.4
+Requires-Dist: sqlalchemy>=2
+Requires-Dist: multiprocess==0.70.16
+Requires-Dist: cloudpickle
+Requires-Dist: orjson>=3.10.5
+Requires-Dist: pydantic<3,>=2
+Requires-Dist: jmespath>=1.0
+Requires-Dist: datamodel-code-generator>=0.25
+Requires-Dist: Pillow<12,>=10.0.0
+Requires-Dist: msgpack<2,>=1.0.4
+Requires-Dist: psutil
+Requires-Dist: huggingface_hub
+Requires-Dist: iterative-telemetry>=0.0.9
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client<1,>=0.21
+Requires-Dist: tabulate
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.2; extra == "docs"
+Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
+Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
+Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Provides-Extra: torch
+Requires-Dist: torch>=2.1.0; extra == "torch"
+Requires-Dist: torchvision; extra == "torch"
+Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: remote
+Requires-Dist: lz4; extra == "remote"
+Requires-Dist: requests>=2.22.0; extra == "remote"
+Provides-Extra: vector
+Requires-Dist: usearch; extra == "vector"
+Provides-Extra: hf
+Requires-Dist: numba>=0.60.0; extra == "hf"
+Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
+Provides-Extra: tests
+Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
+Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
+Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
+Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
+Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
+Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
+Requires-Dist: virtualenv; extra == "tests"
+Requires-Dist: dulwich; extra == "tests"
+Requires-Dist: hypothesis; extra == "tests"
+Requires-Dist: open_clip_torch; extra == "tests"
+Requires-Dist: aiotools>=1.7.0; extra == "tests"
+Requires-Dist: requests-mock; extra == "tests"
+Requires-Dist: scipy; extra == "tests"
+Provides-Extra: dev
+Requires-Dist: datachain[docs,tests]; extra == "dev"
+Requires-Dist: mypy==1.13.0; extra == "dev"
+Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-pytz; extra == "dev"
+Requires-Dist: types-PyYAML; extra == "dev"
+Requires-Dist: types-requests; extra == "dev"
+Requires-Dist: types-tabulate; extra == "dev"
+Provides-Extra: examples
+Requires-Dist: datachain[tests]; extra == "examples"
+Requires-Dist: numpy<2,>=1; extra == "examples"
+Requires-Dist: defusedxml; extra == "examples"
+Requires-Dist: accelerate; extra == "examples"
+Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
+Requires-Dist: pdfplumber==0.11.4; extra == "examples"
+Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
+Requires-Dist: onnx==1.16.1; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
+
+================
+|logo| DataChain
+================
+
+|PyPI| |Python Version| |Codecov| |Tests|
+
+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
+.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
+   :target: https://pypi.org/project/datachain/
+   :alt: PyPI
+.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
+   :target: https://pypi.org/project/datachain
+   :alt: Python Version
+.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/iterative/datachain
+   :alt: Codecov
+.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
+   :alt: Tests
+
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g. S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **ETL.** Pythonic framework for describing and running unstructured data transformations
+   and enrichments, applying models to data, including LLMs.
+2. **Analytics.** DataChain dataset is a table that combines all the information about data
+   objects in one place + it provides dataframe-like API and vecrorized engine to do analytics
+   on these tables at scale.
+3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
+   Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+
+
+Key Features
+============
+
+📂 **Multimodal Dataset Versioning.**
+   - Version unstructured data without moving or creating data copies, by supporting
+     references to S3, GCP, Azure, and local file systems.
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
+
+🐍 **Python-friendly.**
+   - Operate on Python objects and object fields: float scores, strings, matrixes,
+     LLM response objects.
+   - Run Python code in a high-scale, terabytes size datasets, with built-in
+     parallelization and memory-efficient computing — no SQL or Spark required.
+
+🧠 **Data Enrichment and Processing.**
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group datasets by metadata. Search by vector embeddings.
+   - High-performance vectorized operations on Python objects: sum, count, avg, etc.
+   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
+
+
+Getting Started
+===============
+
+Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
+
+
+Contributing
+============
+
+Contributions are very welcome. To learn more, see the `Contributor Guide`_.
+
+
+Community and Support
+=====================
+
+* `Docs <https://docs.datachain.ai/>`_
+* `File an issue`_ if you encounter any problems
+* `Discord Chat <https://dvc.org/chat>`_
+* `Email <mailto:support@dvc.org>`_
+* `Twitter <https://twitter.com/DVCorg>`_
+
+
+DataChain Studio Platform
+=========================
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code and dependency
+  dependencies in one place.
+- **Data Lineage** for data sources as well as derivative dataset.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team based collaboration.
+
+.. _PyPI: https://pypi.org/
+.. _file an issue: https://github.com/iterative/datachain/issues
+.. github-only
+.. _Contributor Guide: https://docs.datachain.ai/contributing
+.. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
+.. _SQLite: https://www.sqlite.org/
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/
{datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/RECORD
RENAMED

@@ -24,10 +24,10 @@ datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
-datachain/client/local.py,sha256=
+datachain/client/local.py,sha256=f2HBqWH8SQM5CyiJ0ljfePVROg2FszWaAn6E2c8RiLE,4596
 datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
@@ -37,16 +37,16 @@ datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
 datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
-datachain/func/__init__.py,sha256=
+datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
 datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
 datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
 datachain/func/conditional.py,sha256=mQroxsoExpBW84Zm5dAYP4OpBblWmzfnF2qJq9rba54,2223
 datachain/func/func.py,sha256=mJ_rOXMpoqnK4-d5eF9boSMx5hWzgKoMLPGpZQqLAfw,15222
-datachain/func/numeric.py,sha256=
+datachain/func/numeric.py,sha256=gMe1Ks0dqQKHkjcpvj7I5S-neECzQ_gltPQLNoaWOyo,5632
 datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
 datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
-datachain/func/string.py,sha256=
+datachain/func/string.py,sha256=8az3BTeezlaZt6NW-54GWX7WSosAOVMbTr6bXIYyJq4,5958
 datachain/func/window.py,sha256=0MB1yjpVbwOrl_WNLZ8V3jkJz3o0XlYinpAcZQJuxiA,1688
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
@@ -68,16 +68,16 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
 datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
-datachain/lib/utils.py,sha256=
+datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=
+datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
 datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
-datachain/lib/convert/unflatten.py,sha256=
-datachain/lib/convert/values_to_tuples.py,sha256=
+datachain/lib/convert/unflatten.py,sha256=5RLIEB7utQFcXlyUIRGqu6VtmAN4N4whlslpO7xMQyI,2026
+datachain/lib/convert/values_to_tuples.py,sha256=EFfIGBiVVltJQG8blzsQ1dGXneh4D3wdLfSUeoK10OI,3931
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
 datachain/model/bbox.py,sha256=1Li1G3RdiQwLOAc2Mak2nQU0bcvdH-lXmXtA984CUWM,3154
 datachain/model/pose.py,sha256=q9NgB8h66aKnYnLi7Pyf9bU-F_90W4cbvtSO3-_hkdk,3078
@@ -88,7 +88,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=eXr9fJz2grX2evmkmsiH0Xeqajd8gFnujmt_USMxy0c,54563
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -107,20 +107,20 @@ datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
 datachain/sql/functions/array.py,sha256=Zq59CaMHf_hFapU4kxvy2mwteH344k5Wksxja4MfBks,1204
 datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
-datachain/sql/functions/numeric.py,sha256=
+datachain/sql/functions/numeric.py,sha256=BK2KCiPSgM2IveCq-9M_PG3CtPBlztaS9TTn1LGzyLs,1250
 datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
-datachain/sql/functions/string.py,sha256=
+datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=
+datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,18634
 datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
+datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.10.dist-info/METADATA,sha256=qtw_rToRdmR9-CO6MFCAGv6NWJJ87C95iQaDEnDE4H8,8371
+datachain-0.7.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.10.dist-info/RECORD,,
datachain-0.7.9.dist-info/METADATA
DELETED

@@ -1,488 +0,0 @@
-Metadata-Version: 2.1
-Name: datachain
-Version: 0.7.9
-Summary: Wrangle unstructured AI data at scale
-Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
-Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/datachain/issues
-Project-URL: Source, https://github.com/iterative/datachain
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
-Description-Content-Type: text/x-rst
-License-File: LICENSE
-Requires-Dist: pyyaml
-Requires-Dist: tomlkit
-Requires-Dist: tqdm
-Requires-Dist: numpy<3,>=1
-Requires-Dist: pandas>=2.0.0
-Requires-Dist: pyarrow
-Requires-Dist: typing-extensions
-Requires-Dist: python-dateutil>=2
-Requires-Dist: attrs>=21.3.0
-Requires-Dist: s3fs>=2024.2.0
-Requires-Dist: gcsfs>=2024.2.0
-Requires-Dist: adlfs>=2024.2.0
-Requires-Dist: dvc-data<4,>=3.10
-Requires-Dist: dvc-objects<6,>=4
-Requires-Dist: shtab<2,>=1.3.4
-Requires-Dist: sqlalchemy>=2
-Requires-Dist: multiprocess==0.70.16
-Requires-Dist: cloudpickle
-Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<3,>=2
-Requires-Dist: jmespath>=1.0
-Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<12,>=10.0.0
-Requires-Dist: msgpack<2,>=1.0.4
-Requires-Dist: psutil
-Requires-Dist: huggingface_hub
-Requires-Dist: iterative-telemetry>=0.0.9
-Requires-Dist: platformdirs
-Requires-Dist: dvc-studio-client<1,>=0.21
-Requires-Dist: tabulate
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.5.2; extra == "docs"
-Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
-Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
-Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
-Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
-Provides-Extra: torch
-Requires-Dist: torch>=2.1.0; extra == "torch"
-Requires-Dist: torchvision; extra == "torch"
-Requires-Dist: transformers>=4.36.0; extra == "torch"
-Provides-Extra: remote
-Requires-Dist: lz4; extra == "remote"
-Requires-Dist: requests>=2.22.0; extra == "remote"
-Provides-Extra: vector
-Requires-Dist: usearch; extra == "vector"
-Provides-Extra: hf
-Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
-Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
-Requires-Dist: pytest<9,>=8; extra == "tests"
-Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
-Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
-Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
-Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
-Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
-Requires-Dist: virtualenv; extra == "tests"
-Requires-Dist: dulwich; extra == "tests"
-Requires-Dist: hypothesis; extra == "tests"
-Requires-Dist: open_clip_torch; extra == "tests"
-Requires-Dist: aiotools>=1.7.0; extra == "tests"
-Requires-Dist: requests-mock; extra == "tests"
-Requires-Dist: scipy; extra == "tests"
-Provides-Extra: dev
-Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.13.0; extra == "dev"
-Requires-Dist: types-python-dateutil; extra == "dev"
-Requires-Dist: types-pytz; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: types-requests; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Provides-Extra: examples
-Requires-Dist: datachain[tests]; extra == "examples"
-Requires-Dist: numpy<2,>=1; extra == "examples"
-Requires-Dist: defusedxml; extra == "examples"
-Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
-Requires-Dist: pdfplumber==0.11.4; extra == "examples"
-Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.37; extra == "examples"
-
-================
-|logo| DataChain
-================
-
-|PyPI| |Python Version| |Codecov| |Tests|
-
-.. |logo| image:: docs/assets/datachain.svg
-   :height: 24
-.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
-   :target: https://pypi.org/project/datachain/
-   :alt: PyPI
-.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
-   :target: https://pypi.org/project/datachain
-   :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
-   :target: https://codecov.io/gh/iterative/datachain
-   :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
-   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
-   :alt: Tests
-
-DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
-data like images, audio, videos, text and PDFs. It integrates with external storage
-(e.g., S3) to process data efficiently without data duplication and manages metadata
-in an internal database for easy and efficient querying.
-
-
-Use Cases
-=========
-
-1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
-   refining data in pre-training, finetuning or LLM evaluating stages.
-2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and
-   ad-hoc analytics using LLMs.
-
-Key Features
-============
-
-📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by supporting
-     references to S3, GCP, Azure, and local file systems.
-   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
-   - Unite files and metadata together into persistent, versioned, columnar datasets.
-
-🐍 **Python-friendly.**
-   - Operate on Python objects and object fields: float scores, strings, matrixes,
-     LLM response objects.
-   - Run Python code in a high-scale, terabytes size datasets, with built-in
-     parallelization and memory-efficient computing — no SQL or Spark required.
-
-🧠 **Data Enrichment and Processing.**
-   - Generate metadata using local AI models and LLM APIs.
-   - Filter, join, and group datasets by metadata. Search by vector embeddings.
-   - High-performance vectorized operations on Python objects: sum, count, avg, etc.
-   - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
-
-
-Quick Start
------------
-
-.. code:: console
-
-    $ pip install datachain
-
-
-Selecting files using JSON metadata
-======================================
-
-A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
-annotated with ground truth and model inferences in the 'json-pairs' format,
-where each image has a matching JSON file like `cat.1009.json`:
-
-.. code:: json
-
-    {
-        "class": "cat", "id": "1009", "num_annotators": 8,
-        "inference": {"class": "dog", "confidence": 0.68}
-    }
-
-Example of downloading only "high-confidence cat" inferred images using JSON metadata:
-
-
-.. code:: py
-
-    from datachain import Column, DataChain
-
-    meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
-    images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
-
-    images_id = images.map(id=lambda file: file.path.split('.')[-2])
-    annotated = images_id.merge(meta, on="id", right_on="meta.id")
-
-    likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
-                                   & (Column("meta.inference.class_") == "cat"))
-    likely_cats.export_files("high-confidence-cats/", signal="file")
-
-
-Data curation with a local AI model
-===================================
-Batch inference with a simple sentiment model using the `transformers` library:
-
-.. code:: shell
-
-    pip install transformers
-
-The code below downloads files from the cloud, and applies a user-defined function
-to each one of them. All files with a positive sentiment
-detected are then copied to the local directory.
-
-.. code:: py
-
-    from transformers import pipeline
-    from datachain import DataChain, Column
-
-    classifier = pipeline("sentiment-analysis", device="cpu",
-                    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
-
-    def is_positive_dialogue_ending(file) -> bool:
-        dialogue_ending = file.read()[-512:]
-        return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
-
-    chain = (
-       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
-                              object_name="file", type="text")
-       .settings(parallel=8, cache=True)
-       .map(is_positive=is_positive_dialogue_ending)
-       .save("file_response")
-    )
-
-    positive_chain = chain.filter(Column("is_positive") == True)
-    positive_chain.export_files("./output")
-
-    print(f"{positive_chain.count()} files were exported")
-
-
-
-13 files were exported
-
-.. code:: shell
-
-    $ ls output/datachain-demo/chatbot-KiT/
-    15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
-    $ ls output/datachain-demo/chatbot-KiT/ | wc -l
-    13
-
-
-LLM judging chatbots
-=============================
-
-LLMs can work as universal classifiers. In the example below,
-we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
-Mistral API key at https://console.mistral.ai
-
-
-.. code:: shell
-
-    $ pip install mistralai (Requires version >=1.0.0)
-    $ export MISTRAL_API_KEY=_your_key_
-
-DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
-
-.. code:: py
-
-    from mistralai import Mistral
-    from datachain import File, DataChain, Column
-
-    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
-
-    def eval_dialogue(file: File) -> bool:
-        client = Mistral()
-        response = client.chat.complete(
-            model="open-mixtral-8x22b",
-            messages=[{"role": "system", "content": PROMPT},
-                      {"role": "user", "content": file.read()}])
-        result = response.choices[0].message.content
-        return result.lower().startswith("success")
-
-    chain = (
-        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-        .settings(parallel=4, cache=True)
-        .map(is_success=eval_dialogue)
-        .save("mistral_files")
-    )
-
-    successful_chain = chain.filter(Column("is_success") == True)
-    successful_chain.export_files("./output_mistral")
-
-    print(f"{successful_chain.count()} files were exported")
-
-
-With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
-
-.. code:: shell
-
-    $ ls output_mistral/datachain-demo/chatbot-KiT/
-    1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
-    $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
-    31
-
-
-
-Serializing Python-objects
-==========================
-
-LLM responses may contain valuable information for analytics – such as the number of tokens used, or the
-model performance parameters.
-
-Instead of extracting this information from the Mistral response data structure (class
-`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
-
-
-.. code:: py
-
-    from mistralai import Mistral
-    from mistralai.models import ChatCompletionResponse
-    from datachain import File, DataChain, Column
-
-    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
-
-    def eval_dialog(file: File) -> ChatCompletionResponse:
-        client = MistralClient()
-        return client.chat(
-            model="open-mixtral-8x22b",
-            messages=[{"role": "system", "content": PROMPT},
-                      {"role": "user", "content": file.read()}])
-
-    chain = (
-        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-        .settings(parallel=4, cache=True)
-        .map(response=eval_dialog)
-        .map(status=lambda response: response.choices[0].message.content.lower()[:7])
-        .save("response")
-    )
-
-    chain.select("file.name", "status", "response.usage").show(5)
-
-    success_rate = chain.filter(Column("status") == "success").count() / chain.count()
-    print(f"{100*success_rate:.1f}% dialogs were successful")
-
-Output:
-
-.. code:: shell
-
-         file     status  response       response      response
-         name             usage          usage         usage
-                          prompt_tokens  total_tokens  completion_tokens
-    0    1.txt   success            547           548                  1
-    1   10.txt   failure           3576          3578                  2
-    2   11.txt   failure            626           628                  2
-    3   12.txt   failure           1144          1182                 38
-    4   13.txt   success           1100          1101                  1
-
-    [Limited by 5 rows]
-    64.0% dialogs were successful
-
-
-Iterating over Python data structures
-=============================================
-
-In the previous examples, datasets were saved in the embedded database
-(`SQLite`_ in folder `.datachain` of the working directory).
-These datasets were automatically versioned, and can be accessed using
-`DataChain.from_dataset("dataset_name")`.
-
-Here is how to retrieve a saved dataset and iterate over the objects:
-
-.. code:: py
-
-    chain = DataChain.from_dataset("response")
-
-    # Iterating one-by-one: support out-of-memory workflow
-    for file, response in chain.limit(5).collect("file", "response"):
-        # verify the collected Python objects
-        assert isinstance(response, ChatCompletionResponse)
-
-        status = response.choices[0].message.content[:7]
-        tokens = response.usage.total_tokens
-        print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
-
-Output:
-
-.. code:: shell
-
-    gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
-    gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
-    gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
-    gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
-    gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
-
-
-Vectorized analytics over Python objects
-========================================
-
-Some operations can run inside the DB without deserialization.
-For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
-
-.. code:: py
-
-    chain = DataChain.from_dataset("mistral_dataset")
-
-    cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
-        + chain.sum("response.usage.completion_tokens")*0.000006
-    print(f"Spent ${cost:.2f} on {chain.count()} calls")
-
-Output:
-
-.. code:: shell
-
-    Spent $0.08 on 50 calls
-
-
-PyTorch data loader
-===================
-
-Chain results can be exported or passed directly to PyTorch dataloader.
-For example, if we are interested in passing image and a label based on file
-name suffix, the following code will do it:
-
-.. code:: py
-
-    from torch.utils.data import DataLoader
-    from transformers import CLIPProcessor
-
-    from datachain import C, DataChain
-
-    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
-    chain = (
-        DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
-        .map(label=lambda name: name.split(".")[0], params=["file.name"])
-        .select("file", "label").to_pytorch(
-            transform=processor.image_processor,
-            tokenizer=processor.tokenizer,
-        )
-    )
-    loader = DataLoader(chain, batch_size=1)
-
-
-DataChain Studio Platform
--------------------------
-
-`DataChain Studio`_ is a proprietary solution for teams that offers:
-
-- **Centralized dataset registry** to manage data, code and dependency
-  dependencies in one place.
-- **Data Lineage** for data sources as well as direvative dataset.
-- **UI for Multimodal Data** like images, videos, and PDFs.
-- **Scalable Compute** to handle large datasets (100M+ files) and in-house
-  AI model inference.
-- **Access control** including SSO and team based collaboration.
-
-Tutorials
----------
-
-* `Getting Started`_
-* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
-* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
-* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
-
-
-Contributions
--------------
-
-Contributions are very welcome.
-To learn more, see the `Contributor Guide`_.
-
-
-Community and Support
----------------------
-
-* `Docs <https://datachain.dvc.ai/>`_
-* `File an issue`_ if you encounter any problems
-* `Discord Chat <https://dvc.org/chat>`_
-* `Email <mailto:support@dvc.org>`_
-* `Twitter <https://twitter.com/DVCorg>`_
-
-
-.. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/datachain/issues
-.. github-only
-.. _Contributor Guide: CONTRIBUTING.rst
-.. _Pydantic: https://github.com/pydantic/pydantic
-.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
-.. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://docs.datachain.ai/
-.. _DataChain Studio: https://studio.datachain.ai/
{datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/LICENSE
File without changes

{datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/WHEEL
File without changes

{datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/entry_points.txt
File without changes

{datachain-0.7.9.dist-info → datachain-0.7.10.dist-info}/top_level.txt
File without changes