datachain 0.7.9__py3-none-any.whl → 0.7.10__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

datachain/client/fsspec.py CHANGED
@@ -172,7 +172,7 @@ class Client(ABC):
         return url == cls.PREFIX
 
     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
         return StorageURI(f"{cls.PREFIX}{name}")
@@ -278,7 +278,9 @@ class Client(ABC):
     ) -> None:
         await self._fetch_nested(start_prefix, result_queue)
 
-    async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]:
+    async def _fetch_dir(
+        self, prefix: str, pbar, result_queue: ResultQueue
+    ) -> set[str]:
         path = f"{self.name}/{prefix}"
         infos = await self.ls_dir(path)
         files = []
datachain/client/local.py CHANGED
@@ -12,6 +12,7 @@ from datachain.lib.file import File
 from .fsspec import Client
 
 if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
     from datachain.dataset import StorageURI
 
 
@@ -21,7 +22,11 @@ class FileClient(Client):
     protocol = "file"
 
     def __init__(
-        self, name: str, fs_kwargs: dict[str, Any], cache, use_symlinks: bool = False
+        self,
+        name: str,
+        fs_kwargs: dict[str, Any],
+        cache: "DataChainCache",
+        use_symlinks: bool = False,
     ) -> None:
         super().__init__(name, fs_kwargs, cache)
         self.use_symlinks = use_symlinks
@@ -30,7 +35,7 @@ class FileClient(Client):
         raise TypeError("Signed urls are not implemented for local file system")
 
     @classmethod
-    def get_uri(cls, name) -> "StorageURI":
+    def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
         return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
@@ -77,7 +82,7 @@ class FileClient(Client):
         return bucket, path
 
     @classmethod
-    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
+    def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
@@ -85,7 +90,7 @@ class FileClient(Client):
     def from_source(
         cls,
         uri: str,
-        cache,
+        cache: "DataChainCache",
         use_symlinks: bool = False,
         **kwargs,
     ) -> "FileClient":
datachain/func/__init__.py CHANGED
@@ -17,8 +17,9 @@ from .aggregate import (
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
 from .conditional import greatest, least
-from .numeric import bit_and, bit_or, bit_xor, int_hash_64
+from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
+from .string import byte_hamming_distance
 from .window import window
 
 __all__ = [
@@ -26,8 +27,10 @@ __all__ = [
     "array",
     "avg",
     "bit_and",
+    "bit_hamming_distance",
     "bit_or",
     "bit_xor",
+    "byte_hamming_distance",
     "case",
     "collect",
     "concat",
datachain/func/numeric.py CHANGED
@@ -160,3 +160,49 @@ def int_hash_64(col: Union[ColT, int]) -> Func:
     return Func(
         "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int
     )
+
+
+def bit_hamming_distance(*args: Union[ColT, int]) -> Func:
+    """
+    Computes the Hamming distance between the bit representations of two integer values.
+
+    The Hamming distance is the number of positions at which the corresponding bits
+    are different. This function returns the dissimilarity between the integers,
+    where 0 indicates identical integers and values closer to the number of bits
+    in the integer indicate higher dissimilarity.
+
+    Args:
+        args (str | int): Two integers to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If an int is provided, it is assumed to be an integer literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.bit_hamming_distance("embed1", 123456),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, int):
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("bit_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "bit_hamming_distance",
+        inner=numeric.bit_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
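For intuition, here is a plain-Python model of what the backend computes for `bit_hamming_distance` (this is not datachain's code path; the SQLite implementation appears further below):

```py
def bit_hamming(a: int, b: int) -> int:
    # XOR the 64-bit two's-complement patterns, then count the set bits.
    # bin().count("1") is used instead of int.bit_count() so this also runs
    # on Python < 3.10, mirroring the fallback in the SQLite UDF below.
    mask = (1 << 64) - 1
    return bin((a & mask) ^ (b & mask)).count("1")

assert bit_hamming(0b1010, 0b0110) == 2  # bits 2 and 3 differ
assert bit_hamming(7, 7) == 0            # identical values
```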
datachain/func/string.py CHANGED
@@ -152,3 +152,49 @@ def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
         args = None
 
     return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str)
+
+
+def byte_hamming_distance(*args: Union[str, Func]) -> Func:
+    """
+    Computes the Hamming distance between two strings.
+
+    The Hamming distance is the number of positions at which the corresponding
+    characters are different. This function returns the dissimilarity between
+    the strings, where 0 indicates identical strings and values closer to the length
+    of the strings indicate higher dissimilarity.
+
+    Args:
+        args (str | literal): Two strings to compute the Hamming distance between.
+            If a str is provided, it is assumed to be the name of the column.
+            If a literal is provided, it is assumed to be a string literal.
+
+    Returns:
+        Func: A Func object that represents the Hamming distance function.
+
+    Example:
+        ```py
+        dc.mutate(
+            ham_dist=func.byte_hamming_distance("file.phash", literal("hello")),
+        )
+        ```
+
+    Notes:
+        - Result column will always be of type int.
+    """
+    cols, func_args = [], []
+    for arg in args:
+        if get_origin(arg) is literal:
+            func_args.append(arg)
+        else:
+            cols.append(arg)
+
+    if len(cols) + len(func_args) != 2:
+        raise ValueError("byte_hamming_distance() requires exactly two arguments")
+
+    return Func(
+        "byte_hamming_distance",
+        inner=string.byte_hamming_distance,
+        cols=cols,
+        args=func_args,
+        result_type=int,
+    )
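Likewise, a plain-Python model of the `byte_hamming_distance` semantics: mismatched positions, plus the length difference for unequal-length strings (matching the SQLite UDF added in this release):

```py
def byte_hamming(a: str, b: str) -> int:
    # Count positions where the characters differ; any trailing characters of
    # the longer string count as mismatches.
    shorter, longer = sorted((a, b), key=len)
    return (len(longer) - len(shorter)) + sum(
        c1 != c2 for c1, c2 in zip(shorter, longer)
    )

assert byte_hamming("karolin", "kathrin") == 3  # classic Hamming example
assert byte_hamming("abc", "abcde") == 2        # two extra characters
```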
datachain/lib/convert/flatten.py CHANGED
@@ -1,19 +1,21 @@
+from collections.abc import Generator
+
 from pydantic import BaseModel
 
 from datachain.lib.model_store import ModelStore
 
 
-def flatten(obj: BaseModel):
+def flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))
 
 
-def flatten_list(obj_list):
+def flatten_list(obj_list: list[BaseModel]) -> tuple:
     return tuple(
         val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj)
     )
 
 
-def _flatten_list_field(value: list):
+def _flatten_list_field(value: list) -> list:
     assert isinstance(value, list)
     if value and ModelStore.is_pydantic(type(value[0])):
         return [val.model_dump() for val in value]
@@ -22,7 +24,7 @@ def _flatten_list_field(value: list):
     return value
 
 
-def _flatten_fields_values(fields, obj: BaseModel):
+def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator:
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
@@ -40,5 +42,5 @@ def _flatten_fields_values(fields, obj: BaseModel):
         yield value
 
 
-def _flatten(obj):
+def _flatten(obj: BaseModel) -> tuple:
     return tuple(_flatten_fields_values(obj.model_fields, obj))
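A hedged illustration of the newly annotated return shape, using a toy model (based on the code above, `flatten` yields field values in declaration order):

```py
from pydantic import BaseModel

from datachain.lib.convert.flatten import flatten

class Point(BaseModel):
    x: int
    y: int

# Field values are emitted in declaration order and collected into a tuple.
assert flatten(Point(x=1, y=2)) == (1, 2)
```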
datachain/lib/convert/unflatten.py CHANGED
@@ -9,12 +9,12 @@ from pydantic import BaseModel
 from datachain.query.schema import DEFAULT_DELIMITER
 
 
-def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
+def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos: int = 0) -> dict:
     return unflatten_to_json_pos(model, row, pos)[0]
 
 
 def unflatten_to_json_pos(
-    model: type[BaseModel], row: Sequence[Any], pos=0
+    model: type[BaseModel], row: Sequence[Any], pos: int = 0
 ) -> tuple[dict, int]:
     res = {}
     for name, f_info in model.model_fields.items():
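And the inverse direction, sketched with the same toy model: `unflatten_to_json` rebuilds a nested dict from a flattened row, starting at the now explicitly typed position `pos`:

```py
from pydantic import BaseModel

from datachain.lib.convert.unflatten import unflatten_to_json

class Point(BaseModel):
    x: int
    y: int

# pos=0, the new explicit int default, starts reading the row at its beginning.
assert unflatten_to_json(Point, (1, 2)) == {"x": 1, "y": 2}
```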
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -11,7 +11,7 @@ from datachain.lib.utils import DataChainParamsError
 
 
 class ValuesToTupleError(DataChainParamsError):
-    def __init__(self, ds_name, msg):
+    def __init__(self, ds_name: str, msg: str):
         if ds_name:
             ds_name = f"' {ds_name}'"
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
datachain/lib/utils.py CHANGED
@@ -28,7 +28,7 @@ class DataChainParamsError(DataChainError):
 
 
 class DataChainColumnError(DataChainParamsError):
-    def __init__(self, col_name, msg):
+    def __init__(self, col_name: str, msg: str):
         super().__init__(f"Error for column {col_name}: {msg}")
 
 
datachain/query/dataset.py CHANGED
@@ -215,7 +215,7 @@ class DatasetDiffOperation(Step):
     Should return select query that calculates desired diff between dataset queries
     """
 
-    def apply(self, query_generator, temp_tables: list[str]):
+    def apply(self, query_generator, temp_tables: list[str]) -> "StepResult":
         source_query = query_generator.exclude(("sys__id",))
         target_query = self.dq.apply_steps().select()
         temp_tables.extend(self.dq.temp_table_names)
datachain/sql/functions/numeric.py CHANGED
@@ -35,9 +35,21 @@ class int_hash_64(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class bit_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two integers.
+    """
+
+    type = Int64()
+    package = "numeric"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(bit_and)
 compiler_not_implemented(bit_or)
 compiler_not_implemented(bit_xor)
 compiler_not_implemented(bit_rshift)
 compiler_not_implemented(bit_lshift)
 compiler_not_implemented(int_hash_64)
+compiler_not_implemented(bit_hamming_distance)
datachain/sql/functions/string.py CHANGED
@@ -48,7 +48,19 @@ class replace(GenericFunction):  # noqa: N801
     inherit_cache = True
 
 
+class byte_hamming_distance(GenericFunction):  # noqa: N801
+    """
+    Returns the Hamming distance between two strings.
+    """
+
+    type = Int64()
+    package = "string"
+    name = "hamming_distance"
+    inherit_cache = True
+
+
 compiler_not_implemented(length)
 compiler_not_implemented(split)
 compiler_not_implemented(regexp_replace)
 compiler_not_implemented(replace)
+compiler_not_implemented(byte_hamming_distance)
datachain/sql/sqlite/base.py CHANGED
@@ -90,6 +90,7 @@ def setup():
     compiles(string.split, "sqlite")(compile_string_split)
     compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace)
     compiles(string.replace, "sqlite")(compile_string_replace)
+    compiles(string.byte_hamming_distance, "sqlite")(compile_byte_hamming_distance)
     compiles(conditional.greatest, "sqlite")(compile_greatest)
     compiles(conditional.least, "sqlite")(compile_least)
     compiles(Values, "sqlite")(compile_values)
@@ -104,6 +105,7 @@ def setup():
     compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift)
     compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift)
     compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64)
+    compiles(numeric.bit_hamming_distance, "sqlite")(compile_bit_hamming_distance)
 
     if load_usearch_extension(sqlite3.connect(":memory:")):
         compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -191,6 +193,26 @@ def sqlite_int_hash_64(x: int) -> int:
     return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64)
 
 
+def sqlite_bit_hamming_distance(a: int, b: int) -> int:
+    """Calculate the Hamming distance between two integers."""
+    diff = (a & MAX_INT64) ^ (b & MAX_INT64)
+    if hasattr(diff, "bit_count"):
+        return diff.bit_count()
+    return bin(diff).count("1")
+
+
+def sqlite_byte_hamming_distance(a: str, b: str) -> int:
+    """Calculate the Hamming distance between two strings."""
+    diff = 0
+    if len(a) < len(b):
+        diff = len(b) - len(a)
+        b = b[: len(a)]
+    elif len(b) < len(a):
+        diff = len(a) - len(b)
+        a = a[: len(b)]
+    return diff + sum(c1 != c2 for c1, c2 in zip(a, b))
+
+
 def register_user_defined_sql_functions() -> None:
     # Register optional functions if we have the necessary dependencies
     # and otherwise register functions that will raise an exception with
@@ -225,6 +247,9 @@ def register_user_defined_sql_functions() -> None:
             "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True
         )
         conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True)
+        conn.create_function(
+            "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True
+        )
 
     _registered_function_creators["numeric_functions"] = create_numeric_functions
 
@@ -237,6 +262,9 @@ def register_user_defined_sql_functions() -> None:
         conn.create_function(
             "regexp_replace", 3, sqlite_regexp_replace, deterministic=True
        )
+        conn.create_function(
+            "byte_hamming_distance", 2, sqlite_byte_hamming_distance, deterministic=True
+        )
 
     _registered_function_creators["string_functions"] = create_string_functions
 
@@ -383,6 +411,18 @@ def compile_int_hash_64(element, compiler, **kwargs):
     return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs)
 
 
+def compile_bit_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.bit_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
+def compile_byte_hamming_distance(element, compiler, **kwargs):
+    return compiler.process(
+        func.byte_hamming_distance(*element.clauses.clauses), **kwargs
+    )
+
+
 def py_json_array_length(arr):
     return len(orjson.loads(arr))
 
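Since the UDFs above are plain Python callables registered per connection, their behavior can be checked against a bare in-memory SQLite database, outside datachain's connection plumbing. A hedged sketch (`MAX_INT64` is an assumption, chosen to match the masking logic shown above):

```py
import sqlite3

MAX_INT64 = (1 << 64) - 1  # assumption: matches the module-level constant

def sqlite_bit_hamming_distance(a: int, b: int) -> int:
    # Same logic as the UDF above, with the portable popcount fallback.
    return bin((a & MAX_INT64) ^ (b & MAX_INT64)).count("1")

conn = sqlite3.connect(":memory:")
# Same name, arity, and determinism flag that
# register_user_defined_sql_functions() uses for the real connection.
conn.create_function(
    "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True
)
print(conn.execute("SELECT bit_hamming_distance(10, 6)").fetchone()[0])  # -> 2
```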
datachain-0.7.10.dist-info/METADATA ADDED
@@ -0,0 +1,207 @@
+Metadata-Version: 2.1
+Name: datachain
+Version: 0.7.10
+Summary: Wrangle unstructured AI data at scale
+Author-email: Dmitry Petrov <support@dvc.org>
+License: Apache-2.0
+Project-URL: Documentation, https://datachain.dvc.ai
+Project-URL: Issues, https://github.com/iterative/datachain/issues
+Project-URL: Source, https://github.com/iterative/datachain
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Development Status :: 2 - Pre-Alpha
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: pyyaml
+Requires-Dist: tomlkit
+Requires-Dist: tqdm
+Requires-Dist: numpy<3,>=1
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: pyarrow
+Requires-Dist: typing-extensions
+Requires-Dist: python-dateutil>=2
+Requires-Dist: attrs>=21.3.0
+Requires-Dist: s3fs>=2024.2.0
+Requires-Dist: gcsfs>=2024.2.0
+Requires-Dist: adlfs>=2024.2.0
+Requires-Dist: dvc-data<4,>=3.10
+Requires-Dist: dvc-objects<6,>=4
+Requires-Dist: shtab<2,>=1.3.4
+Requires-Dist: sqlalchemy>=2
+Requires-Dist: multiprocess==0.70.16
+Requires-Dist: cloudpickle
+Requires-Dist: orjson>=3.10.5
+Requires-Dist: pydantic<3,>=2
+Requires-Dist: jmespath>=1.0
+Requires-Dist: datamodel-code-generator>=0.25
+Requires-Dist: Pillow<12,>=10.0.0
+Requires-Dist: msgpack<2,>=1.0.4
+Requires-Dist: psutil
+Requires-Dist: huggingface_hub
+Requires-Dist: iterative-telemetry>=0.0.9
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client<1,>=0.21
+Requires-Dist: tabulate
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.2; extra == "docs"
+Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
+Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
+Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Provides-Extra: torch
+Requires-Dist: torch>=2.1.0; extra == "torch"
+Requires-Dist: torchvision; extra == "torch"
+Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: remote
+Requires-Dist: lz4; extra == "remote"
+Requires-Dist: requests>=2.22.0; extra == "remote"
+Provides-Extra: vector
+Requires-Dist: usearch; extra == "vector"
+Provides-Extra: hf
+Requires-Dist: numba>=0.60.0; extra == "hf"
+Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
+Provides-Extra: tests
+Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
+Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
+Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
+Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
+Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
+Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
+Requires-Dist: virtualenv; extra == "tests"
+Requires-Dist: dulwich; extra == "tests"
+Requires-Dist: hypothesis; extra == "tests"
+Requires-Dist: open_clip_torch; extra == "tests"
+Requires-Dist: aiotools>=1.7.0; extra == "tests"
+Requires-Dist: requests-mock; extra == "tests"
+Requires-Dist: scipy; extra == "tests"
+Provides-Extra: dev
+Requires-Dist: datachain[docs,tests]; extra == "dev"
+Requires-Dist: mypy==1.13.0; extra == "dev"
+Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-pytz; extra == "dev"
+Requires-Dist: types-PyYAML; extra == "dev"
+Requires-Dist: types-requests; extra == "dev"
+Requires-Dist: types-tabulate; extra == "dev"
+Provides-Extra: examples
+Requires-Dist: datachain[tests]; extra == "examples"
+Requires-Dist: numpy<2,>=1; extra == "examples"
+Requires-Dist: defusedxml; extra == "examples"
+Requires-Dist: accelerate; extra == "examples"
+Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
+Requires-Dist: pdfplumber==0.11.4; extra == "examples"
+Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
+Requires-Dist: onnx==1.16.1; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
+
+================
+|logo| DataChain
+================
+
+|PyPI| |Python Version| |Codecov| |Tests|
+
+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
+.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
+   :target: https://pypi.org/project/datachain/
+   :alt: PyPI
+.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
+   :target: https://pypi.org/project/datachain
+   :alt: Python Version
+.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/iterative/datachain
+   :alt: Codecov
+.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
+   :alt: Tests
+
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g. S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **ETL.** A Pythonic framework for describing and running unstructured data transformations
+   and enrichments, applying models to data, including LLMs.
+2. **Analytics.** A DataChain dataset is a table that combines all the information about data
+   objects in one place, and provides a dataframe-like API and a vectorized engine to run
+   analytics on these tables at scale.
+3. **Versioning.** DataChain doesn't store data, nor does it require moving or copying it
+   (unlike DVC). The perfect use case is a bucket with thousands or millions of images,
+   videos, audio files, or PDFs.
+
+
+Key Features
+============
+
+📂 **Multimodal Dataset Versioning.**
+   - Version unstructured data without moving or creating data copies, by supporting
+     references to S3, GCP, Azure, and local file systems.
+   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
+   - Unite files and metadata together into persistent, versioned, columnar datasets.
+
+🐍 **Python-friendly.**
+   - Operate on Python objects and object fields: float scores, strings, matrices,
+     LLM response objects.
+   - Run Python code on high-scale, terabyte-size datasets, with built-in
+     parallelization and memory-efficient computing — no SQL or Spark required.
+
+🧠 **Data Enrichment and Processing.**
+   - Generate metadata using local AI models and LLM APIs.
+   - Filter, join, and group datasets by metadata. Search by vector embeddings.
+   - High-performance vectorized operations on Python objects: sum, count, avg, etc.
+   - Pass datasets to PyTorch and TensorFlow, or export them back into storage.
+
+
+Getting Started
+===============
+
+Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
+
+
+Contributing
+============
+
+Contributions are very welcome. To learn more, see the `Contributor Guide`_.
+
+
+Community and Support
+=====================
+
+* `Docs <https://docs.datachain.ai/>`_
+* `File an issue`_ if you encounter any problems
+* `Discord Chat <https://dvc.org/chat>`_
+* `Email <mailto:support@dvc.org>`_
+* `Twitter <https://twitter.com/DVCorg>`_
+
+
+DataChain Studio Platform
+=========================
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code, and
+  dependencies in one place.
+- **Data Lineage** for data sources as well as derivative datasets.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team-based collaboration.
+
+.. _PyPI: https://pypi.org/
+.. _file an issue: https://github.com/iterative/datachain/issues
+.. github-only
+.. _Contributor Guide: https://docs.datachain.ai/contributing
+.. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
+.. _SQLite: https://www.sqlite.org/
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/
datachain-0.7.9.dist-info/RECORD → datachain-0.7.10.dist-info/RECORD RENAMED
@@ -24,10 +24,10 @@ datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=KDGLhJMnive73hI8GABeP_aQZv1w5M_6rxz6KRRxaHI,12712
+datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
-datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
+datachain/client/local.py,sha256=f2HBqWH8SQM5CyiJ0ljfePVROg2FszWaAn6E2c8RiLE,4596
 datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
@@ -37,16 +37,16 @@ datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
 datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
-datachain/func/__init__.py,sha256=oz-GbCcp5jnN82u6cghWTGzmU9IQvtvllOof73wE52g,934
+datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
 datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
 datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
 datachain/func/conditional.py,sha256=mQroxsoExpBW84Zm5dAYP4OpBblWmzfnF2qJq9rba54,2223
 datachain/func/func.py,sha256=mJ_rOXMpoqnK4-d5eF9boSMx5hWzgKoMLPGpZQqLAfw,15222
-datachain/func/numeric.py,sha256=GcUX6ijZvzfac8CZrHE0gRc9WCPiutcMLKqNXtbn-Yo,4186
+datachain/func/numeric.py,sha256=gMe1Ks0dqQKHkjcpvj7I5S-neECzQ_gltPQLNoaWOyo,5632
 datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
 datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
-datachain/func/string.py,sha256=NQzaXXYu7yb72HPADy4WrFlcgvTS77L9x7-qvCKJtnk,4522
+datachain/func/string.py,sha256=8az3BTeezlaZt6NW-54GWX7WSosAOVMbTr6bXIYyJq4,5958
 datachain/func/window.py,sha256=0MB1yjpVbwOrl_WNLZ8V3jkJz3o0XlYinpAcZQJuxiA,1688
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
@@ -68,16 +68,16 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
 datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
-datachain/lib/utils.py,sha256=om-MCiyYwvPHtFq3V2rBKrRDNkio9XXofj7RsUIlHKU,1586
+datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
+datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
 datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
-datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
-datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9YH_pOCpHSkdc,3921
+datachain/lib/convert/unflatten.py,sha256=5RLIEB7utQFcXlyUIRGqu6VtmAN4N4whlslpO7xMQyI,2026
+datachain/lib/convert/values_to_tuples.py,sha256=EFfIGBiVVltJQG8blzsQ1dGXneh4D3wdLfSUeoK10OI,3931
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
 datachain/model/bbox.py,sha256=1Li1G3RdiQwLOAc2Mak2nQU0bcvdH-lXmXtA984CUWM,3154
 datachain/model/pose.py,sha256=q9NgB8h66aKnYnLi7Pyf9bU-F_90W4cbvtSO3-_hkdk,3078
@@ -88,7 +88,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=J6SbCLnFlZgCxRchc3tVk5tcC7xo1Hp616JGlEZXCDo,54547
+datachain/query/dataset.py,sha256=eXr9fJz2grX2evmkmsiH0Xeqajd8gFnujmt_USMxy0c,54563
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -107,20 +107,20 @@ datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
 datachain/sql/functions/array.py,sha256=Zq59CaMHf_hFapU4kxvy2mwteH344k5Wksxja4MfBks,1204
 datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
-datachain/sql/functions/numeric.py,sha256=DFTTEWsvBBXwbaaC4zdxhAoqUYwI6nbymG-nzbzdPv8,972
+datachain/sql/functions/numeric.py,sha256=BK2KCiPSgM2IveCq-9M_PG3CtPBlztaS9TTn1LGzyLs,1250
 datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
-datachain/sql/functions/string.py,sha256=DYgiw8XSk7ge7GXvyRI1zbaMruIizNeI-puOjriQGZQ,1148
+datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=eQv2U32jChG9tnYSFE4SS2Mvfb7-W3Ok3Ffhew9qkKI,17254
+datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,18634
 datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.9.dist-info/METADATA,sha256=iu58cwfGQVYTwn53symALXVpe9292EWXdOly2MWuPZY,18006
-datachain-0.7.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.9.dist-info/RECORD,,
+datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.10.dist-info/METADATA,sha256=qtw_rToRdmR9-CO6MFCAGv6NWJJ87C95iQaDEnDE4H8,8371
+datachain-0.7.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.10.dist-info/RECORD,,
datachain-0.7.9.dist-info/METADATA DELETED
@@ -1,488 +0,0 @@
-Metadata-Version: 2.1
-Name: datachain
-Version: 0.7.9
-Summary: Wrangle unstructured AI data at scale
-Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
-Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/datachain/issues
-Project-URL: Source, https://github.com/iterative/datachain
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
-Description-Content-Type: text/x-rst
-License-File: LICENSE
-Requires-Dist: pyyaml
-Requires-Dist: tomlkit
-Requires-Dist: tqdm
-Requires-Dist: numpy<3,>=1
-Requires-Dist: pandas>=2.0.0
-Requires-Dist: pyarrow
-Requires-Dist: typing-extensions
-Requires-Dist: python-dateutil>=2
-Requires-Dist: attrs>=21.3.0
-Requires-Dist: s3fs>=2024.2.0
-Requires-Dist: gcsfs>=2024.2.0
-Requires-Dist: adlfs>=2024.2.0
-Requires-Dist: dvc-data<4,>=3.10
-Requires-Dist: dvc-objects<6,>=4
-Requires-Dist: shtab<2,>=1.3.4
-Requires-Dist: sqlalchemy>=2
-Requires-Dist: multiprocess==0.70.16
-Requires-Dist: cloudpickle
-Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<3,>=2
-Requires-Dist: jmespath>=1.0
-Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<12,>=10.0.0
-Requires-Dist: msgpack<2,>=1.0.4
-Requires-Dist: psutil
-Requires-Dist: huggingface_hub
-Requires-Dist: iterative-telemetry>=0.0.9
-Requires-Dist: platformdirs
-Requires-Dist: dvc-studio-client<1,>=0.21
-Requires-Dist: tabulate
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.5.2; extra == "docs"
-Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
-Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
-Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
-Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
-Provides-Extra: torch
-Requires-Dist: torch>=2.1.0; extra == "torch"
-Requires-Dist: torchvision; extra == "torch"
-Requires-Dist: transformers>=4.36.0; extra == "torch"
-Provides-Extra: remote
-Requires-Dist: lz4; extra == "remote"
-Requires-Dist: requests>=2.22.0; extra == "remote"
-Provides-Extra: vector
-Requires-Dist: usearch; extra == "vector"
-Provides-Extra: hf
-Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
-Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
-Requires-Dist: pytest<9,>=8; extra == "tests"
-Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
-Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
-Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
-Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
-Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
-Requires-Dist: virtualenv; extra == "tests"
-Requires-Dist: dulwich; extra == "tests"
-Requires-Dist: hypothesis; extra == "tests"
-Requires-Dist: open_clip_torch; extra == "tests"
-Requires-Dist: aiotools>=1.7.0; extra == "tests"
-Requires-Dist: requests-mock; extra == "tests"
-Requires-Dist: scipy; extra == "tests"
-Provides-Extra: dev
-Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.13.0; extra == "dev"
-Requires-Dist: types-python-dateutil; extra == "dev"
-Requires-Dist: types-pytz; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: types-requests; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Provides-Extra: examples
-Requires-Dist: datachain[tests]; extra == "examples"
-Requires-Dist: numpy<2,>=1; extra == "examples"
-Requires-Dist: defusedxml; extra == "examples"
-Requires-Dist: accelerate; extra == "examples"
-Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
-Requires-Dist: pdfplumber==0.11.4; extra == "examples"
-Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.37; extra == "examples"
-
-================
-|logo| DataChain
-================
-
-|PyPI| |Python Version| |Codecov| |Tests|
-
-.. |logo| image:: docs/assets/datachain.svg
-   :height: 24
-.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
-   :target: https://pypi.org/project/datachain/
-   :alt: PyPI
-.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
-   :target: https://pypi.org/project/datachain
-   :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
-   :target: https://codecov.io/gh/iterative/datachain
-   :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
-   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
-   :alt: Tests
-
-DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
-data like images, audio, videos, text and PDFs. It integrates with external storage
-(e.g., S3) to process data efficiently without data duplication and manages metadata
-in an internal database for easy and efficient querying.
-
-
-Use Cases
-=========
-
-1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
-   refining data in pre-training, fine-tuning, or LLM evaluation stages.
-2. **GenAI Data Analytics**: enables advanced analytics for multimodal data and
-   ad-hoc analytics using LLMs.
-
-Key Features
-============
-
-📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by supporting
-     references to S3, GCP, Azure, and local file systems.
-   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
-   - Unite files and metadata together into persistent, versioned, columnar datasets.
-
-🐍 **Python-friendly.**
-   - Operate on Python objects and object fields: float scores, strings, matrices,
-     LLM response objects.
-   - Run Python code on high-scale, terabyte-size datasets, with built-in
-     parallelization and memory-efficient computing — no SQL or Spark required.
-
-🧠 **Data Enrichment and Processing.**
-   - Generate metadata using local AI models and LLM APIs.
-   - Filter, join, and group datasets by metadata. Search by vector embeddings.
-   - High-performance vectorized operations on Python objects: sum, count, avg, etc.
-   - Pass datasets to PyTorch and TensorFlow, or export them back into storage.
-
-
-Quick Start
------------
-
-.. code:: console
-
-   $ pip install datachain
-
-
-Selecting files using JSON metadata
-===================================
-
-A storage location consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
-annotated with ground truth and model inferences in the 'json-pairs' format,
-where each image has a matching JSON file like `cat.1009.json`:
-
-.. code:: json
-
-   {
-       "class": "cat", "id": "1009", "num_annotators": 8,
-       "inference": {"class": "dog", "confidence": 0.68}
-   }
-
-Example of downloading only "high-confidence cat" inferred images using JSON metadata:
-
-.. code:: py
-
-   from datachain import Column, DataChain
-
-   meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
-   images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
-
-   images_id = images.map(id=lambda file: file.path.split('.')[-2])
-   annotated = images_id.merge(meta, on="id", right_on="meta.id")
-
-   likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
-                                  & (Column("meta.inference.class_") == "cat"))
-   likely_cats.export_files("high-confidence-cats/", signal="file")
-
-
-Data curation with a local AI model
-===================================
-
-Batch inference with a simple sentiment model using the `transformers` library:
-
-.. code:: shell
-
-   pip install transformers
-
-The code below downloads files from the cloud and applies a user-defined function
-to each one of them. All files where a positive sentiment is detected are then
-copied to the local directory.
-
-.. code:: py
-
-   from transformers import pipeline
-   from datachain import DataChain, Column
-
-   classifier = pipeline("sentiment-analysis", device="cpu",
-                         model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
-
-   def is_positive_dialogue_ending(file) -> bool:
-       dialogue_ending = file.read()[-512:]
-       return classifier(dialogue_ending)[0]["label"] == "POSITIVE"
-
-   chain = (
-       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
-                              object_name="file", type="text")
-       .settings(parallel=8, cache=True)
-       .map(is_positive=is_positive_dialogue_ending)
-       .save("file_response")
-   )
-
-   positive_chain = chain.filter(Column("is_positive") == True)
-   positive_chain.export_files("./output")
-
-   print(f"{positive_chain.count()} files were exported")
-
-13 files were exported
-
-.. code:: shell
-
-   $ ls output/datachain-demo/chatbot-KiT/
-   15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ...
-   $ ls output/datachain-demo/chatbot-KiT/ | wc -l
-   13
-
-
-LLM judging chatbots
-====================
-
-LLMs can work as universal classifiers. In the example below,
-we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs.
-Please get a free Mistral API key at https://console.mistral.ai
-
-.. code:: shell
-
-   $ pip install 'mistralai>=1.0.0'
-   $ export MISTRAL_API_KEY=_your_key_
-
-DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
-
-.. code:: py
-
-   from mistralai import Mistral
-   from datachain import File, DataChain, Column
-
-   PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
-
-   def eval_dialogue(file: File) -> bool:
-       client = Mistral()
-       response = client.chat.complete(
-           model="open-mixtral-8x22b",
-           messages=[{"role": "system", "content": PROMPT},
-                     {"role": "user", "content": file.read()}])
-       result = response.choices[0].message.content
-       return result.lower().startswith("success")
-
-   chain = (
-       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-       .settings(parallel=4, cache=True)
-       .map(is_success=eval_dialogue)
-       .save("mistral_files")
-   )
-
-   successful_chain = chain.filter(Column("is_success") == True)
-   successful_chain.export_files("./output_mistral")
-
-   print(f"{successful_chain.count()} files were exported")
-
-With the instruction above, the Mistral model considers 31 of the 50 files to hold successful dialogues:
-
-.. code:: shell
-
-   $ ls output_mistral/datachain-demo/chatbot-KiT/
-   1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
-   $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
-   31
-
-
-Serializing Python objects
-==========================
-
-LLM responses may contain valuable information for analytics, such as the number of
-tokens used or the model performance parameters.
-
-Instead of extracting this information from the Mistral response data structure (class
-`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
-
-.. code:: py
-
-   from mistralai import Mistral
-   from mistralai.models import ChatCompletionResponse
-   from datachain import File, DataChain, Column
-
-   PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
-
-   def eval_dialog(file: File) -> ChatCompletionResponse:
-       client = Mistral()
-       return client.chat.complete(
-           model="open-mixtral-8x22b",
-           messages=[{"role": "system", "content": PROMPT},
-                     {"role": "user", "content": file.read()}])
-
-   chain = (
-       DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-       .settings(parallel=4, cache=True)
-       .map(response=eval_dialog)
-       .map(status=lambda response: response.choices[0].message.content.lower()[:7])
-       .save("response")
-   )
-
-   chain.select("file.name", "status", "response.usage").show(5)
-
-   success_rate = chain.filter(Column("status") == "success").count() / chain.count()
-   print(f"{100*success_rate:.1f}% dialogs were successful")
-
-Output:
-
-.. code:: shell
-
-        file   status       response      response           response
-        name                   usage         usage              usage
-                       prompt_tokens  total_tokens  completion_tokens
-   0   1.txt  success            547           548                  1
-   1  10.txt  failure           3576          3578                  2
-   2  11.txt  failure            626           628                  2
-   3  12.txt  failure           1144          1182                 38
-   4  13.txt  success           1100          1101                  1
-
-   [Limited by 5 rows]
-   64.0% dialogs were successful
-
-
-Iterating over Python data structures
-=====================================
-
-In the previous examples, datasets were saved in the embedded database
-(`SQLite`_ in the `.datachain` folder of the working directory).
-These datasets are automatically versioned and can be accessed using
-`DataChain.from_dataset("dataset_name")`.
-
-Here is how to retrieve a saved dataset and iterate over the objects:
-
-.. code:: py
-
-   chain = DataChain.from_dataset("response")
-
-   # Iterating one by one: supports out-of-memory workflows
-   for file, response in chain.limit(5).collect("file", "response"):
-       # verify the collected Python objects
-       assert isinstance(response, ChatCompletionResponse)
-
-       status = response.choices[0].message.content[:7]
-       tokens = response.usage.total_tokens
-       print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}")
-
-Output:
-
-.. code:: shell
-
-   gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548
-   gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578
-   gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628
-   gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207
-   gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101
-
-
-Vectorized analytics over Python objects
-========================================
-
-Some operations can run inside the DB without deserialization.
-For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral
-call costs $2 per 1M input tokens and $6 per 1M output tokens:
-
-.. code:: py
-
-   chain = DataChain.from_dataset("mistral_dataset")
-
-   cost = chain.sum("response.usage.prompt_tokens")*0.000002 \
-        + chain.sum("response.usage.completion_tokens")*0.000006
-   print(f"Spent ${cost:.2f} on {chain.count()} calls")
-
-Output:
-
-.. code:: shell
-
-   Spent $0.08 on 50 calls
-
-
-PyTorch data loader
-===================
-
-Chain results can be exported or passed directly to a PyTorch data loader.
-For example, to pass an image and a label derived from the file name suffix,
-the following code will do it:
-
-.. code:: py
-
-   from torch.utils.data import DataLoader
-   from transformers import CLIPProcessor
-
-   from datachain import C, DataChain
-
-   processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
-   chain = (
-       DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
-       .map(label=lambda name: name.split(".")[0], params=["file.name"])
-       .select("file", "label").to_pytorch(
-           transform=processor.image_processor,
-           tokenizer=processor.tokenizer,
-       )
-   )
-   loader = DataLoader(chain, batch_size=1)
-
-
-DataChain Studio Platform
--------------------------
-
-`DataChain Studio`_ is a proprietary solution for teams that offers:
-
-- **Centralized dataset registry** to manage data, code, and
-  dependencies in one place.
-- **Data Lineage** for data sources as well as derivative datasets.
-- **UI for Multimodal Data** like images, videos, and PDFs.
-- **Scalable Compute** to handle large datasets (100M+ files) and in-house
-  AI model inference.
-- **Access control** including SSO and team-based collaboration.
-
-Tutorials
----------
-
-* `Getting Started`_
-* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
-* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
-* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
-
-
-Contributions
--------------
-
-Contributions are very welcome.
-To learn more, see the `Contributor Guide`_.
-
-
-Community and Support
----------------------
-
-* `Docs <https://datachain.dvc.ai/>`_
-* `File an issue`_ if you encounter any problems
-* `Discord Chat <https://dvc.org/chat>`_
-* `Email <mailto:support@dvc.org>`_
-* `Twitter <https://twitter.com/DVCorg>`_
-
-
-.. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/datachain/issues
-.. github-only
-.. _Contributor Guide: CONTRIBUTING.rst
-.. _Pydantic: https://github.com/pydantic/pydantic
-.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
-.. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://docs.datachain.ai/
-.. _DataChain Studio: https://studio.datachain.ai/