datachain 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic.
datachain/__init__.py CHANGED
@@ -1,4 +0,0 @@
-try:
-    from ._version import version as __version__
-except ImportError:
-    __version__ = "UNKNOWN"
datachain/cli.py CHANGED
@@ -5,13 +5,14 @@ import sys
 import traceback
 from argparse import SUPPRESS, Action, ArgumentParser, ArgumentTypeError, Namespace
 from collections.abc import Iterable, Iterator, Mapping, Sequence
+from importlib.metadata import PackageNotFoundError, version
 from itertools import chain
 from multiprocessing import freeze_support
 from typing import TYPE_CHECKING, Optional, Union
 
 import shtab
 
-from datachain import __version__, utils
+from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.utils import DataChainDir
 
@@ -96,6 +97,12 @@ def add_show_args(parser: ArgumentParser) -> None:
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
+    try:
+        __version__ = version("datachain")
+    except PackageNotFoundError:
+        # package is not installed
+        __version__ = "unknown"
+
     parser = ArgumentParser(
         description="DataChain: Wrangle unstructured AI data at scale", prog="datachain"
     )
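Note on the change above: the CLI now resolves its version at runtime through importlib.metadata instead of importing __version__ from the package (whose _version.py is deleted in this release). A minimal sketch of the same lookup pattern, runnable on Python 3.8+:

    from importlib.metadata import PackageNotFoundError, version

    try:
        pkg_version = version("datachain")  # distribution name, not module name
    except PackageNotFoundError:
        # e.g. running from a source tree that was never pip-installed
        pkg_version = "unknown"

    print(pkg_version)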
datachain/data_storage/schema.py CHANGED
@@ -31,7 +31,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     c_set: dict[str, sa.Column] = {}
     for c in columns:
-        if ec := c_set.get(c.name, None):
+        if (ec := c_set.get(c.name, None)) is not None:
             if str(ec.type) != str(c.type):
                 raise ValueError(
                     f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
@@ -171,8 +171,8 @@ class DataTable:
     ):
         # copy columns, since re-using the same objects from another table
         # may raise an error
-        columns = [cls.copy_column(c) for c in columns if c.name != "id"]
-        columns = [sa.Column("id", Int, primary_key=True), *columns]
+        columns = cls.sys_columns() + [cls.copy_column(c) for c in columns]
+        columns = dedup_columns(columns)
 
         if metadata is None:
             metadata = sa.MetaData()
@@ -230,11 +230,17 @@ class DataTable:
     def delete(self):
         return self.apply_conditions(self.table.delete())
 
+    @staticmethod
+    def sys_columns():
+        return [
+            sa.Column("id", Int, primary_key=True),
+            sa.Column("random", Int64, nullable=False, default=f.random()),
+        ]
+
     @classmethod
     def file_columns(cls) -> list[sa.Column]:
         return [
-            sa.Column("id", Int, primary_key=True),
-            sa.Column("random", Int64, nullable=False),
+            *cls.sys_columns(),
             sa.Column("vtype", String, nullable=False, index=True),
             sa.Column("dir_type", Int, index=True),
             sa.Column("parent", String, index=True),
datachain/data_storage/sqlite.py CHANGED
@@ -33,6 +33,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
+from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
 from datachain.storage import StorageURI
 from datachain.utils import DataChainDir
@@ -114,6 +115,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
                 db.set_trace_callback(print)
 
+            load_usearch_extension(db)
+
             return cls(engine, MetaData(), db, db_file)
         except RuntimeError:
             raise DataChainError("Can't connect to SQLite DB") from None
datachain/lib/cached_stream.py CHANGED
@@ -1,6 +1,3 @@
-import os
-import shutil
-import tempfile
 from abc import ABC
 from contextlib import AbstractContextManager
 
@@ -8,9 +5,7 @@ from datachain.cache import UniqueId
 
 
 class AbstractCachedStream(AbstractContextManager, ABC):
-    def __init__(self, stream, size, catalog, uid: UniqueId):
-        self.stream = stream
-        self.size = size
+    def __init__(self, catalog, uid: UniqueId):
         self.catalog = catalog
         self.uid = uid
         self.mode = "rb"
@@ -19,86 +14,9 @@ class AbstractCachedStream(AbstractContextManager, ABC):
         self.mode = mode
 
 
-class ProgressiveCacheStream(AbstractCachedStream):
-    BUF_SIZE = 4096
-
-    def __init__(self, stream, size, catalog, uid: UniqueId):
-        super().__init__(stream, size, catalog, uid)
-
-        self.target_path = self.catalog.cache.path_from_checksum(self.uid.get_hash())
-        self.cached_file = None
-
-        self.temp_file = None
-        self.temp_file_pos = 0
-
-    def __enter__(self):
-        if os.path.exists(self.target_path):
-            self.cached_file = open(self.target_path, mode=self.mode)
-            return self.cached_file
-
-        tmp_dir = self.catalog.cache.tmp_dir
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-        self.temp_file = tempfile.NamedTemporaryFile(
-            prefix=str(self.uid.get_hash()), dir=tmp_dir, delete=False
-        )
-        return self
-
-    def __exit__(self, *args):
-        self.close()
-
-    def read(self, size=-1):
-        buf = self.stream.read(size)
-        pos = self.stream.tell()
-
-        if pos >= self.temp_file_pos:
-            self._cache_catch_up(pos, buf)
-
-        return buf
-
-    def close(self):
-        if self.cached_file:
-            self.cached_file.close()
-
-        if self.temp_file:
-            if self.temp_file_pos < self.size:
-                self._cache_catch_up(self.size)
-
-            self.temp_file.close()
-            if not os.path.exists(self.target_path):
-                os.makedirs(os.path.dirname(self.target_path), exist_ok=True)
-                shutil.move(self.temp_file.name, self.target_path)
-
-        self.stream.close()
-
-    def _cache_catch_up(self, pos_target, latest_buf=None):
-        pos_to_restore = self.stream.tell()
-        try:
-            remainder = pos_target - self.temp_file_pos
-            self.stream.seek(self.temp_file_pos)
-            while remainder > 0:
-                chunk_size = min(self.BUF_SIZE, remainder)
-                buf = self.stream.read(chunk_size)
-                self._cache_update(buf)
-                remainder -= len(buf)
-        finally:
-            self.stream.seek(pos_to_restore)
-
-    def _cache_update(self, buf):
-        length = len(buf)
-        self.temp_file.write(buf)
-        self.temp_file_pos += length
-
-    def seek(self, offset, whence=0):
-        return self.stream.seek(offset, whence)
-
-    def tell(self):
-        return self.stream.tell()
-
-
 class PreCachedStream(AbstractCachedStream):
-    def __init__(self, stream, size, catalog, uid: UniqueId):
-        super().__init__(stream, size, catalog, uid)
+    def __init__(self, catalog, uid: UniqueId):
+        super().__init__(catalog, uid)
         self.client = self.catalog.get_client(self.uid.storage)
         self.cached_file = None
 
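With ProgressiveCacheStream removed, cached streams no longer wrap an externally supplied file object and size: the remaining classes are built from the catalog plus a UniqueId and fetch data through the catalog's client themselves. A rough usage sketch under that assumption (the enter/exit behavior is not shown in this diff, so treat this as illustrative only):

    # hypothetical: `catalog` and `uid` obtained elsewhere, e.g. uid = file.get_uid()
    stream = PreCachedStream(catalog, uid)
    stream.set_mode("r")   # text mode, as TextFile._set_stream does
    with stream as f:      # AbstractContextManager protocol
        data = f.read()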
datachain/lib/dc.py CHANGED
@@ -39,6 +39,8 @@ if TYPE_CHECKING:
     import pandas as pd
     from typing_extensions import Self
 
+    from datachain.catalog import Catalog
+
 C = Column
 
@@ -200,10 +202,12 @@ class DataChain(DatasetQuery):
     def from_storage(
         cls,
         path,
+        *,
         type: Literal["binary", "text", "image"] = "binary",
+        catalog: Optional["Catalog"] = None,
         recursive: Optional[bool] = True,
         anon: bool = False,
-    ) -> "DataChain":
+    ) -> "Self":
         """Get data from a storage as a list of file with all file attributes. It
         returns the chain itself as usual.
 
@@ -220,7 +224,7 @@
         ```
         """
         func = get_file(type)
-        return DataChain(path, recursive=recursive, anon=anon).map(file=func)
+        return cls(path, catalog=catalog, recursive=recursive, anon=anon).map(file=func)
 
     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
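Everything after path is now keyword-only, so call sites must spell the options out; a short usage sketch (the bucket URI is illustrative):

    from datachain.lib.dc import DataChain

    chain = DataChain.from_storage("s3://my-bucket/images/", type="image", anon=True)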
datachain/lib/feature.py CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 from functools import lru_cache
 from types import GenericAlias
 from typing import (
+    TYPE_CHECKING,
     Any,
     ClassVar,
     Literal,
@@ -39,6 +40,9 @@ from datachain.sql.types import (
     String,
 )
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
 FeatureStandardType = Union[
     type[int],
     type[str],
@@ -158,7 +162,7 @@ class Feature(BaseModel):
         s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
 
-    def _set_stream(self, catalog, stream=None, caching_enabled: bool = False) -> None:
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
         pass
 
     @classmethod
datachain/lib/feature_registry.py CHANGED
@@ -1,6 +1,7 @@
+import logging
 from typing import Any, ClassVar, Optional
 
-from datachain.cli import logger
+logger = logging.getLogger(__name__)
 
 
 class Registry:
@@ -16,7 +17,7 @@ class Registry:
         version = fr._version  # type: ignore[attr-defined]
         if version in cls.reg[name]:
             full_name = f"{name}@{version}"
-            logger.warning(f"Feature {full_name} is already registered")
+            logger.warning("Feature %s is already registered", full_name)
         cls.reg[name][version] = fr
 
     @classmethod
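Besides dropping the import from datachain.cli, the warning now uses %-style arguments, which defers string formatting until the record is actually emitted. A minimal illustration of the difference:

    import logging

    logger = logging.getLogger(__name__)
    full_name = "MyFeature@2"

    logger.warning(f"Feature {full_name} is already registered")   # formats eagerly
    logger.warning("Feature %s is already registered", full_name)  # formats only if emitted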
datachain/lib/file.py CHANGED
@@ -2,11 +2,10 @@ import json
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 
-from fsspec import Callback
 from fsspec.implementations.local import LocalFileSystem
 from pydantic import Field, field_validator
 
@@ -18,6 +17,9 @@ from datachain.lib.utils import DataChainError
 from datachain.sql.types import JSON, Int, String
 from datachain.utils import TIME_ZERO
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
 
 class FileFeature(Feature):
     _is_file = True
@@ -182,26 +184,17 @@ class File(FileFeature):
 
     def open(self):
         if self._stream is None:
-            if self._catalog is None:
-                raise FileError(self, "stream is not set")
-            self._stream = self._open_stream()
+            raise FileError(self, "stream is not set")
 
         if self.location:
             return VFileRegistry.resolve(self, self.location)
 
         return self._stream
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        if self._catalog is None and catalog is None:
-            raise DataChainError(f"Cannot set file '{stream}' without catalog")
-
-        if catalog:
-            self._catalog = catalog
-
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        self._catalog = catalog
         stream_class = PreCachedStream if caching_enabled else PreDownloadStream
-        self._stream = stream_class(stream, self.size, self._catalog, self.get_uid())
+        self._stream = stream_class(self._catalog, self.get_uid())
         self._caching_enabled = caching_enabled
 
     def get_uid(self) -> UniqueId:
@@ -232,11 +225,6 @@ class File(FileFeature):
     def get_uri(self):
         return f"{self.source}/{self.get_full_name()}"
 
-    def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
-        client = self._catalog.get_client(self.source)
-        uid = self.get_uid()
-        return client.open_object(uid, use_cache=cache, cb=cb)
-
     def get_path(self) -> str:
         path = unquote(self.get_uri())
         fs = self.get_fs()
@@ -258,10 +246,8 @@ class TextFile(File):
         super().__init__(**kwargs)
         self._stream = None
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        super()._set_stream(catalog, stream, caching_enabled)
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        super()._set_stream(catalog, caching_enabled)
         self._stream.set_mode("r")
 
datachain/lib/udf.py CHANGED
@@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, Callable, Optional
 from datachain.lib.feature import Feature
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import DataChainError, DataChainParamsError
-from datachain.query import Stream, udf
+from datachain.query import udf
 
 if TYPE_CHECKING:
-    from dvxc.query.udf import UDFWrapper
+    from datachain.query.udf import UDFWrapper
 
 
 class UdfError(DataChainParamsError):
@@ -34,11 +34,6 @@ class UDFBase:
 
         params_spec = params.to_udf_spec()
         self.params_spec = list(params_spec.keys())
-        self._contains_stream = False
-        if params.contains_file():
-            self.params_spec.insert(0, Stream())  # type: ignore[arg-type]
-            self._contains_stream = True
-
         self.output_spec = output.to_udf_spec()
 
         self._catalog = None
@@ -122,18 +117,10 @@ class UDFBase:
             rows = [rows]
         objs = []
         for row in rows:
-            if self._contains_stream:
-                stream, *row = row
-            else:
-                stream = None
-
             obj_row = self.params.row_to_objs(row)
-
-            if self._contains_stream:
-                for obj in obj_row:
-                    if isinstance(obj, Feature):
-                        obj._set_stream(self._catalog, stream, True)
-
+            for obj in obj_row:
+                if isinstance(obj, Feature):
+                    obj._set_stream(self._catalog, caching_enabled=True)
             objs.append(obj_row)
         return objs
 
@@ -150,13 +137,7 @@ class UDFBase:
             output_map[name] = []
 
         for flat_obj in group:
-            if self._contains_stream:
-                position = 1
-                stream = flat_obj[0]
-            else:
-                position = 0
-                stream = None
-
+            position = 0
             for signal, (cls, length) in spec_map.items():
                 slice = flat_obj[position : position + length]
                 position += length
@@ -167,7 +148,7 @@ class UDFBase:
                     obj = slice[0]
 
                 if isinstance(obj, Feature):
-                    obj._set_stream(self._catalog, stream)
+                    obj._set_stream(self._catalog)
                 output_map[signal].append(obj)
 
         return list(output_map.values())
datachain/query/dataset.py CHANGED
@@ -1737,22 +1737,16 @@ class DatasetQuery:
 
         # Exclude the id column and let the db create it to avoid unique
         # constraint violations.
-        cols = [col.name for col in dr.get_table().c if col.name != "id"]
-        assert cols
         q = query.exclude(("id",))
-
         if q._order_by_clauses:
             # ensuring we have id sorted by order by clause if it exists in a query
             q = q.add_columns(
                 f.row_number().over(order_by=q._order_by_clauses).label("id")
             )
-            cols.append("id")
-
-        self.catalog.warehouse.db.execute(
-            sqlalchemy.insert(dr.get_table()).from_select(cols, q),
-            **kwargs,
-        )
 
+        cols = tuple(c.name for c in q.columns)
+        insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
+        self.catalog.warehouse.db.execute(insert_q, **kwargs)
         self.catalog.metastore.update_dataset_status(
             dataset, DatasetStatus.COMPLETE, version=version
         )
datachain/sql/sqlite/base.py CHANGED
@@ -71,8 +71,6 @@ def setup():
     compiles(sql_path.name, "sqlite")(compile_path_name)
     compiles(sql_path.file_stem, "sqlite")(compile_path_file_stem)
     compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
-    compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
-    compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
     compiles(array.length, "sqlite")(compile_array_length)
     compiles(string.length, "sqlite")(compile_string_length)
     compiles(string.split, "sqlite")(compile_string_split)
@@ -81,6 +79,13 @@ def setup():
     compiles(Values, "sqlite")(compile_values)
     compiles(random.rand, "sqlite")(compile_rand)
 
+    if load_usearch_extension(sqlite3.connect(":memory:")):
+        compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
+        compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance_ext)
+    else:
+        compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
+        compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
+
     register_user_defined_sql_functions()
     setup_is_complete = True
 
@@ -246,11 +251,23 @@ def compile_path_file_ext(element, compiler, **kwargs):
     return compiler.process(path_file_ext(*element.clauses.clauses), **kwargs)
 
 
+def compile_cosine_distance_ext(element, compiler, **kwargs):
+    run_compiler_hook("cosine_distance")
+    return f"distance_cosine_f32({compiler.process(element.clauses, **kwargs)})"
+
+
 def compile_cosine_distance(element, compiler, **kwargs):
     run_compiler_hook("cosine_distance")
     return f"cosine_distance({compiler.process(element.clauses, **kwargs)})"
 
 
+def compile_euclidean_distance_ext(element, compiler, **kwargs):
+    run_compiler_hook("euclidean_distance")
+    return (
+        f"sqrt(distance_sqeuclidean_f32({compiler.process(element.clauses, **kwargs)}))"
+    )
+
+
def compile_euclidean_distance(element, compiler, **kwargs):
    run_compiler_hook("euclidean_distance")
    return f"euclidean_distance({compiler.process(element.clauses, **kwargs)})"
@@ -330,3 +347,18 @@ def compile_values(element, compiler, **kwargs):
 
 def compile_rand(element, compiler, **kwargs):
     return compiler.process(func.random(), **kwargs)
+
+
+def load_usearch_extension(conn) -> bool:
+    try:
+        # usearch is part of the vector optional dependencies
+        # we use the extension's cosine and euclidean distance functions
+        from usearch import sqlite_path
+
+        conn.enable_load_extension(True)
+        conn.load_extension(sqlite_path())
+        conn.enable_load_extension(False)
+        return True
+
+    except Exception:  # noqa: BLE001
+        return False
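load_usearch_extension() makes the SIMD-backed distance functions opt-in: when the usearch wheel (the new vector extra) is importable, its bundled SQLite extension is loaded and the *_ext compilers above take over; otherwise the names fall back to the Python UDFs. A hedged probe sketch, assuming (as the compilers above imply) that the extension functions accept JSON-array strings:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    if load_usearch_extension(conn):
        (dist,) = conn.execute(
            "SELECT distance_cosine_f32('[1.0, 0.0]', '[0.0, 1.0]')"
        ).fetchone()
        print(dist)  # ~1.0 for orthogonal vectors
    else:
        print("usearch not installed; Python fallbacks will be used")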
datachain/sql/sqlite/vector.py CHANGED
@@ -1,15 +1,23 @@
-import json
+import math
 
 import numpy as np
-from scipy.spatial import distance
 
 
 def euclidean_distance(a: str, b: str):
-    a_np = np.array(json.loads(a))
-    b_np = np.array(json.loads(b))
+    a_np = np.fromstring(a[1:-1], sep=",")
+    b_np = np.fromstring(b[1:-1], sep=",")
 
     return np.linalg.norm(b_np - a_np)
 
 
 def cosine_distance(a: str, b: str):
-    return distance.cosine(json.loads(a), json.loads(b))
+    u = np.fromstring(a[1:-1], sep=",")
+    v = np.fromstring(b[1:-1], sep=",")
+
+    uv = np.inner(u, v)
+    uu = np.inner(u, u)
+    vv = np.inner(v, v)
+
+    dist = 1.0 - uv / math.sqrt(uu * vv)
+
+    return max(0, min(dist, 2.0))
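This fallback drops the scipy dependency entirely: vectors arrive as bracketed comma-separated strings, a[1:-1] strips the brackets, np.fromstring parses the rest, and the cosine result 1 - u.v / sqrt((u.u)(v.v)) is clamped to [0, 2] to absorb floating-point drift. Expected behavior, for example:

    euclidean_distance("[0, 0]", "[3, 4]")  # 5.0
    cosine_distance("[1, 0]", "[0, 1]")     # 1.0 (orthogonal vectors)
    cosine_distance("[1, 2]", "[2, 4]")     # 0.0 (parallel vectors)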
datachain-0.2.0.dist-info/METADATA → datachain-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.0
+Version: 0.2.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -79,7 +79,7 @@ Requires-Dist: open-clip-torch ; extra == 'tests'
 Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
 Requires-Dist: requests-mock ; extra == 'tests'
 Provides-Extra: vector
-Requires-Dist: scipy ; extra == 'vector'
+Requires-Dist: usearch ; extra == 'vector'
 
 |PyPI| |Python Version| |Codecov| |Tests| |License|
 
datachain-0.2.0.dist-info/RECORD → datachain-0.2.1.dist-info/RECORD CHANGED
@@ -1,9 +1,8 @@
-datachain/__init__.py,sha256=9a0qX6tqyA9KC3ahLmGarqlRTZJXhM7HijAWpfUaOnQ,102
+datachain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
-datachain/_version.py,sha256=H-qsvrxCpdhaQzyddR-yajEqI71hPxLa4KxzpP3uS1g,411
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
-datachain/cli.py,sha256=FLKRimIq917Dq0EmG3yLzMTqDaMA0vyCRUREOobUspY,32256
+datachain/cli.py,sha256=lInqYMhk8YuPY-ZWkfWZmE-ZmdIChJgbs305-a_MWpo,32457
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -33,19 +32,19 @@ datachain/data_storage/db_engine.py,sha256=mxOoWP4ntBMgLeTAk4dlEeIJArAz4x_tFrHyt
 datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
 datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
-datachain/data_storage/schema.py,sha256=FrhmeZ_btT1CfVisa4ScabS11ixZ3xn3d_whvVsBtDA,8700
+datachain/data_storage/schema.py,sha256=t58LexPOCam_vWV0W52otEDNXgtFPHX3QFApEncFy2s,8809
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=eHTiJ0VIxU-chnhKNTN14EsaSnw5LAaxTLi9aMCZpl4,24978
+datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
 datachain/data_storage/warehouse.py,sha256=tL2mYoXVZe-coKLTRXEJ0sMdEr2BD0GwgIWip5PP5CM,33300
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=7lAas8hSh3vL7S7s2KOlkYn4viQpfVbM_FQ_hLCh5oc,2593
-datachain/lib/cached_stream.py,sha256=BQI6gpJ2y7_-jqQo_0VB9ntbkOVISvj9wlDwGDQbqw8,3537
+datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
 datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
-datachain/lib/dc.py,sha256=szYQC4FOoYDMlSEDAPWZ25z4Nn-WeoaKiqKwwXbOJws,35355
-datachain/lib/feature.py,sha256=KiPiMrU8ec-bJuUs70Xh4jytZdzKk9puQNQnx03K-po,12057
-datachain/lib/feature_registry.py,sha256=YQsLYChNkYK6p2MpcVfAyBybtfN5EMiOJ8LIYakjmeQ,1602
+datachain/lib/dc.py,sha256=PBbEZhSPnbvB6jh2eTgZyDSouAGbjgEv8xabW45_vmk,35460
+datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
+datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
 datachain/lib/feature_utils.py,sha256=LIK233IWGWFhuav5Rm8de0xIOSnuwA1ubk6OYrxrfN0,4712
-datachain/lib/file.py,sha256=K0jH8Q5Xle2TiVDTCzmopku_7Lh-IVufV_mgtaCNHYI,8744
+datachain/lib/file.py,sha256=GQrqGgCEHICrUTdzTz_yhXqJWiae9EPTte1sd3hKeEU,8246
 datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
 datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
 datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
@@ -58,7 +57,7 @@ datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
 datachain/lib/signal_schema.py,sha256=KaH194dAH8Zt8FtlNAgdVqcZlJc42y7RbcB37ldPPAY,11688
 datachain/lib/text.py,sha256=EEZrYohADi5rAGg3aLLRwtvyAV9js_yWAGhr2C3QbwI,2424
-datachain/lib/udf.py,sha256=kPc_6fQ4DzbiYiXvbps7QPlJWTu9MSCS8eUfGqOhjG4,6124
+datachain/lib/udf.py,sha256=D9TMxkAvj3zPRnZmkCxadEDtiG3B45t2xAEpuO14MOQ,5600
 datachain/lib/udf_signature.py,sha256=DAWMQ0dvFkKabpY5MV5K2q9YmOSTKfiV8KuUBs_6kMg,7258
 datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
 datachain/lib/utils.py,sha256=YQKzuW096SGe7QwHwdyS47k_9l2Rh73b-wBqt1-niw4,213
@@ -68,7 +67,7 @@ datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRF
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=c0ZoNEjAMmn0BdSnRm8XRWEsbaMH3xa_jd6FBJQDY1o,64576
+datachain/query/dataset.py,sha256=QYrtZApS8djybkuDfGO0tt8O6sCBlmkg9TE__R4eM-I,64475
 datachain/query/dispatch.py,sha256=fEk1qalxAb5JJhN-iq0Mg9MyWve4XoN1Q7uvrX4mJY4,13106
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -90,12 +89,12 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=DsyY6ZMAUqmZVRSla-BJLsLYNsIgLOh4XLR3yvYJUbE,505
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=XVxn4pB-N4pPfiby5uVvfH7feNzRKlBNzsc5eyKPvhI,10965
+datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
 datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
-datachain/sql/sqlite/vector.py,sha256=stBeEW6fbVbILmAtV4khjXdJIGT13HkRWJeCoqIOk50,315
-datachain-0.2.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.0.dist-info/METADATA,sha256=iMX8hWEMXu-4MtXlD_SVwW3ija6bOLqSbeQvHoiMNfQ,14344
-datachain-0.2.0.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
-datachain-0.2.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.0.dist-info/RECORD,,
+datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
+datachain-0.2.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.1.dist-info/METADATA,sha256=kgX6auIOqU0DtW6dRyGWs1TrlGYLf1kN_By0XFW3t0Q,14346
+datachain-0.2.1.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+datachain-0.2.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.1.dist-info/RECORD,,
datachain/_version.py DELETED
@@ -1,16 +0,0 @@
-# file generated by setuptools_scm
-# don't change, don't track in version control
-TYPE_CHECKING = False
-if TYPE_CHECKING:
-    from typing import Tuple, Union
-    VERSION_TUPLE = Tuple[Union[int, str], ...]
-else:
-    VERSION_TUPLE = object
-
-version: str
-__version__: str
-__version_tuple__: VERSION_TUPLE
-version_tuple: VERSION_TUPLE
-
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
- __version_tuple__ = version_tuple = (0, 2, 0)