datachain 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

@@ -1,4 +1,5 @@
 import ast
+import glob
 import io
 import json
 import logging
@@ -709,7 +710,12 @@ class Catalog:
 
         client_config = client_config or self.client_config
         client, path = self.parse_url(source, **client_config)
-        prefix = posixpath.dirname(path)
+        stem = os.path.basename(os.path.normpath(path))
+        prefix = (
+            posixpath.dirname(path)
+            if glob.has_magic(stem) or client.fs.isfile(source)
+            else path
+        )
         storage_dataset_name = Storage.dataset_name(
             client.uri, posixpath.join(prefix, "")
         )
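
The new prefix logic keeps a plain directory path as its own listing prefix and only falls back to the parent directory when the basename contains glob characters or the source is a single file. A minimal sketch of the same decision using only the standard library (the helper name and sample paths are hypothetical):

    import glob
    import os
    import posixpath

    def listing_prefix(path: str, is_file: bool) -> str:
        # glob patterns and single files are listed from their parent directory;
        # a plain directory path is used as the prefix directly
        stem = os.path.basename(os.path.normpath(path))
        return posixpath.dirname(path) if glob.has_magic(stem) or is_file else path

    assert listing_prefix("bucket/dir/*.json", is_file=False) == "bucket/dir"
    assert listing_prefix("bucket/dir", is_file=False) == "bucket/dir"
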
datachain/cli.py CHANGED
@@ -491,6 +491,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         type=int,
         help="Dataset version",
     )
+    show_parser.add_argument("--schema", action="store_true", help="Show schema")
     add_show_args(show_parser)
 
     query_parser = subp.add_parser(
@@ -816,10 +817,15 @@ def show(
     offset: int = 0,
     columns: Sequence[str] = (),
     no_collapse: bool = False,
+    schema: bool = False,
 ) -> None:
+    from datachain.lib.dc import DataChain
     from datachain.query import DatasetQuery
     from datachain.utils import show_records
 
+    dataset = catalog.get_dataset(name)
+    dataset_version = dataset.get_version(version or dataset.latest_version)
+
     query = (
         DatasetQuery(name=name, version=version, catalog=catalog)
         .select(*columns)
@@ -828,6 +834,10 @@
     )
     records = query.to_db_records()
     show_records(records, collapse_columns=not no_collapse)
+    if schema and dataset_version.feature_schema:
+        print("\nSchema:")
+        dc = DataChain(name=name, version=version, catalog=catalog)
+        dc.print_schema()
 
 
 def query(
@@ -1013,6 +1023,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
             offset=args.offset,
             columns=args.columns,
             no_collapse=args.no_collapse,
+            schema=args.schema,
         )
     elif args.command == "rm-dataset":
         rm_dataset(catalog, args.name, version=args.version, force=args.force)
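
The cli.py hunks above thread a new `--schema` flag through the `show` subcommand, printing the dataset's feature schema after the records. A hedged sketch of the equivalent Python-side calls, reusing the saved "response" dataset from the README example further below:

    from datachain.lib.dc import DataChain

    # load an already-saved dataset and print its auto-generated schema,
    # roughly what `datachain show response --schema` appends after the records
    chain = DataChain.from_dataset("response")
    chain.print_schema()
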
@@ -421,10 +421,6 @@ class AbstractMetastore(ABC, Serializable):
     ) -> None:
         """Set the status of the given job and dataset."""
 
-    @abstractmethod
-    def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-        """Returns the possibly stale jobs."""
-
 
 class AbstractDBMetastore(AbstractMetastore):
     """
@@ -19,8 +19,12 @@ from datachain.sql.types import Int, SQLType, UInt64
 if TYPE_CHECKING:
     from sqlalchemy import Engine
     from sqlalchemy.engine.interfaces import Dialect
-    from sqlalchemy.sql.base import Executable, ReadOnlyColumnCollection
-    from sqlalchemy.sql.elements import KeyedColumnElement
+    from sqlalchemy.sql.base import (
+        ColumnCollection,
+        Executable,
+        ReadOnlyColumnCollection,
+    )
+    from sqlalchemy.sql.elements import ColumnElement
 
 
 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -43,7 +47,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
 
 
 def convert_rows_custom_column_types(
-    columns: "ReadOnlyColumnCollection[str, KeyedColumnElement[Any]]",
+    columns: "ColumnCollection[str, ColumnElement[Any]]",
     rows: Iterator[tuple[Any, ...]],
     dialect: "Dialect",
 ):
@@ -496,9 +496,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
-    def get_possibly_stale_jobs(self) -> list[tuple[str, str, int]]:
-        raise NotImplementedError("get_possibly_stale_jobs not implemented for SQLite")
-
 
 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -594,7 +591,7 @@ class SQLiteWarehouse(AbstractWarehouse):
     ):
         rows = self.db.execute(select_query, **kwargs)
         yield from convert_rows_custom_column_types(
-            select_query.columns, rows, sqlite_dialect
+            select_query.selected_columns, rows, sqlite_dialect
         )
 
     def get_dataset_sources(
@@ -494,7 +494,7 @@ class AbstractWarehouse(ABC, Serializable):
         This gets nodes based on the provided query, and should be used sparingly,
         as it will be slow on any OLAP database systems.
         """
-        columns = [c.name for c in query.columns]
+        columns = [c.name for c in query.selected_columns]
         for row in self.db.execute(query):
             d = dict(zip(columns, row))
             yield Node(**d)
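
Several hunks in this release replace `query.columns` with `query.selected_columns`; on SQLAlchemy 2.x, `Select.selected_columns` is the supported way to inspect a SELECT's output columns without wrapping it in a subquery. A small self-contained sketch (table and column names are illustrative):

    import sqlalchemy as sa

    items = sa.table("items", sa.column("id"), sa.column("name"))
    query = sa.select(items.c.id, items.c.name)

    # names of the columns the SELECT will produce
    assert [c.name for c in query.selected_columns] == ["id", "name"]
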
@@ -912,29 +912,6 @@ class AbstractWarehouse(ABC, Serializable):
         for name in names:
             self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
 
-    def subtract_query(
-        self,
-        source_query: sa.sql.selectable.Select,
-        target_query: sa.sql.selectable.Select,
-    ) -> sa.sql.selectable.Select:
-        sq = source_query.alias("source_query")
-        tq = target_query.alias("target_query")
-
-        source_target_join = sa.join(
-            sq,
-            tq,
-            (sq.c.source == tq.c.source)
-            & (sq.c.parent == tq.c.parent)
-            & (sq.c.name == tq.c.name),
-            isouter=True,
-        )
-
-        return (
-            select(*sq.c)
-            .select_from(source_target_join)
-            .where((tq.c.name == None) | (tq.c.name == ""))  # noqa: E711
-        )
-
     def changed_query(
         self,
         source_query: sa.sql.selectable.Select,
@@ -48,10 +48,10 @@ def _flatten_fields_values(fields, obj: BaseModel):
         value = getattr(obj, name)
 
         if isinstance(value, list):
-            yield [
-                val.model_dump() if ModelStore.is_pydantic(type(val)) else val
-                for val in value
-            ]
+            if value and ModelStore.is_pydantic(type(value[0])):
+                yield [val.model_dump() for val in value]
+            else:
+                yield value
         elif isinstance(value, dict):
             yield {
                 key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
@@ -71,7 +71,10 @@ def values_to_tuples(  # noqa: C901, PLR0912
                 f"signal '{k}' has unsupported type '{typ.__name__}'."
                 f" Please use DataModel types: {DataTypeNames}",
             )
-        types_map[k] = typ
+        if typ is list:
+            types_map[k] = list[type(v[0][0])]  # type: ignore[misc]
+        else:
+            types_map[k] = typ
 
         if length < 0:
             length = len_
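
For bare `list` signals, values_to_tuples now derives a concrete element type from the first value via `list[type(v[0][0])]`. A short hedged sketch of that inference with hypothetical sample values:

    # v holds the values of one signal; each value is itself a list
    v = [[1, 2, 3], [4, 5]]
    typ = list
    if typ is list:
        inferred = list[type(v[0][0])]  # -> list[int]
    else:
        inferred = typ
    assert inferred == list[int]
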
datachain/lib/dc.py CHANGED
@@ -342,7 +342,7 @@ class DataChain(DatasetQuery):
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
-        object_name: str = "",
+        object_name: Optional[str] = "",
         model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
         meta_type: Optional[str] = "json",
@@ -364,12 +364,12 @@
             nrows : optional row limit for jsonl and JSON arrays
 
         Example:
-            infer JSON schema from data, reduce using JMESPATH, print schema
+            infer JSON schema from data, reduce using JMESPATH
             ```py
             chain = DataChain.from_json("gs://json", jmespath="key1.key2")
             ```
 
-            infer JSON schema from a particular path, print data model
+            infer JSON schema from a particular path
             ```py
             chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
             ```
@@ -384,7 +384,7 @@
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = "json"
+            object_name = meta_type
         chain = DataChain.from_storage(path=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
@@ -397,7 +397,67 @@
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+
+    @classmethod
+    def from_jsonl(
+        cls,
+        path,
+        type: Literal["binary", "text", "image"] = "text",
+        spec: Optional[DataType] = None,
+        schema_from: Optional[str] = "auto",
+        jmespath: Optional[str] = None,
+        object_name: Optional[str] = "",
+        model_name: Optional[str] = None,
+        show_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "jsonl",
+        nrows=None,
+        **kwargs,
+    ) -> "DataChain":
+        """Get data from JSON lines. It returns the chain itself.
+
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            spec : optional Data Model
+            schema_from : path to sample to infer spec (if schema not provided)
+            object_name : generated object column name
+            model_name : optional generated model name
+            show_schema : print auto-generated schema
+            jmespath : optional JMESPATH expression to reduce JSON
+            nrows : optional row limit for jsonl and JSON arrays
+
+        Example:
+            infer JSONl schema from data, limit parsing to 1 row
+            ```py
+            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
+            ```
+        """
+        if schema_from == "auto":
+            schema_from = path
+
+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = meta_type
+        chain = DataChain.from_storage(path=path, type=type, **kwargs)
+        signal_dict = {
+            object_name: read_meta(
+                schema_from=schema_from,
+                meta_type=meta_type,
+                spec=spec,
+                model_name=model_name,
+                show_schema=show_schema,
+                jmespath=jmespath,
+                nrows=nrows,
+            )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     @classmethod
     def datasets(
@@ -951,6 +1011,41 @@ class DataChain(DatasetQuery):
 
         return ds
 
+    def subtract(  # type: ignore[override]
+        self,
+        other: "DataChain",
+        on: Optional[Union[str, Sequence[str]]] = None,
+    ) -> "Self":
+        """Remove rows that appear in another chain.
+
+        Parameters:
+            other: chain whose rows will be removed from `self`
+            on: columns to consider for determining row equality. If unspecified,
+                defaults to all common columns between `self` and `other`.
+        """
+        if isinstance(on, str):
+            on = [on]
+        if on is None:
+            other_columns = set(other._effective_signals_schema.db_signals())
+            signals = [
+                c
+                for c in self._effective_signals_schema.db_signals()
+                if c in other_columns
+            ]
+            if not signals:
+                raise DataChainParamsError("subtract(): no common columns")
+        elif not isinstance(on, Sequence):
+            raise TypeError(
+                f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
+            )
+        elif not on:
+            raise DataChainParamsError(
+                "'on' cannot be empty",
+            )
+        else:
+            signals = self.signals_schema.resolve(*on).db_signals()
+        return super()._subtract(other, signals)
+
     @classmethod
     def from_values(
         cls,
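
The new `DataChain.subtract` resolves the `on` columns through the signals schema and delegates to `DatasetQuery._subtract`. A hedged usage sketch built on `from_values` (the column name and values are purely illustrative):

    from datachain.lib.dc import DataChain

    left = DataChain.from_values(key=["a", "b", "c"])
    right = DataChain.from_values(key=["b"])

    # rows of `left` whose "key" also appears in `right` are removed;
    # with on=None the match would use all common columns instead
    remaining = left.subtract(right, on="key")
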
datachain/lib/file.py CHANGED
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from fsspec.implementations.local import LocalFileSystem
 from PIL import Image
 from pydantic import Field, field_validator
 
@@ -20,7 +19,7 @@ from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
-from datachain.sql.types import JSON, Int, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
@@ -126,11 +125,13 @@ class File(DataModel):
         "source": String,
         "parent": String,
         "name": String,
+        "size": Int,
         "version": String,
         "etag": String,
-        "size": Int,
-        "vtype": String,
+        "is_latest": Boolean,
+        "last_modified": DateTime,
         "location": JSON,
+        "vtype": String,
     }
 
     _unique_id_keys: ClassVar[list[str]] = [
@@ -214,7 +215,7 @@ class File(DataModel):
         with self.open(mode="r") as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="wb") as f:
             f.write(self.read())
@@ -232,7 +233,7 @@ class File(DataModel):
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
-        self.write(dst)
+        self.save(dst)
 
     def _set_stream(
         self,
@@ -281,9 +282,8 @@ class File(DataModel):
     def get_path(self) -> str:
         """Returns file path."""
         path = unquote(self.get_uri())
-        fs = self.get_fs()
-        if isinstance(fs, LocalFileSystem):
-            # Drop file:// protocol
+        source = urlparse(self.source)
+        if source.scheme == "file":
             path = urlparse(path).path
             path = url2pathname(path)
         return path
@@ -298,13 +298,10 @@
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-            fs = self.get_fs()
-            if isinstance(fs, LocalFileSystem):
-                path = unquote(self.get_full_name())
-            else:
-                path = (
-                    Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
-                ).as_posix()
+            path = unquote(self.get_full_name())
+            source = urlparse(self.source)
+            if source.scheme and source.scheme != "file":
+                path = posixpath.join(source.netloc, path)
         elif placement == "checksum":
             raise NotImplementedError("Checksum placement not implemented yet")
         else:
@@ -330,7 +327,7 @@ class TextFile(File):
         with self.open() as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="w") as f:
             f.write(self.read_text())
@@ -344,7 +341,7 @@ class ImageFile(File):
         fobj = super().read()
         return Image.open(BytesIO(fobj))
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         self.read().save(destination)
 
@@ -360,21 +357,25 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
         source: str,
         parent: str,
         name: str,
+        size: int,
         version: str,
         etag: str,
-        size: int,
-        vtype: str,
+        is_latest: bool,
+        last_modified: datetime,
         location: Optional[Union[dict, list[dict]]],
+        vtype: str,
     ) -> file:  # type: ignore[valid-type]
         return file(
            source=source,
            parent=parent,
            name=name,
+           size=size,
            version=version,
            etag=etag,
-           size=size,
-           vtype=vtype,
+           is_latest=is_latest,
+           last_modified=last_modified,
           location=location,
+           vtype=vtype,
        )
 
    return get_file_type
@@ -11,9 +11,9 @@ from collections.abc import Iterator
 from typing import Any, Callable
 
 import jmespath as jsp
-from pydantic import ValidationError
+from pydantic import Field, ValidationError  # noqa: F401
 
-from datachain.lib.data_model import ModelStore  # noqa: F401
+from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import File
 
 
@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     except subprocess.CalledProcessError as e:
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
-    print("\n" + f"ModelStore.register({model_name})" + "\n")
+    print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
+    print("\n" + f"DataModel.register({model_name})" + "\n")
     print("\n" + f"spec={model_name}" + "\n")
     return model_output
 
@@ -147,7 +148,7 @@ def read_meta(  # noqa: C901
 
     def parse_data(
         file: File,
-        DataModel=spec,  # noqa: N803
+        data_model=spec,
         meta_type=meta_type,
         jmespath=jmespath,
         nrows=nrows,
@@ -155,7 +156,7 @@ def read_meta(  # noqa: C901
         def validator(json_object: dict) -> spec:
             json_string = json.dumps(json_object)
             try:
-                data_instance = DataModel.model_validate_json(json_string)
+                data_instance = data_model.model_validate_json(json_string)
                 yield data_instance
             except ValidationError as e:
                 print(f"Validation error occurred in file {file.name}:", e)
@@ -25,6 +25,7 @@ from typing import (
 
 import attrs
 import sqlalchemy
+import sqlalchemy as sa
 from attrs import frozen
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
 from sqlalchemy import Column
@@ -250,7 +251,7 @@ class DatasetDiffOperation(Step):
         self,
         source_query: Select,
         target_query: Select,
-    ) -> Select:
+    ) -> sa.Selectable:
         """
         Should return select query that calculates desired diff between dataset queries
         """
@@ -268,7 +269,7 @@
 
         columns = [
             c if isinstance(c, Column) else Column(c.name, c.type)
-            for c in source_query.columns
+            for c in source_query.selected_columns
         ]
         temp_table = self.catalog.warehouse.create_dataset_rows_table(
             temp_table_name,
@@ -292,23 +293,16 @@
 
 @frozen
 class Subtract(DatasetDiffOperation):
-    """
-    Calculates rows that are in a source query but are not in target query (diff)
-    This can be used to do delta updates (calculate UDF only on newly added rows)
-    Example:
-        >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
-        >>> ds_updated = (
-                DatasetQuery("gs://dvcx-datalakes/dogs-and-cats")
-                .filter(C.size > 1000) # we can also filter out source query
-                .subtract(ds)
-                .add_signals(calc_embeddings) # calculae embeddings only on new rows
-                .union(ds) # union with old dataset that's missing new rows
-                .save("dogs_cats_updated")
-            )
-    """
+    on: Sequence[str]
 
-    def query(self, source_query: Select, target_query: Select) -> Select:
-        return self.catalog.warehouse.subtract_query(source_query, target_query)
+    def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
+        sq = source_query.alias("source_query")
+        tq = target_query.alias("target_query")
+        where_clause = sa.and_(
+            getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
+            for col_name in self.on
+        )  # type: ignore[arg-type]
+        return sq.select().except_(sq.select().where(where_clause))
 
 
 @frozen
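
The rewritten Subtract step builds its anti-join from `IS NOT DISTINCT FROM` (NULL-safe equality) over the configured `on` columns plus an EXCEPT, instead of the old LEFT JOIN on hard-coded source/parent/name columns. A small SQLAlchemy sketch of the NULL-safe comparison it relies on (table names are illustrative):

    import sqlalchemy as sa

    left = sa.table("left", sa.column("name"))
    right = sa.table("right", sa.column("name"))

    # NULL compares equal to NULL here, unlike the plain `==` operator
    cond = left.c.name.is_not_distinct_from(right.c.name)
    print(cond)  # e.g. "left.name IS NOT DISTINCT FROM right.name" on supporting dialects
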
@@ -820,8 +814,16 @@ class SQLMutate(SQLClause):
     args: tuple[ColumnElement, ...]
 
     def apply_sql_clause(self, query: Select) -> Select:
-        subquery = query.subquery()
-        return sqlalchemy.select(*subquery.c, *self.args).select_from(subquery)
+        original_subquery = query.subquery()
+        # this is needed for new column to be used in clauses
+        # like ORDER BY, otherwise new column is not recognized
+        subquery = (
+            sqlalchemy.select(*original_subquery.c, *self.args)
+            .select_from(original_subquery)
+            .subquery()
+        )
+
+        return sqlalchemy.select(*subquery.c).select_from(subquery)
 
 
 @frozen
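
Wrapping the mutated SELECT in a second subquery makes the freshly added column an ordinary column of the outer statement, so later clauses such as ORDER BY can refer to it by name. A hedged sketch of the pattern (table and label names are illustrative):

    import sqlalchemy as sa

    items = sa.table("items", sa.column("price"))
    inner = sa.select(items.c.price, (items.c.price * 2).label("double")).subquery()

    # the outer SELECT sees "double" as a real column, so ordering by it works
    stmt = sa.select(inner.c.price, inner.c.double).order_by(inner.c.double)
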
@@ -1252,7 +1254,7 @@ class DatasetQuery:
     def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
         try:
             query = self.apply_steps().select()
-            selected_columns = [c.name for c in query.columns]
+            selected_columns = [c.name for c in query.selected_columns]
             yield ResultIter(
                 self.catalog.warehouse.dataset_rows_select(query, **kwargs),
                 selected_columns,
@@ -1556,8 +1558,12 @@ class DatasetQuery:
 
     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
+        return self._subtract(dq, on=["source", "parent", "name"])
+
+    @detach
+    def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
         query = self.clone()
-        query.steps.append(Subtract(dq, self.catalog))
+        query.steps.append(Subtract(dq, self.catalog, on=on))
         return query
 
     @detach
@@ -1676,7 +1682,7 @@ class DatasetQuery:
             f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
         )
 
-        cols = tuple(c.name for c in q.columns)
+        cols = tuple(c.name for c in q.selected_columns)
         insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
         self.catalog.warehouse.db.execute(insert_q, **kwargs)
         self.catalog.metastore.update_dataset_status(
@@ -5,8 +5,8 @@ from datetime import MAXYEAR, MINYEAR, datetime, timezone
 from types import MappingProxyType
 from typing import Callable, Optional
 
+import orjson
 import sqlalchemy as sa
-import ujson
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.elements import literal
@@ -149,7 +149,7 @@ def missing_vector_function(name, exc):
 
 
 def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
-    return ujson.dumps(string.split(sep, maxsplit))
+    return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")
 
 
 def register_user_defined_sql_functions() -> None:
@@ -274,7 +274,7 @@ def compile_euclidean_distance(element, compiler, **kwargs):
 
 
 def py_json_array_length(arr):
-    return len(ujson.loads(arr))
+    return len(orjson.loads(arr))
 
 
 def compile_array_length(element, compiler, **kwargs):
@@ -1,7 +1,6 @@
-import json
 import sqlite3
 
-import ujson
+import orjson
 from sqlalchemy import types
 
 from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -29,22 +28,15 @@ class Array(types.UserDefinedType):
 
 
 def adapt_array(arr):
-    return ujson.dumps(arr)
+    return orjson.dumps(arr).decode("utf-8")
 
 
 def convert_array(arr):
-    return ujson.loads(arr)
+    return orjson.loads(arr)
 
 
 def adapt_np_array(arr):
-    def _json_serialize(obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return obj
-
-    if np.issubdtype(arr.dtype, np.object_):
-        return json.dumps(arr.tolist(), default=_json_serialize)
-    return ujson.dumps(arr.tolist())
+    return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode("utf-8")
 
 
 def adapt_np_generic(val):
@@ -70,5 +62,5 @@ class SQLiteTypeConverter(TypeConverter):
 class SQLiteTypeReadConverter(TypeReadConverter):
     def array(self, value, item_type, dialect):
         if isinstance(value, str):
-            value = ujson.loads(value)
+            value = orjson.loads(value)
         return super().array(value, item_type, dialect)
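
The ujson-to-orjson migration explains the added `.decode("utf-8")` calls: `orjson.dumps` returns `bytes`, not `str`, and its `OPT_SERIALIZE_NUMPY` option replaces the hand-rolled NumPy fallback removed from `adapt_np_array`. A quick sketch of both behaviours:

    import numpy as np
    import orjson

    # orjson returns bytes, so decode where a str is required (e.g. for SQLite)
    assert orjson.dumps([1, 2, 3]) == b"[1,2,3]"
    text = orjson.dumps([1, 2, 3]).decode("utf-8")

    # NumPy arrays are serialized natively with OPT_SERIALIZE_NUMPY
    arr = np.array([[1, 2], [3, 4]])
    assert orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY) == b"[[1,2],[3,4]]"
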
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.12
+Version: 0.2.14
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -35,7 +35,7 @@ Requires-Dist: sqlalchemy >=2
 Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
 Requires-Dist: cloudpickle
-Requires-Dist: ujson >=5.9.0
+Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
@@ -45,9 +45,9 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
-Requires-Dist: types-ujson ; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
 Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
@@ -103,20 +103,18 @@ AI 🔗 DataChain
 DataChain is an open-source Python library for processing and curating unstructured
 data at scale.
 
-🤖 AI-Driven Data Curation: Use local ML models, LLM APIs calls to enrich your data.
+🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
 
-🚀 GenAI Dataset scale: Handle 10s of milions of files or file snippets.
+🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
 
-🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
+🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
 
 
-To ensure efficiency, Datachain supports parallel processing, parallel data
-downloads, and out-of-memory computing. It excels at optimizing batch operations.
-While most GenAI tools focus on online applications and realtime, DataChain is designed
-for offline data processing, data curation and ETL.
+Datachain supports parallel processing, parallel data
+downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
 
-The typical use cases are Computer Vision data curation, LLM analytics
-and validation.
+The typical use cases include Computer Vision data curation, LLM analytics,
+and validation of multimodal AI applications.
 
 
 .. code:: console
@@ -128,25 +126,25 @@ and validation.
 Quick Start
 -----------
 
-Basic evaluation
-================
+Data curation with a local model
+=================================
 
 We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in the example.
-These dialogs involve users looking for better wireless plans chatting with bot.
-Our goal is to identify successful dialogs.
+- 50 files total in this example.
+These dialogs involve users chatting with a bot while looking for better wireless plans.
+Our goal is to identify the successful dialogs.
 
-The data used in the examples is publicly available. Please feel free to run this code.
+The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
 
-First, we'll use a simple sentiment analysis model. Please install transformers.
+First, we'll show batch inference with a simple sentiment model using the `transformers` library:
 
 .. code:: shell
 
     pip install transformers
 
-The code below downloads files the cloud, applies function
-`is_positive_dialogue_ending()` to each. All files with a positive sentiment
-are copied to local directory `output/`.
+The code below downloads files the cloud, and applies a user-defined function
+to each one of them. All files with a positive sentiment
+detected are then copied to the local directory.
 
 .. code:: py
 
@@ -169,7 +167,7 @@ are copied to local directory `output/`.
     )
 
    positive_chain = chain.filter(Column("is_positive") == True)
-   positive_chain.export_files("./output1")
+   positive_chain.export_files("./output")
 
    print(f"{positive_chain.count()} files were exported")
 
@@ -185,11 +183,11 @@ are copied to local directory `output/`.
     13
 
 
-LLM judging LLMs dialogs
-==========================
+LLM judging chatbots
+=============================
 
-Finding good dialogs using an LLM can be more efficient. In this example,
-we use Mistral with a free API. Please install the package and get a free
+LLMs can work as efficient universal classifiers. In the example below,
+we employ a free API from Mistral to judge the chatbot performance. Please get a free
 Mistral API key at https://console.mistral.ai
 
 .. code:: shell
@@ -197,9 +195,7 @@ Mistral API key at https://console.mistral.ai
     $ pip install mistralai
     $ export MISTRAL_API_KEY=_your_key_
 
-Below is a similar code example, but this time using an LLM to evaluate the dialogs.
-Note, only 4 threads were used in this example `parallel=4` due to a limitation of
-the free LLM service.
+DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
 
 .. code:: py
 
@@ -231,7 +227,7 @@ the free LLM service.
     print(f"{successful_chain.count()} files were exported")
 
 
-With the current prompt, we found 31 files considered successful dialogs:
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
 
 .. code:: shell
 
@@ -245,11 +241,11 @@ With the current prompt, we found 31 files considered successful dialogs:
 Serializing Python-objects
 ==========================
 
-LLM responses contain valuable information for analytics, such as tokens used and the
-model. Preserving this information can be beneficial.
+LLM responses may contain valuable information for analytics such as the number of tokens used, or the
+model performance parameters.
 
-Instead of extracting this information from the Mistral data structure (class
-`ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
+Instead of extracting this information from the Mistral response data structure (class
+`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
 
 
 .. code:: py
@@ -297,21 +293,23 @@ Output:
     64.0% dialogs were successful
 
 
-Complex Python data structures
+Iterating over Python data structures
 =============================================
 
-In the previous examples, a few dataset were saved in the embedded database
-(`SQLite`_ in directory `.datachain`).
-These datasets are versioned, and can be accessed using
+In the previous examples, datasets were saved in the embedded database
+(`SQLite`_ in folder `.datachain` of the working directory).
+These datasets were automatically versioned, and can be accessed using
 `DataChain.from_dataset("dataset_name")`.
 
+Here is how to retrieve a saved dataset and iterate over the objects:
+
 .. code:: py
 
     chain = DataChain.from_dataset("response")
 
-    # Iterating one-by-one: out of memory
+    # Iterating one-by-one: support out-of-memory workflow
     for file, response in chain.limit(5).collect("file", "response"):
-        # You work with Python objects
+        # verify the collected Python objects
         assert isinstance(response, ChatCompletionResponse)
 
         status = response.choices[0].message.content[:7]
@@ -332,9 +330,8 @@ Output:
 Vectorized analytics over Python objects
 ========================================
 
-Some operations can be efficiently run inside the DB without deserializing Python objects.
-Let's calculate the cost of using LLM APIs in a vectorized way.
-Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
+Some operations can run inside the DB without deserialization.
+For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
 
 .. code:: py
 
@@ -406,6 +403,7 @@ Community and Support
 .. github-only
 .. _Contributor Guide: CONTRIBUTING.rst
 .. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
 .. _Getting Started: https://datachain.dvc.ai/
 .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
-datachain/cli.py,sha256=MSOID2t-kesk5Z80uoepN63rqvB7iZxaWYLqkiWehkQ,32628
+datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=u8tvWooIon9ju59q8-Re_iqflgbCB-JMZD8n2UC4iag,80397
+datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,20 +32,20 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
 datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=R1Jj8dOTAex8fjehewV2vUO4VhBSjj8JQI5mM3YhVEQ,54989
-datachain/data_storage/schema.py,sha256=hUykqT-As-__WffMdWTrSZwv9k5EYYowRke3OENQ3aY,8102
+datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
+datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=cIYobczfH72c4l-iMkxpkgcTuuvvT8Xi64iP7Zr3Skw,25084
-datachain/data_storage/warehouse.py,sha256=FedcsvkAphpi2tUnlcrxO4mYumiCQAcrB5XRAK9tfXQ,33288
+datachain/data_storage/sqlite.py,sha256=i4h8ZY15A2YNXd2PU5BZPoRaBqqs9lOdPtBjC0BZy3s,24935
+datachain/data_storage/warehouse.py,sha256=fQO6UZc2MFgFPRnpCQW7c1GCl3FJBYE4dtA_ZXWuA8M,32627
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
-datachain/lib/file.py,sha256=xiLHaqyl4rqcBLGD62YD3aBIAOmX4EBVucxIncpRi80,11916
+datachain/lib/dc.py,sha256=I3BLJJK17kB8velBSCTjtoR8CcPZOHPgFTibS9OclmY,54155
+datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
+datachain/lib/meta_formats.py,sha256=WRjUzaBKo0IJFHhKz7dxzAKXjR4OvuzsLjkdjyewL6Q,7001
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
@@ -58,15 +58,15 @@ datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
+datachain/lib/convert/flatten.py,sha256=vrj2Kg-I1YAq2OGAFIwFUqtIesGpweve3c1ipeFOvDQ,1615
 datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
 datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
-datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYgfJ6A2i7l_6Jo,3592
+datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=m0bDQK_xXB85KPdJpH3OHdW6WJd1_PMgi01GRcWiiSg,61280
+datachain/query/dataset.py,sha256=VhsbHTOps-E4_trLzkJWGQV3zblN6LdlyHED9-3H5Vo,61388
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -88,13 +88,13 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
-datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
+datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,12067
+datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.12.dist-info/METADATA,sha256=QfDhY5jkblcb94A5CxT-ELhDcwDzZq1ju4cPQXHDEkY,14333
-datachain-0.2.12.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-datachain-0.2.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.12.dist-info/RECORD,,
+datachain-0.2.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.14.dist-info/METADATA,sha256=UiBiVmF8nF2aIimMNPn3XB14OhIbRj0w4w5q72qTaRM,14577
+datachain-0.2.14.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+datachain-0.2.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.14.dist-info/RECORD,,