datachain 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

datachain/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from datachain.lib import func
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
  from datachain.lib.dc import C, Column, DataChain, Sys
  from datachain.lib.file import (
@@ -34,6 +35,7 @@ __all__ = [
  "Sys",
  "TarVFile",
  "TextFile",
+ "func",
  "is_chain_type",
  "metrics",
  "param",
datachain/catalog/catalog.py CHANGED
@@ -989,13 +989,6 @@ class Catalog:
  c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
  }

- job_id = job_id or os.getenv("DATACHAIN_JOB_ID")
- if not job_id:
- from datachain.query.session import Session
-
- session = Session.get(catalog=self)
- job_id = session.job_id
-
  dataset = self.metastore.create_dataset_version(
  dataset,
  version,
@@ -1218,6 +1211,7 @@ class Catalog:
  preview=dataset_version.preview,
  job_id=dataset_version.job_id,
  )
+
  # to avoid re-creating rows table, we are just renaming it for a new version
  # of target dataset
  self.warehouse.rename_dataset_table(
@@ -1325,8 +1319,6 @@ class Catalog:
  if offset:
  q = q.offset(offset)

- q = q.order_by("sys__id")
-
  return q.to_db_records()

  def signed_url(self, source: str, path: str, client_config=None) -> str:
datachain/data_storage/sqlite.py CHANGED
@@ -763,6 +763,14 @@ class SQLiteWarehouse(AbstractWarehouse):
  query: Select,
  progress_cb: Optional[Callable[[int], None]] = None,
  ) -> None:
+ if len(query._group_by_clause) > 0:
+ select_q = query.with_only_columns(
+ *[c for c in query.selected_columns if c.name != "sys__id"]
+ )
+ q = table.insert().from_select(list(select_q.selected_columns), select_q)
+ self.db.execute(q)
+ return
+
  if "sys__id" in query.selected_columns:
  col_id = query.selected_columns.sys__id
  else:
datachain/data_storage/warehouse.py CHANGED
@@ -215,10 +215,6 @@ class AbstractWarehouse(ABC, Serializable):
  limit = query._limit
  paginated_query = query.limit(page_size)

- if not paginated_query._order_by_clauses:
- # default order by is order by `sys__id`
- paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
-
  results = None
  offset = 0
  num_yielded = 0
datachain/lib/convert/sql_to_python.py CHANGED
@@ -4,15 +4,11 @@ from typing import Any
  from sqlalchemy import ColumnElement


- def sql_to_python(args_map: dict[str, ColumnElement]) -> dict[str, Any]:
- res = {}
- for name, sql_exp in args_map.items():
- try:
- type_ = sql_exp.type.python_type
- if type_ == Decimal:
- type_ = float
- except NotImplementedError:
- type_ = str
- res[name] = type_
-
- return res
+ def sql_to_python(sql_exp: ColumnElement) -> Any:
+ try:
+ type_ = sql_exp.type.python_type
+ if type_ == Decimal:
+ type_ = float
+ except NotImplementedError:
+ type_ = str
+ return type_
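Note: the helper now maps a single SQLAlchemy expression to a Python type instead of converting a whole name-to-expression mapping. A minimal sketch of the new calling convention (illustrative values, not taken from the diff):

```py
import sqlalchemy as sa

from datachain.lib.convert.sql_to_python import sql_to_python

# Numeric expressions resolve to their python_type (Decimal collapses to float);
# types without a python_type fall back to str.
assert sql_to_python(sa.literal(3.5)) is float
assert sql_to_python(sa.literal("a")) is str
```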
datachain/lib/dc.py CHANGED
@@ -29,6 +29,7 @@ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import ArrowRow, File, get_file_type
  from datachain.lib.file import ExportPlacement as FileExportPlacement
+ from datachain.lib.func import Func
  from datachain.lib.listing import (
  is_listing_dataset,
  is_listing_expired,
@@ -42,21 +43,12 @@ from datachain.lib.meta_formats import read_meta, read_schema
  from datachain.lib.model_store import ModelStore
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
- from datachain.lib.udf import (
- Aggregator,
- BatchMapper,
- Generator,
- Mapper,
- UDFBase,
- )
+ from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
  from datachain.lib.udf_signature import UdfSignature
- from datachain.lib.utils import DataChainParamsError
+ from datachain.lib.utils import DataChainColumnError, DataChainParamsError
  from datachain.query import Session
- from datachain.query.dataset import (
- DatasetQuery,
- PartitionByType,
- )
- from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
+ from datachain.query.dataset import DatasetQuery, PartitionByType
+ from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
  from datachain.sql.functions import path as pathfunc
  from datachain.telemetry import telemetry
  from datachain.utils import batched_it, inside_notebook
@@ -149,11 +141,6 @@ class DatasetMergeError(DataChainParamsError): # noqa: D101
  super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")


- class DataChainColumnError(DataChainParamsError): # noqa: D101
- def __init__(self, col_name, msg): # noqa: D107
- super().__init__(f"Error for column {col_name}: {msg}")
-
-
  OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]


@@ -982,10 +969,9 @@ class DataChain:
  row is left in the result set.

  Example:
- ```py
- dc.distinct("file.parent", "file.name")
- )
- ```
+ ```py
+ dc.distinct("file.parent", "file.name")
+ ```
  """
  return self._evolve(
  query=self._query.distinct(
@@ -1011,6 +997,60 @@ class DataChain:
  query=self._query.select(*columns), signal_schema=new_schema
  )

+ def group_by(
+ self,
+ *,
+ partition_by: Union[str, Sequence[str]],
+ **kwargs: Func,
+ ) -> "Self":
+ """Group rows by specified set of signals and return new signals
+ with aggregated values.
+
+ Example:
+ ```py
+ chain = chain.group_by(
+ cnt=func.count(),
+ partition_by=("file_source", "file_ext"),
+ )
+ ```
+ """
+ if isinstance(partition_by, str):
+ partition_by = [partition_by]
+ if not partition_by:
+ raise ValueError("At least one column should be provided for partition_by")
+
+ if not kwargs:
+ raise ValueError("At least one column should be provided for group_by")
+ for col_name, func in kwargs.items():
+ if not isinstance(func, Func):
+ raise DataChainColumnError(
+ col_name,
+ f"Column {col_name} has type {type(func)} but expected Func object",
+ )
+
+ partition_by_columns: list[Column] = []
+ signal_columns: list[Column] = []
+ schema_fields: dict[str, DataType] = {}
+
+ # validate partition_by columns and add them to the schema
+ for col_name in partition_by:
+ col_db_name = ColumnMeta.to_db_name(col_name)
+ col_type = self.signals_schema.get_column_type(col_db_name)
+ col = Column(col_db_name, python_to_sql(col_type))
+ partition_by_columns.append(col)
+ schema_fields[col_db_name] = col_type
+
+ # validate signal columns and add them to the schema
+ for col_name, func in kwargs.items():
+ col = func.get_column(self.signals_schema, label=col_name)
+ signal_columns.append(col)
+ schema_fields[col_name] = func.get_result_type(self.signals_schema)
+
+ return self._evolve(
+ query=self._query.group_by(signal_columns, partition_by_columns),
+ signal_schema=SignalSchema(schema_fields),
+ )
+
  def mutate(self, **kwargs) -> "Self":
  """Create new signals based on existing signals.

@@ -1477,12 +1517,6 @@ class DataChain:
  fr_map = {col.lower(): df[col].tolist() for col in df.columns}

  for column in fr_map:
- if column in DatasetRow.schema:
- raise DatasetPrepareError(
- name,
- f"import from pandas error - column '{column}' conflicts with"
- " default schema",
- )
  if not column.isidentifier():
  raise DatasetPrepareError(
  name,
@@ -1994,6 +2028,8 @@ class DataChain:
  ),
  )

+ session.add_dataset_version(dsr, dsr.latest_version)
+
  if isinstance(to_insert, dict):
  to_insert = [to_insert]
  elif not to_insert:
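Taken together with the new `datachain.lib.func` helpers introduced below, the added `group_by` is used roughly as in the following sketch; the dataset name and the aggregated columns are illustrative assumptions, not part of the diff:

```py
from datachain import DataChain
from datachain.lib import func

chain = (
    DataChain.from_dataset("my_dataset")  # hypothetical dataset name
    .group_by(
        cnt=func.count(),                 # result type int
        total_size=func.sum("file.size"), # result type follows the column type
        partition_by="file.source",
    )
)
```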
datachain/lib/func/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from .aggregate import any_value, avg, collect, concat, count, max, min, sum
+ from .func import Func
+
+ __all__ = [
+ "Func",
+ "any_value",
+ "avg",
+ "collect",
+ "concat",
+ "count",
+ "max",
+ "min",
+ "sum",
+ ]
datachain/lib/func/aggregate.py ADDED
@@ -0,0 +1,42 @@
+ from typing import Optional
+
+ from sqlalchemy import func as sa_func
+
+ from datachain.sql import functions as dc_func
+
+ from .func import Func
+
+
+ def count(col: Optional[str] = None) -> Func:
+ return Func(inner=sa_func.count, col=col, result_type=int)
+
+
+ def sum(col: str) -> Func:
+ return Func(inner=sa_func.sum, col=col)
+
+
+ def avg(col: str) -> Func:
+ return Func(inner=dc_func.aggregate.avg, col=col)
+
+
+ def min(col: str) -> Func:
+ return Func(inner=sa_func.min, col=col)
+
+
+ def max(col: str) -> Func:
+ return Func(inner=sa_func.max, col=col)
+
+
+ def any_value(col: str) -> Func:
+ return Func(inner=dc_func.aggregate.any_value, col=col)
+
+
+ def collect(col: str) -> Func:
+ return Func(inner=dc_func.aggregate.collect, col=col, is_array=True)
+
+
+ def concat(col: str, separator="") -> Func:
+ def inner(arg):
+ return dc_func.aggregate.group_concat(arg, separator)
+
+ return Func(inner=inner, col=col, result_type=str)
datachain/lib/func/func.py ADDED
@@ -0,0 +1,64 @@
+ from typing import TYPE_CHECKING, Callable, Optional
+
+ from datachain.lib.convert.python_to_sql import python_to_sql
+ from datachain.lib.utils import DataChainColumnError
+ from datachain.query.schema import Column, ColumnMeta
+
+ if TYPE_CHECKING:
+ from datachain import DataType
+ from datachain.lib.signal_schema import SignalSchema
+
+
+ class Func:
+ def __init__(
+ self,
+ inner: Callable,
+ col: Optional[str] = None,
+ result_type: Optional["DataType"] = None,
+ is_array: bool = False,
+ ) -> None:
+ self.inner = inner
+ self.col = col
+ self.result_type = result_type
+ self.is_array = is_array
+
+ @property
+ def db_col(self) -> Optional[str]:
+ return ColumnMeta.to_db_name(self.col) if self.col else None
+
+ def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
+ if not self.db_col:
+ return None
+ col_type: type = signals_schema.get_column_type(self.db_col)
+ return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
+
+ def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
+ col_type = self.db_col_type(signals_schema)
+
+ if self.result_type:
+ return self.result_type
+
+ if col_type:
+ return col_type
+
+ raise DataChainColumnError(
+ str(self.inner),
+ "Column name is required to infer result type",
+ )
+
+ def get_column(
+ self, signals_schema: "SignalSchema", label: Optional[str] = None
+ ) -> Column:
+ if self.col:
+ if label == "collect":
+ print(label)
+ col_type = self.get_result_type(signals_schema)
+ col = Column(self.db_col, python_to_sql(col_type))
+ func_col = self.inner(col)
+ else:
+ func_col = self.inner()
+
+ if label:
+ func_col = func_col.label(label)
+
+ return func_col
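Following the same pattern as the `concat` factory above, any SQLAlchemy aggregate expression can be wrapped in a `Func`. This is only an illustrative sketch, not an API shipped in this release; the `longest` helper and its inner callable are hypothetical:

```py
from sqlalchemy import func as sa_func

from datachain.lib.func.func import Func


def longest(col: str) -> Func:
    # Length of the longest string in the group; result_type is given
    # explicitly because it differs from the column's own type.
    def inner(arg):
        return sa_func.max(sa_func.length(arg))

    return Func(inner=inner, col=col, result_type=int)
```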
datachain/lib/signal_schema.py CHANGED
@@ -400,6 +400,12 @@ class SignalSchema:
  if ModelStore.is_pydantic(finfo.annotation):
  SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)

+ def get_column_type(self, col_name: str) -> DataType:
+ for path, _type, has_subtree, _ in self.get_flat_tree():
+ if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
+ return _type
+ raise SignalResolvingError([col_name], "is not found")
+
  def db_signals(
  self, name: Optional[str] = None, as_columns=False
  ) -> Union[list[str], list[Column]]:
@@ -490,7 +496,7 @@ class SignalSchema:
  new_values[name] = args_map[name]
  else:
  # adding new signal
- new_values.update(sql_to_python({name: value}))
+ new_values[name] = sql_to_python(value)
  return SignalSchema(new_values)


@@ -534,12 +540,12 @@ class SignalSchema:
  for name, val in values.items()
  }

- def get_flat_tree(self) -> Iterator[tuple[list[str], type, bool, int]]:
+ def get_flat_tree(self) -> Iterator[tuple[list[str], DataType, bool, int]]:
  yield from self._get_flat_tree(self.tree, [], 0)

  def _get_flat_tree(
  self, tree: dict, prefix: list[str], depth: int
- ) -> Iterator[tuple[list[str], type, bool, int]]:
+ ) -> Iterator[tuple[list[str], DataType, bool, int]]:
  for name, (type_, substree) in tree.items():
  suffix = name.split(".")
  new_prefix = prefix + suffix
datachain/lib/utils.py CHANGED
@@ -23,3 +23,8 @@ class DataChainError(Exception):
  class DataChainParamsError(DataChainError):
  def __init__(self, message):
  super().__init__(message)
+
+
+ class DataChainColumnError(DataChainParamsError):
+ def __init__(self, col_name, msg):
+ super().__init__(f"Error for column {col_name}: {msg}")
datachain/query/__init__.py CHANGED
@@ -1,12 +1,11 @@
  from .dataset import DatasetQuery
  from .params import param
- from .schema import C, DatasetRow, LocalFilename, Object, Stream
+ from .schema import C, LocalFilename, Object, Stream
  from .session import Session

  __all__ = [
  "C",
  "DatasetQuery",
- "DatasetRow",
  "LocalFilename",
  "Object",
  "Session",
datachain/query/batch.py CHANGED
@@ -97,7 +97,6 @@ class Partition(BatchingStrategy):

  ordered_query = query.order_by(None).order_by(
  PARTITION_COLUMN_ID,
- "sys__id",
  *query._order_by_clauses,
  )

datachain/query/dataset.py CHANGED
@@ -591,10 +591,6 @@ class UDFSignal(UDFStep):
  return query, []
  table = self.catalog.warehouse.create_pre_udf_table(query)
  q: Select = sqlalchemy.select(*table.c)
- if query._order_by_clauses:
- # we are adding ordering only if it's explicitly added by user in
- # query part before adding signals
- q = q.order_by(table.c.sys__id)
  return q, [table]

  def create_result_query(
@@ -630,11 +626,6 @@ class UDFSignal(UDFStep):
  else:
  res = sqlalchemy.select(*cols1).select_from(subq)

- if query._order_by_clauses:
- # if ordering is used in query part before adding signals, we
- # will have it as order by id from select from pre-created udf table
- res = res.order_by(subq.c.sys__id)
-
  if self.partition_by is not None:
  subquery = res.subquery()
  res = sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -666,13 +657,6 @@ class RowGenerator(UDFStep):
  def create_result_query(
  self, udf_table, query: Select
  ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
- if not query._order_by_clauses:
- # if we are not selecting all rows in UDF, we need to ensure that
- # we get the same rows as we got as inputs of UDF since selecting
- # without ordering can be non deterministic in some databases
- c = query.selected_columns
- query = query.order_by(c.sys__id)
-
  udf_table_query = udf_table.select().subquery()
  udf_table_cols: list[sqlalchemy.Label[Any]] = [
  label(c.name, c) for c in udf_table_query.columns
@@ -957,24 +941,24 @@ class SQLJoin(Step):


  @frozen
- class GroupBy(Step):
- """Group rows by a specific column."""
-
- cols: PartitionByType
+ class SQLGroupBy(SQLClause):
+ cols: Sequence[Union[str, ColumnElement]]
+ group_by: Sequence[Union[str, ColumnElement]]

- def clone(self) -> "Self":
- return self.__class__(self.cols)
+ def apply_sql_clause(self, query) -> Select:
+ if not self.cols:
+ raise ValueError("No columns to select")
+ if not self.group_by:
+ raise ValueError("No columns to group by")

- def apply(
- self, query_generator: QueryGenerator, temp_tables: list[str]
- ) -> StepResult:
- query = query_generator.select()
- grouped_query = query.group_by(*self.cols)
+ subquery = query.subquery()

- def q(*columns):
- return grouped_query.with_only_columns(*columns)
+ cols = [
+ subquery.c[str(c)] if isinstance(c, (str, C)) else c
+ for c in [*self.group_by, *self.cols]
+ ]

- return step_result(q, grouped_query.selected_columns)
+ return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)


  def _validate_columns(
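For orientation, the new `SQLGroupBy.apply_sql_clause` wraps the preceding query as a subquery and re-selects the partition and aggregate columns from it; a rough SQLAlchemy sketch of the resulting shape, with illustrative column and table names only:

```py
import sqlalchemy as sa

# Stand-in for the query produced by the previous steps.
prev = sa.select(sa.column("file__source"), sa.column("file__size")).select_from(
    sa.table("rows")
)
subquery = prev.subquery()

# SELECT file__source, count(*) AS cnt FROM (<prev>) GROUP BY file__source
grouped = (
    sa.select(subquery.c.file__source, sa.func.count().label("cnt"))
    .select_from(subquery)
    .group_by(subquery.c.file__source)
)
```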
@@ -1130,25 +1114,14 @@ class DatasetQuery:
  query.steps = query.steps[-1:] + query.steps[:-1]

  result = query.starting_step.apply()
- group_by = None
  self.dependencies.update(result.dependencies)

  for step in query.steps:
- if isinstance(step, GroupBy):
- if group_by is not None:
- raise TypeError("only one group_by allowed")
- group_by = step
- continue
-
  result = step.apply(
  result.query_generator, self.temp_table_names
  ) # a chain of steps linked by results
  self.dependencies.update(result.dependencies)

- if group_by:
- result = group_by.apply(result.query_generator, self.temp_table_names)
- self.dependencies.update(result.dependencies)
-
  return result.query_generator

  @staticmethod
@@ -1410,9 +1383,13 @@ class DatasetQuery:
  return query.as_scalar()

  @detach
- def group_by(self, *cols: ColumnElement) -> "Self":
+ def group_by(
+ self,
+ cols: Sequence[ColumnElement],
+ group_by: Sequence[ColumnElement],
+ ) -> "Self":
  query = self.clone()
- query.steps.append(GroupBy(cols))
+ query.steps.append(SQLGroupBy(cols, group_by))
  return query

  @detach
@@ -1591,6 +1568,8 @@ class DatasetQuery:
  )
  version = version or dataset.latest_version

+ self.session.add_dataset_version(dataset=dataset, version=version)
+
  dr = self.catalog.warehouse.dataset_rows(dataset)

  self.catalog.warehouse.copy_table(dr.get_table(), query.select())
datachain/query/schema.py CHANGED
@@ -1,16 +1,13 @@
  import functools
- import json
  from abc import ABC, abstractmethod
- from datetime import datetime, timezone
  from fnmatch import fnmatch
- from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union

  import attrs
  import sqlalchemy as sa
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback

  from datachain.lib.file import File
- from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String

  if TYPE_CHECKING:
  from datachain.catalog import Catalog
@@ -228,61 +225,4 @@ def normalize_param(param: UDFParamSpec) -> UDFParameter:
  raise TypeError(f"Invalid UDF parameter: {param}")


- class DatasetRow:
- schema: ClassVar[dict[str, type[SQLType]]] = {
- "source": String,
- "path": String,
- "size": Int64,
- "location": JSON,
- "is_latest": Boolean,
- "last_modified": DateTime,
- "version": String,
- "etag": String,
- }
-
- @staticmethod
- def create(
- path: str,
- source: str = "",
- size: int = 0,
- location: Optional[dict[str, Any]] = None,
- is_latest: bool = True,
- last_modified: Optional[datetime] = None,
- version: str = "",
- etag: str = "",
- ) -> tuple[
- str,
- str,
- int,
- Optional[str],
- int,
- bool,
- datetime,
- str,
- str,
- int,
- ]:
- if location:
- location = json.dumps([location]) # type: ignore [assignment]
-
- last_modified = last_modified or datetime.now(timezone.utc)
-
- return ( # type: ignore [return-value]
- source,
- path,
- size,
- location,
- is_latest,
- last_modified,
- version,
- etag,
- )
-
- @staticmethod
- def extend(**columns):
- cols = {**DatasetRow.schema}
- cols.update(columns)
- return cols
-
-
  C = Column
datachain/query/session.py CHANGED
@@ -1,9 +1,9 @@
  import atexit
+ import gc
  import logging
- import os
  import re
  import sys
- from typing import TYPE_CHECKING, Optional
+ from typing import TYPE_CHECKING, ClassVar, Optional
  from uuid import uuid4

  from datachain.catalog import get_catalog
@@ -11,6 +11,7 @@ from datachain.error import TableMissingError

  if TYPE_CHECKING:
  from datachain.catalog import Catalog
+ from datachain.dataset import DatasetRecord

  logger = logging.getLogger("datachain")

@@ -39,7 +40,7 @@ class Session:
  """

  GLOBAL_SESSION_CTX: Optional["Session"] = None
- GLOBAL_SESSION: Optional["Session"] = None
+ SESSION_CONTEXTS: ClassVar[list["Session"]] = []
  ORIGINAL_EXCEPT_HOOK = None

  DATASET_PREFIX = "session_"
@@ -64,18 +65,21 @@ class Session:

  session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
  self.name = f"{name}_{session_uuid}"
- self.job_id = os.getenv("DATACHAIN_JOB_ID") or str(uuid4())
  self.is_new_catalog = not catalog
  self.catalog = catalog or get_catalog(
  client_config=client_config, in_memory=in_memory
  )
+ self.dataset_versions: list[tuple[DatasetRecord, int]] = []

  def __enter__(self):
+ # Push the current context onto the stack
+ Session.SESSION_CONTEXTS.append(self)
+
  return self

  def __exit__(self, exc_type, exc_val, exc_tb):
  if exc_type:
- self._cleanup_created_versions(self.name)
+ self._cleanup_created_versions()

  self._cleanup_temp_datasets()
  if self.is_new_catalog:
@@ -83,6 +87,12 @@ class Session:
  self.catalog.warehouse.close_on_exit()
  self.catalog.id_generator.close_on_exit()

+ if Session.SESSION_CONTEXTS:
+ Session.SESSION_CONTEXTS.pop()
+
+ def add_dataset_version(self, dataset: "DatasetRecord", version: int) -> None:
+ self.dataset_versions.append((dataset, version))
+
  def generate_temp_dataset_name(self) -> str:
  return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]

@@ -98,21 +108,15 @@ class Session:
  except TableMissingError:
  pass

- def _cleanup_created_versions(self, job_id: str) -> None:
- versions = self.catalog.metastore.get_job_dataset_versions(job_id)
- if not versions:
+ def _cleanup_created_versions(self) -> None:
+ if not self.dataset_versions:
  return

- datasets = {}
- for dataset_name, version in versions:
- if dataset_name not in datasets:
- datasets[dataset_name] = self.catalog.get_dataset(dataset_name)
- dataset = datasets[dataset_name]
- logger.info(
- "Removing dataset version %s@%s due to exception", dataset_name, version
- )
+ for dataset, version in self.dataset_versions:
  self.catalog.remove_dataset_version(dataset, version)

+ self.dataset_versions.clear()
+
  @classmethod
  def get(
  cls,
@@ -125,33 +129,34 @@ class Session:

  Parameters:
  session (Session): Optional Session(). If not provided a new session will
- be created. It's needed mostly for simplie API purposes.
- catalog (Catalog): Optional catalog. By default a new catalog is created.
+ be created. It's needed mostly for simple API purposes.
+ catalog (Catalog): Optional catalog. By default, a new catalog is created.
  """
  if session:
  return session

- if cls.GLOBAL_SESSION is None:
+ # Access the active (most recent) context from the stack
+ if cls.SESSION_CONTEXTS:
+ return cls.SESSION_CONTEXTS[-1]
+
+ if cls.GLOBAL_SESSION_CTX is None:
  cls.GLOBAL_SESSION_CTX = Session(
  cls.GLOBAL_SESSION_NAME,
  catalog,
  client_config=client_config,
  in_memory=in_memory,
  )
- cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()

  atexit.register(cls._global_cleanup)
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
  sys.excepthook = cls.except_hook

- return cls.GLOBAL_SESSION
+ return cls.GLOBAL_SESSION_CTX

  @staticmethod
  def except_hook(exc_type, exc_value, exc_traceback):
+ Session.GLOBAL_SESSION_CTX.__exit__(exc_type, exc_value, exc_traceback)
  Session._global_cleanup()
- if Session.GLOBAL_SESSION_CTX is not None:
- job_id = Session.GLOBAL_SESSION_CTX.job_id
- Session.GLOBAL_SESSION_CTX._cleanup_created_versions(job_id)

  if Session.ORIGINAL_EXCEPT_HOOK:
  Session.ORIGINAL_EXCEPT_HOOK(exc_type, exc_value, exc_traceback)
@@ -160,7 +165,6 @@ class Session:
  def cleanup_for_tests(cls):
  if cls.GLOBAL_SESSION_CTX is not None:
  cls.GLOBAL_SESSION_CTX.__exit__(None, None, None)
- cls.GLOBAL_SESSION = None
  cls.GLOBAL_SESSION_CTX = None
  atexit.unregister(cls._global_cleanup)

@@ -171,3 +175,7 @@ class Session:
  def _global_cleanup():
  if Session.GLOBAL_SESSION_CTX is not None:
  Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
+
+ for obj in gc.get_objects(): # Get all tracked objects
+ if isinstance(obj, Session): # Cleanup temp dataset for session variables.
+ obj.__exit__(None, None, None)
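The stack-based context handling above implies the following behavior for nested sessions; a minimal sketch, assuming default constructor arguments and no pre-existing global session:

```py
from datachain.query.session import Session

with Session("outer") as outer:
    with Session("inner") as inner:
        # Session.get() now resolves to the innermost active context
        assert Session.get() is inner
    # after the inner context exits, the outer one becomes active again
    assert Session.get() is outer
```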
datachain/sql/functions/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from sqlalchemy.sql.expression import func

  from . import array, path, string
- from .array import avg
+ from .aggregate import avg
  from .conditional import greatest, least
  from .random import rand

datachain/sql/functions/aggregate.py ADDED
@@ -0,0 +1,47 @@
+ from sqlalchemy.sql.functions import GenericFunction, ReturnTypeFromArgs
+
+ from datachain.sql.types import Float, String
+ from datachain.sql.utils import compiler_not_implemented
+
+
+ class avg(GenericFunction): # noqa: N801
+ """
+ Returns the average of the column.
+ """
+
+ type = Float()
+ package = "array"
+ name = "avg"
+ inherit_cache = True
+
+
+ class group_concat(GenericFunction): # noqa: N801
+ """
+ Returns the concatenated string of the column.
+ """
+
+ type = String()
+ package = "array"
+ name = "group_concat"
+ inherit_cache = True
+
+
+ class any_value(ReturnTypeFromArgs): # noqa: N801
+ """
+ Returns first value of the column.
+ """
+
+ inherit_cache = True
+
+
+ class collect(ReturnTypeFromArgs): # noqa: N801
+ """
+ Returns an array of the column.
+ """
+
+ inherit_cache = True
+
+
+ compiler_not_implemented(avg)
+ compiler_not_implemented(group_concat)
+ compiler_not_implemented(any_value)
datachain/sql/functions/array.py CHANGED
@@ -44,15 +44,7 @@ class sip_hash_64(GenericFunction): # noqa: N801
  inherit_cache = True


- class avg(GenericFunction): # noqa: N801
- type = Float()
- package = "array"
- name = "avg"
- inherit_cache = True
-
-
  compiler_not_implemented(cosine_distance)
  compiler_not_implemented(euclidean_distance)
  compiler_not_implemented(length)
  compiler_not_implemented(sip_hash_64)
- compiler_not_implemented(avg)
datachain/sql/sqlite/base.py CHANGED
@@ -14,7 +14,7 @@ from sqlalchemy.sql.elements import literal
  from sqlalchemy.sql.expression import case
  from sqlalchemy.sql.functions import func

- from datachain.sql.functions import array, conditional, random, string
+ from datachain.sql.functions import aggregate, array, conditional, random, string
  from datachain.sql.functions import path as sql_path
  from datachain.sql.selectable import Values, base_values_compiler
  from datachain.sql.sqlite.types import (
@@ -84,7 +84,10 @@ def setup():
  compiles(conditional.least, "sqlite")(compile_least)
  compiles(Values, "sqlite")(compile_values)
  compiles(random.rand, "sqlite")(compile_rand)
- compiles(array.avg, "sqlite")(compile_avg)
+ compiles(aggregate.avg, "sqlite")(compile_avg)
+ compiles(aggregate.group_concat, "sqlite")(compile_group_concat)
+ compiles(aggregate.any_value, "sqlite")(compile_any_value)
+ compiles(aggregate.collect, "sqlite")(compile_collect)

  if load_usearch_extension(sqlite3.connect(":memory:")):
  compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -400,6 +403,21 @@ def compile_avg(element, compiler, **kwargs):
  return compiler.process(func.avg(*element.clauses.clauses), **kwargs)


+ def compile_group_concat(element, compiler, **kwargs):
+ return compiler.process(func.aggregate_strings(*element.clauses.clauses), **kwargs)
+
+
+ def compile_any_value(element, compiler, **kwargs):
+ # use bare column to return any value from the group,
+ # this is documented behavior for sqlite,
+ # see https://www.sqlite.org/lang_select.html#bare_columns_in_an_aggregate_query
+ return compiler.process(*element.clauses.clauses, **kwargs)
+
+
+ def compile_collect(element, compiler, **kwargs):
+ return compiler.process(func.json_group_array(*element.clauses.clauses), **kwargs)
+
+
  def load_usearch_extension(conn) -> bool:
  try:
  # usearch is part of the vector optional dependencies
datachain-0.6.0.dist-info/METADATA → datachain-0.6.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.6.0
+ Version: 0.6.1
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -46,7 +46,7 @@ Requires-Dist: iterative-telemetry >=0.0.9
  Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
  Provides-Extra: dev
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
- Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
+ Requires-Dist: mypy ==1.12.0 ; extra == 'dev'
  Requires-Dist: types-python-dateutil ; extra == 'dev'
  Requires-Dist: types-pytz ; extra == 'dev'
  Requires-Dist: types-PyYAML ; extra == 'dev'
datachain-0.6.0.dist-info/RECORD → datachain-0.6.1.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
- datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
+ datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
  datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
- datachain/catalog/catalog.py,sha256=BsMyk2RQibQYHgrmovFZeSEpPVMTwgb_7ntVYdc7t-E,64090
+ datachain/catalog/catalog.py,sha256=r5lkwwZDh8cETNniBdzPCY9Ix8G-1RdkehjvUe3d2nE,63834
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -36,14 +36,14 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
  datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=fW08P7AbJ0cDbTbcTKuAGpvMXvBjg-QkGsKT_Dslyws,28383
- datachain/data_storage/warehouse.py,sha256=fXhVfao3NfWFGbbG5uJ-Ga4bX1FiKVfcbDyQgECYfk8,32122
+ datachain/data_storage/sqlite.py,sha256=V8fGRPjSwIT7kdw1qyQfUfdqGjXB8dE68npkyXfKW0o,28702
+ datachain/data_storage/warehouse.py,sha256=Ea0wVcWxe7Bu-8V8eqrPJ8Ov5-DT1dvv1MgxMINettc,31931
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/arrow.py,sha256=0R2CYsN82nNa5_03iS6jVix9EKeeqNZNAMgpSQP2hfo,9482
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
  datachain/lib/data_model.py,sha256=ECTbvlnzM98hp2mZ4fo82Yi0-MuoqTIQasQKGIyd89I,2040
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
- datachain/lib/dc.py,sha256=XmAFU9k79wUHIh0gYab8j-wF4vIlyW6opJcOy8fmoVc,76666
+ datachain/lib/dc.py,sha256=wEqBDCENfBmeow0-uu8R4qJhQa8taEIzveUiNdr2CyY,78341
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
  datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
@@ -53,30 +53,33 @@ datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
  datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
- datachain/lib/signal_schema.py,sha256=gj45dRQuOsKDmaKaJxb5j63HYVGw-Ks1fyAS1FpyOWA,24145
+ datachain/lib/signal_schema.py,sha256=6fgQIZz4jFvuiaL1mqK5Cq6yr4WC57o2ptHxk36MRNY,24438
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
  datachain/lib/udf.py,sha256=GvhWLCXZUY7sz1QMRBj1AJDSzzhyj15xs3Ia9hjJrJE,12697
  datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
- datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
+ datachain/lib/utils.py,sha256=12elAX6eTFgMGKIf2UfZ4IW07kRwjK6wz8yGE41RtNM,618
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
  datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
  datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
- datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
+ datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
  datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9YH_pOCpHSkdc,3921
- datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
- datachain/query/batch.py,sha256=3QlwshhpUc1amZRtXWVXEEuq47hEQgQlY0Ji48DR6hg,3508
- datachain/query/dataset.py,sha256=MF_E7yjbFQV6NcP4gKbJFXiWuoQkpQ7-Jmxa59FxenE,53630
+ datachain/lib/func/__init__.py,sha256=ucJ15J_Q5Hy--boKV-tPuhKagVD3NpnuUPhLtDp7doI,230
+ datachain/lib/func/aggregate.py,sha256=B5VV6WoSYYiO_9uN4_nXPMkF9OOkgyE6suJ7XD-JiPI,938
+ datachain/lib/func/func.py,sha256=kFhVZlWZzgAfM7-DpkpZWf5zzdEutp_3NxIFWxXww_I,1956
+ datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
+ datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
+ datachain/query/dataset.py,sha256=-J8t8XGUQveh-4aM5HrnbYx9xLfMQ8p6P9sKmBaTpLU,52683
  datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
- datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
- datachain/query/session.py,sha256=kpFFJMfWBnxaMPojMGhJRbk-BOsSYI8Ckl6vvqnx7d0,5787
+ datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
+ datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -85,20 +88,21 @@ datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
- datachain/sql/functions/__init__.py,sha256=Ioyy7nSetrTLVnHGcGcmZU99HxUFcx-5PFbrh2dPNH0,396
- datachain/sql/functions/array.py,sha256=EB7nJSncUc1PuxlHyzU2gVhF8DuXaxpGlxb5e8X2KFY,1297
+ datachain/sql/functions/__init__.py,sha256=-vIkU0AqwOW5FX6P89xYl-uBIUdt46CEnCtshmN85gM,400
+ datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
+ datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
  datachain/sql/functions/string.py,sha256=DYgiw8XSk7ge7GXvyRI1zbaMruIizNeI-puOjriQGZQ,1148
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
- datachain/sql/sqlite/base.py,sha256=3gDMLKSWkxnbiZ1dykYa5VuHSSlg5sLY9ihMqcH_o1M,13578
+ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,14375
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.6.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.6.0.dist-info/METADATA,sha256=4nxP9eUg6o9ymkwy-hz4DsqRM5IBtqhInNE7vsE0lxY,17156
- datachain-0.6.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- datachain-0.6.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.6.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.6.0.dist-info/RECORD,,
+ datachain-0.6.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.6.1.dist-info/METADATA,sha256=kOEDXkaNjPHB-A1fLt60s_EJvnjuLIU3xdfp5UhflUA,17156
+ datachain-0.6.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ datachain-0.6.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.6.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.6.1.dist-info/RECORD,,