datachain 0.25.1__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



datachain/__init__.py CHANGED
@@ -21,6 +21,9 @@ from datachain.lib.dc import (
 )
 from datachain.lib.file import (
     ArrowRow,
+    Audio,
+    AudioFile,
+    AudioFragment,
     File,
     FileError,
     Image,
@@ -43,6 +46,9 @@ __all__ = [
     "AbstractUDF",
     "Aggregator",
     "ArrowRow",
+    "Audio",
+    "AudioFile",
+    "AudioFragment",
     "C",
     "Column",
     "DataChain",
datachain/lib/audio.py ADDED
@@ -0,0 +1,151 @@
+import posixpath
+from typing import TYPE_CHECKING, Optional, Union
+
+from datachain.lib.file import FileError
+
+if TYPE_CHECKING:
+    from numpy import ndarray
+
+    from datachain.lib.file import Audio, AudioFile, File
+
+try:
+    import torchaudio
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for processing audio.\n"
+        "To install run:\n\n"
+        " pip install 'datachain[audio]'\n"
+    ) from exc
+
+
+def audio_info(file: "Union[File, AudioFile]") -> "Audio":
+    """Extract metadata like sample rate, channels, duration, and format."""
+    from datachain.lib.file import Audio
+
+    file = file.as_audio_file()
+
+    try:
+        with file.open() as f:
+            info = torchaudio.info(f)
+
+        sample_rate = int(info.sample_rate)
+        channels = int(info.num_channels)
+        frames = int(info.num_frames)
+        duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
+
+        # Get format information
+        format_name = getattr(info, "format", "")
+        codec_name = getattr(info, "encoding", "")
+        bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+
+    except Exception as exc:
+        raise FileError(
+            "unable to extract metadata from audio file", file.source, file.path
+        ) from exc
+
+    return Audio(
+        sample_rate=sample_rate,
+        channels=channels,
+        duration=duration,
+        samples=frames,
+        format=format_name,
+        codec=codec_name,
+        bit_rate=bit_rate,
+    )
+
+
+def audio_fragment_np(
+    audio: "AudioFile", start: float = 0, duration: Optional[float] = None
+) -> "tuple[ndarray, int]":
+    """Load audio fragment as numpy array.
+    Multi-channel audio is transposed to (samples, channels)."""
+    if start < 0:
+        raise ValueError("start must be a non-negative float")
+
+    if duration is not None and duration <= 0:
+        raise ValueError("duration must be a positive float")
+
+    if hasattr(audio, "as_audio_file"):
+        audio = audio.as_audio_file()
+
+    try:
+        with audio.open() as f:
+            info = torchaudio.info(f)
+            sample_rate = info.sample_rate
+
+            frame_offset = int(start * sample_rate)
+            num_frames = int(duration * sample_rate) if duration is not None else -1
+
+            # Reset file pointer to the beginning
+            # This is important to ensure we read from the correct position later
+            f.seek(0)
+
+            waveform, sr = torchaudio.load(
+                f, frame_offset=frame_offset, num_frames=num_frames
+            )
+
+        audio_np = waveform.numpy()
+
+        if audio_np.shape[0] > 1:
+            audio_np = audio_np.T
+        else:
+            audio_np = audio_np.squeeze()
+
+        return audio_np, int(sr)
+    except Exception as exc:
+        raise FileError(
+            "unable to read audio fragment", audio.source, audio.path
+        ) from exc
+
+
+def audio_fragment_bytes(
+    audio: "AudioFile",
+    start: float = 0,
+    duration: Optional[float] = None,
+    format: str = "wav",
+) -> bytes:
+    """Convert audio fragment to bytes using soundfile."""
+    y, sr = audio_fragment_np(audio, start, duration)
+
+    import io
+
+    import soundfile as sf
+
+    buffer = io.BytesIO()
+    sf.write(buffer, y, sr, format=format)
+    return buffer.getvalue()
+
+
+def save_audio_fragment(
+    audio: "AudioFile",
+    start: float,
+    end: float,
+    output: str,
+    format: Optional[str] = None,
+) -> "AudioFile":
+    """Save audio fragment with timestamped filename.
+    Supports local and remote storage upload."""
+    if start < 0 or end < 0 or start >= end:
+        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+
+    if format is None:
+        format = audio.get_file_ext()
+
+    duration = end - start
+    start_ms = int(start * 1000)
+    end_ms = int(end * 1000)
+    output_file = posixpath.join(
+        output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+    )
+
+    try:
+        audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+
+        from datachain.lib.file import AudioFile
+
+        return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+
+    except Exception as exc:
+        raise FileError(
+            "unable to save audio fragment", audio.source, audio.path
+        ) from exc
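
These helpers back the `AudioFile`/`AudioFragment` API added to `datachain/lib/file.py` below. A rough sketch of direct use, assuming `audio_file` is an `AudioFile` obtained from a chain and the `audio` extra is installed (bucket path is made up):

```py
from datachain.lib.audio import audio_info, save_audio_fragment

meta = audio_info(audio_file)  # Audio(sample_rate=..., channels=..., duration=..., ...)

# Cut out the first five seconds and upload it as e.g. "<stem>_000000_005000.wav"
clip = save_audio_fragment(audio_file, start=0.0, end=5.0, output="s3://bucket/clips")
```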
datachain/lib/convert/sql_to_python.py CHANGED
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
         type_ = sql_exp.type.python_type
         if type_ == Decimal:
             type_ = float
+        elif type_ is list:
+            if hasattr(sql_exp.type, "item_type") and hasattr(
+                sql_exp.type.item_type, "python_type"
+            ):
+                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
+                type_ = list[item_type]  # type: ignore[valid-type]
+            else:
+                type_ = list
     except NotImplementedError:
         type_ = str
     return type_
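
Roughly what this buys: array-typed SQL expressions now resolve to a parameterized Python type when the element type is known, instead of a bare `list`. A hedged sketch using a plain SQLAlchemy column (datachain's own array SQL type is assumed to behave analogously, exposing `item_type` and `python_type`):

```py
import sqlalchemy as sa

from datachain.lib.convert.sql_to_python import sql_to_python

tags = sa.column("tags", sa.ARRAY(sa.String()))
print(sql_to_python(tags))  # with this change: list[str]; previously: list
```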
datachain/lib/dc/datachain.py CHANGED
@@ -15,6 +15,7 @@ from typing import (
     Optional,
     TypeVar,
     Union,
+    cast,
     overload,
 )
 
@@ -39,14 +40,15 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
-from datachain.lib.signal_schema import SignalSchema
+from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
-from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
+from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
@@ -758,11 +760,12 @@ class DataChain:
     @delta_disabled
     def agg(
         self,
+        /,
         func: Optional[Callable] = None,
         partition_by: Optional[PartitionByType] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: OutputType = None,
-        **signal_map,
+        **signal_map: Callable,
     ) -> "Self":
         """Aggregate rows using `partition_by` statement and apply a function to the
         groups of aggregated rows. The function needs to return new objects for each
@@ -772,12 +775,28 @@ class DataChain:
 
         This method bears similarity to `gen()` and `map()`, employing a comparable set
         of parameters, yet differs in two crucial aspects:
+
         1. The `partition_by` parameter: This specifies the column name or a list of
            column names that determine the grouping criteria for aggregation.
         2. Group-based UDF function input: Instead of individual rows, the function
-           receives a list all rows within each group defined by `partition_by`.
+           receives a list of all rows within each group defined by `partition_by`.
+
+        If `partition_by` is not set or is an empty list, all rows will be placed
+        into a single group.
+
+        Parameters:
+            func: Function applied to each group of rows.
+            partition_by: Column name(s) to group by. If None, all rows go
+                into one group.
+            params: List of column names used as input for the function. Default is
+                taken from function signature.
+            output: Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature.
+            **signal_map: kwargs can be used to define `func` together with its return
+                signal name in format of `agg(result_column=my_func)`.
 
         Examples:
+            Basic aggregation with lambda function:
            ```py
            chain = chain.agg(
                total=lambda category, amount: [sum(amount)],
@@ -788,7 +807,6 @@ class DataChain:
            ```
 
            An alternative syntax, when you need to specify a more complex function:
-
            ```py
            # It automatically resolves which columns to pass to the function
            # by looking at the function signature.
@@ -806,10 +824,43 @@ class DataChain:
            )
            chain.save("new_dataset")
            ```
+
+            Using complex signals for partitioning (`File` or any Pydantic `BaseModel`):
+            ```py
+            def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
+                yield files[0], sum(f.size for f in files)
+
+            chain = chain.agg(
+                my_agg,
+                params=("file",),
+                output={"file": File, "total": int},
+                partition_by="file",  # Column referring to all sub-columns of File
+            )
+            chain.save("new_dataset")
+            ```
+
+            Aggregating all rows into a single group (when `partition_by` is not set):
+            ```py
+            chain = chain.agg(
+                total_size=lambda file, size: [sum(size)],
+                output=int,
+                # No partition_by specified - all rows go into one group
+            )
+            chain.save("new_dataset")
+            ```
+
+            Multiple partition columns:
+            ```py
+            chain = chain.agg(
+                total=lambda category, subcategory, amount: [sum(amount)],
+                output=float,
+                partition_by=["category", "subcategory"],
+            )
+            chain.save("new_dataset")
+            ```
         """
-        # Convert string partition_by parameters to Column objects
-        processed_partition_by = partition_by
         if partition_by is not None:
+            # Convert string partition_by parameters to Column objects
             if isinstance(partition_by, (str, Function, ColumnElement)):
                 list_partition_by = [partition_by]
             else:
@@ -818,10 +869,10 @@ class DataChain:
             processed_partition_columns: list[ColumnElement] = []
             for col in list_partition_by:
                 if isinstance(col, str):
-                    col_db_name = ColumnMeta.to_db_name(col)
-                    col_type = self.signals_schema.get_column_type(col_db_name)
-                    column = Column(col_db_name, python_to_sql(col_type))
-                    processed_partition_columns.append(column)
+                    columns = self.signals_schema.db_signals(name=col, as_columns=True)
+                    if not columns:
+                        raise SignalResolvingError([col], "is not found")
+                    processed_partition_columns.extend(cast("list[Column]", columns))
                 elif isinstance(col, Function):
                     column = col.get_column(self.signals_schema)
                     processed_partition_columns.append(column)
@@ -830,6 +881,8 @@ class DataChain:
                     processed_partition_columns.append(col)
 
             processed_partition_by = processed_partition_columns
+        else:
+            processed_partition_by = []
 
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
@@ -969,7 +1022,7 @@ class DataChain:
         )
 
     @delta_disabled  # type: ignore[arg-type]
-    def group_by(
+    def group_by(  # noqa: C901, PLR0912
         self,
         *,
         partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
@@ -988,6 +1041,15 @@ class DataChain:
                partition_by=("file_source", "file_ext"),
            )
            ```
+
+            Using complex signals:
+            ```py
+            chain = chain.group_by(
+                total_size=func.sum("file.size"),
+                count=func.count(),
+                partition_by="file",  # Uses column name, expands to File's unique keys
+            )
+            ```
         """
         if partition_by is None:
             partition_by = []
@@ -998,20 +1060,61 @@ class DataChain:
         signal_columns: list[Column] = []
         schema_fields: dict[str, DataType] = {}
         keep_columns: list[str] = []
+        partial_fields: list[str] = []  # Track specific fields for partial creation
+        schema_partition_by: list[str] = []
 
-        # validate partition_by columns and add them to the schema
         for col in partition_by:
             if isinstance(col, str):
-                col_db_name = ColumnMeta.to_db_name(col)
-                col_type = self.signals_schema.get_column_type(col_db_name)
-                column = Column(col_db_name, python_to_sql(col_type))
-                if col not in keep_columns:
-                    keep_columns.append(col)
+                columns = self.signals_schema.db_signals(name=col, as_columns=True)
+                if not columns:
+                    raise SignalResolvingError([col], "is not found")
+                partition_by_columns.extend(cast("list[Column]", columns))
+
+                # For nested field references (e.g., "nested.level1.name"),
+                # we need to distinguish between:
+                # 1. References to fields within a complex signal (create partials)
+                # 2. Deep nested references that should be flattened
+                if "." in col:
+                    # Split the column reference to analyze it
+                    parts = col.split(".")
+                    parent_signal = parts[0]
+                    parent_type = self.signals_schema.values.get(parent_signal)
+
+                    if ModelStore.is_partial(parent_type):
+                        if parent_signal not in keep_columns:
+                            keep_columns.append(parent_signal)
+                        partial_fields.append(col)
+                        schema_partition_by.append(col)
+                    else:
+                        # BaseModel or other - add flattened columns directly
+                        for column in cast("list[Column]", columns):
+                            col_type = self.signals_schema.get_column_type(column.name)
+                            schema_fields[column.name] = col_type
+                        schema_partition_by.append(col)
+                else:
+                    # simple signal - but we need to check if it's a complex signal
+                    # complex signal - only include the columns used for partitioning
+                    col_type = self.signals_schema.get_column_type(
+                        col, with_subtree=True
+                    )
+                    if isinstance(col_type, type) and issubclass(col_type, BaseModel):
+                        # Complex signal - add only the partitioning columns
+                        for column in cast("list[Column]", columns):
+                            col_type = self.signals_schema.get_column_type(column.name)
+                            schema_fields[column.name] = col_type
+                        schema_partition_by.append(col)
+                    # Simple signal - keep the entire signal
+                    else:
+                        if col not in keep_columns:
+                            keep_columns.append(col)
+                        schema_partition_by.append(col)
             elif isinstance(col, Function):
                 column = col.get_column(self.signals_schema)
                 col_db_name = column.name
                 col_type = column.type.python_type
                 schema_fields[col_db_name] = col_type
+                partition_by_columns.append(column)
+                signal_columns.append(column)
             else:
                 raise DataChainColumnError(
                     col,
@@ -1020,9 +1123,7 @@ class DataChain:
                         " but expected str or Function"
                     ),
                 )
-            partition_by_columns.append(column)
 
-        # validate signal columns and add them to the schema
         if not kwargs:
             raise ValueError("At least one column should be provided for group_by")
         for col_name, func in kwargs.items():
@@ -1035,9 +1136,9 @@ class DataChain:
             signal_columns.append(column)
             schema_fields[col_name] = func.get_result_type(self.signals_schema)
 
-        signal_schema = SignalSchema(schema_fields)
-        if keep_columns:
-            signal_schema |= self.signals_schema.to_partial(*keep_columns)
+        signal_schema = self.signals_schema.group_by(
+            schema_partition_by, signal_columns
+        )
 
         return self._evolve(
             query=self._query.group_by(signal_columns, partition_by_columns),
@@ -1166,6 +1267,7 @@ class DataChain:
         db_signals = self._effective_signals_schema.db_signals(
             include_hidden=include_hidden
         )
+
        with self._query.ordered_select(*db_signals).as_iterable() as rows:
            if row_factory:
                rows = (row_factory(db_signals, r) for r in rows)  # type: ignore[assignment]
datachain/lib/dc/datasets.py CHANGED
@@ -376,7 +376,7 @@ def move_dataset(
            the namespace and project, or a regular name. If a regular name is used,
            default values will be applied. The source dataset will no longer exist
            after the move.
-        dst: The destination dataset name. This can also be a fully qualified
+        dest: The destination dataset name. This can also be a fully qualified
            name with a namespace and project, or just a regular name (default values
            will be used in that case). The original dataset will be moved here.
        session: An optional session instance. If not provided, the default session
datachain/lib/file.py CHANGED
@@ -43,7 +43,7 @@ logger = logging.getLogger("datachain")
43
43
  # how to create file path when exporting
44
44
  ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
45
45
 
46
- FileType = Literal["binary", "text", "image", "video"]
46
+ FileType = Literal["binary", "text", "image", "video", "audio"]
47
47
  EXPORT_FILES_MAX_THREADS = 5
48
48
 
49
49
 
@@ -312,6 +312,14 @@ class File(DataModel):
312
312
  file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
313
313
  return file
314
314
 
315
+ def as_audio_file(self) -> "AudioFile":
316
+ """Convert the file to a `AudioFile` object."""
317
+ if isinstance(self, AudioFile):
318
+ return self
319
+ file = AudioFile(**self.model_dump())
320
+ file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
321
+ return file
322
+
315
323
  @classmethod
316
324
  def upload(
317
325
  cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
@@ -851,6 +859,157 @@ class VideoFile(File):
851
859
  start += duration
852
860
 
853
861
 
862
+ class AudioFile(File):
863
+ """
864
+ A data model for handling audio files.
865
+
866
+ This model inherits from the `File` model and provides additional functionality
867
+ for reading audio files, extracting audio fragments, and splitting audio into
868
+ fragments.
869
+ """
870
+
871
+ def get_info(self) -> "Audio":
872
+ """
873
+ Retrieves metadata and information about the audio file. It does not
874
+ download the file if possible, only reads its header. It is thus might be
875
+ a good idea to disable caching and prefetching for UDF if you only need
876
+ audio metadata.
877
+
878
+ Returns:
879
+ Audio: A Model containing audio metadata such as duration,
880
+ sample rate, channels, and codec details.
881
+ """
882
+ from .audio import audio_info
883
+
884
+ return audio_info(self)
885
+
886
+ def get_fragment(self, start: float, end: float) -> "AudioFragment":
887
+ """
888
+ Returns an audio fragment from the specified time range. It does not
889
+ download the file, neither it actually extracts the fragment. It returns
890
+ a Model representing the audio fragment, which can be used to read or save
891
+ it later.
892
+
893
+ Args:
894
+ start (float): The start time of the fragment in seconds.
895
+ end (float): The end time of the fragment in seconds.
896
+
897
+ Returns:
898
+ AudioFragment: A Model representing the audio fragment.
899
+ """
900
+ if start < 0 or end < 0 or start >= end:
901
+ raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
902
+
903
+ return AudioFragment(audio=self, start=start, end=end)
904
+
905
+ def get_fragments(
906
+ self,
907
+ duration: float,
908
+ start: float = 0,
909
+ end: Optional[float] = None,
910
+ ) -> "Iterator[AudioFragment]":
911
+ """
912
+ Splits the audio into multiple fragments of a specified duration.
913
+
914
+ Args:
915
+ duration (float): The duration of each audio fragment in seconds.
916
+ start (float): The starting time in seconds (default: 0).
917
+ end (float, optional): The ending time in seconds. If None, the entire
918
+ remaining audio is processed (default: None).
919
+
920
+ Returns:
921
+ Iterator[AudioFragment]: An iterator yielding audio fragments.
922
+
923
+ Note:
924
+ If end is not specified, number of samples will be taken from the
925
+ audio file, this means audio file needs to be downloaded.
926
+ """
927
+ if duration <= 0:
928
+ raise ValueError("duration must be a positive float")
929
+ if start < 0:
930
+ raise ValueError("start must be a non-negative float")
931
+
932
+ if end is None:
933
+ end = self.get_info().duration
934
+
935
+ if end < 0:
936
+ raise ValueError("end must be a non-negative float")
937
+ if start >= end:
938
+ raise ValueError("start must be less than end")
939
+
940
+ while start < end:
941
+ yield self.get_fragment(start, min(start + duration, end))
942
+ start += duration
943
+
944
+
945
+ class AudioFragment(DataModel):
946
+ """
947
+ A data model for representing an audio fragment.
948
+
949
+ This model represents a specific fragment within an audio file with defined
950
+ start and end times. It allows access to individual fragments and provides
951
+ functionality for reading and saving audio fragments as separate audio files.
952
+
953
+ Attributes:
954
+ audio (AudioFile): The audio file containing the audio fragment.
955
+ start (float): The starting time of the audio fragment in seconds.
956
+ end (float): The ending time of the audio fragment in seconds.
957
+ """
958
+
959
+ audio: AudioFile
960
+ start: float
961
+ end: float
962
+
963
+ def get_np(self) -> tuple["ndarray", int]:
964
+ """
965
+ Returns the audio fragment as a NumPy array with sample rate.
966
+
967
+ Returns:
968
+ tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
969
+ and the sample rate.
970
+ """
971
+ from .audio import audio_fragment_np
972
+
973
+ duration = self.end - self.start
974
+ return audio_fragment_np(self.audio, self.start, duration)
975
+
976
+ def read_bytes(self, format: str = "wav") -> bytes:
977
+ """
978
+ Returns the audio fragment as audio bytes.
979
+
980
+ Args:
981
+ format (str): The desired audio format (e.g., 'wav', 'mp3').
982
+ Defaults to 'wav'.
983
+
984
+ Returns:
985
+ bytes: The encoded audio fragment as bytes.
986
+ """
987
+ from .audio import audio_fragment_bytes
988
+
989
+ duration = self.end - self.start
990
+ return audio_fragment_bytes(self.audio, self.start, duration, format)
991
+
992
+ def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
993
+ """
994
+ Saves the audio fragment as a new audio file.
995
+
996
+ If `output` is a remote path, the audio file will be uploaded to remote storage.
997
+
998
+ Args:
999
+ output (str): The destination path, which can be a local file path
1000
+ or a remote URL.
1001
+ format (str, optional): The output audio format (e.g., 'wav', 'mp3').
1002
+ If None, the format is inferred from the
1003
+ file extension.
1004
+
1005
+ Returns:
1006
+ AudioFile: A Model representing the saved audio file.
1007
+ """
1008
+ from .audio import save_audio_fragment
1009
+
1010
+ return save_audio_fragment(self.audio, self.start, self.end, output, format)
1011
+
1012
+
854
1013
  class VideoFrame(DataModel):
855
1014
  """
856
1015
  A data model for representing a video frame.
@@ -981,6 +1140,34 @@ class Video(DataModel):
981
1140
  codec: str = Field(default="")
982
1141
 
983
1142
 
1143
+ class Audio(DataModel):
1144
+ """
1145
+ A data model representing metadata for an audio file.
1146
+
1147
+ Attributes:
1148
+ sample_rate (int): The sample rate of the audio (samples per second).
1149
+ Defaults to -1 if unknown.
1150
+ channels (int): The number of audio channels. Defaults to -1 if unknown.
1151
+ duration (float): The total duration of the audio in seconds.
1152
+ Defaults to -1.0 if unknown.
1153
+ samples (int): The total number of samples in the audio.
1154
+ Defaults to -1 if unknown.
1155
+ format (str): The format of the audio file (e.g., 'wav', 'mp3').
1156
+ Defaults to an empty string.
1157
+ codec (str): The codec used for encoding the audio. Defaults to an empty string.
1158
+ bit_rate (int): The bit rate of the audio in bits per second.
1159
+ Defaults to -1 if unknown.
1160
+ """
1161
+
1162
+ sample_rate: int = Field(default=-1)
1163
+ channels: int = Field(default=-1)
1164
+ duration: float = Field(default=-1.0)
1165
+ samples: int = Field(default=-1)
1166
+ format: str = Field(default="")
1167
+ codec: str = Field(default="")
1168
+ bit_rate: int = Field(default=-1)
1169
+
1170
+
984
1171
  class ArrowRow(DataModel):
985
1172
  """`DataModel` for reading row from Arrow-supported file."""
986
1173
 
@@ -1018,5 +1205,7 @@ def get_file_type(type_: FileType = "binary") -> type[File]:
1018
1205
  file = ImageFile # type: ignore[assignment]
1019
1206
  elif type_ == "video":
1020
1207
  file = VideoFile
1208
+ elif type_ == "audio":
1209
+ file = AudioFile
1021
1210
 
1022
1211
  return file
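
A rough end-to-end sketch of how these models might be used from a chain. The bucket path and signal names are hypothetical, and it assumes `read_storage` accepts the new `type="audio"` value wired up in `get_file_type` above:

```py
import datachain as dc
from datachain import Audio, C

chain = (
    dc.read_storage("s3://my-bucket/podcasts/", type="audio")
    .map(info=lambda file: file.get_info(), output=Audio)
    .filter(C("info.duration") > 60)
)
```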
datachain/lib/model_store.py CHANGED
@@ -81,3 +81,11 @@ class ModelStore:
         if val is None or not ModelStore.is_pydantic(val):
             return None
         return val
+
+    @staticmethod
+    def is_partial(parent_type) -> bool:
+        return (
+            parent_type
+            and ModelStore.is_pydantic(parent_type)
+            and "@" in ModelStore.get_name(parent_type)
+        )
datachain/lib/signal_schema.py CHANGED
@@ -446,14 +446,14 @@ class SignalSchema:
             res[db_name] = python_to_sql(type_)
         return res
 
-    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
+    def row_to_objs(self, row: Sequence[Any]) -> list[Any]:
         self._init_setup_values()
 
-        objs: list[DataValue] = []
+        objs: list[Any] = []
         pos = 0
         for name, fr_type in self.values.items():
-            if self.setup_values and (val := self.setup_values.get(name, None)):
-                objs.append(val)
+            if self.setup_values and name in self.setup_values:
+                objs.append(self.setup_values.get(name))
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
                 objs.append(fr(**j))
@@ -589,6 +589,9 @@ class SignalSchema:
         ]
 
         if name:
+            if "." in name:
+                name = name.replace(".", "__")
+
             signals = [
                 s
                 for s in signals
@@ -625,6 +628,15 @@ class SignalSchema:
 
         return curr_type
 
+    def group_by(
+        self, partition_by: Sequence[str], new_column: Sequence[Column]
+    ) -> "SignalSchema":
+        orig_schema = SignalSchema(copy.deepcopy(self.values))
+        schema = orig_schema.to_partial(*partition_by)
+
+        vals = {c.name: sql_to_python(c) for c in new_column}
+        return SignalSchema(schema.values | vals)
+
     def select_except_signals(self, *args: str) -> "SignalSchema":
         def has_signal(signal: str):
             signal = signal.replace(".", DEFAULT_DELIMITER)
@@ -888,7 +900,7 @@ class SignalSchema:
 
         return res
 
-    def to_partial(self, *columns: str) -> "SignalSchema":
+    def to_partial(self, *columns: str) -> "SignalSchema":  # noqa: C901
         """
         Convert the schema to a partial schema with only the specified columns.
 
@@ -931,9 +943,15 @@ class SignalSchema:
         partial_versions: dict[str, int] = {}
 
         def _type_name_to_partial(signal_name: str, type_name: str) -> str:
-            if "@" not in type_name:
+            # Check if we need to create a partial for this type
+            # Only create partials for custom types that are in the custom_types dict
+            if type_name not in custom_types:
                 return type_name
-            model_name, _ = ModelStore.parse_name_version(type_name)
+
+            if "@" in type_name:
+                model_name, _ = ModelStore.parse_name_version(type_name)
+            else:
+                model_name = type_name
 
             if signal_name not in signal_partials:
                 partial_versions.setdefault(model_name, 0)
@@ -957,6 +975,14 @@ class SignalSchema:
            parent_type_partial = _type_name_to_partial(signal, parent_type)
 
            schema[signal] = parent_type_partial
+
+            # If this is a complex signal without field specifier (just "file")
+            # and it's a custom type, include the entire complex signal
+            if len(column_parts) == 1 and parent_type in custom_types:
+                # Include the entire complex signal - no need to create partial
+                schema[signal] = parent_type
+                continue
+
            continue
 
        if parent_type not in custom_types:
@@ -971,6 +997,20 @@ class SignalSchema:
                f"Field {signal} not found in custom type {parent_type}"
            )
 
+            # Check if this is the last part and if the column type is a complex
+            is_last_part = i == len(column_parts) - 1
+            is_complex_signal = signal_type in custom_types
+
+            if is_last_part and is_complex_signal:
+                schema[column] = signal_type
+                # Also need to remove the partial schema entry we created for the
+                # parent since we're promoting the nested complex column to root
+                parent_signal = column_parts[0]
+                schema.pop(parent_signal, None)
+                # Don't create partial types for this case
+                break
+
+            # Create partial type for this field
            partial_type = _type_name_to_partial(
                ".".join(column_parts[: i + 1]),
                signal_type,
datachain/lib/udf.py CHANGED
@@ -13,8 +13,7 @@ from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
-from datachain.lib.data_model import DataValue
-from datachain.lib.file import File
+from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
 from datachain.query.batch import (
     Batch,
@@ -266,15 +265,28 @@ class UDFBase(AbstractUDF):
 
     def _parse_row(
         self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
-    ) -> list[DataValue]:
+    ) -> list[Any]:
         assert self.params
         row = [row_dict[p] for p in self.params.to_udf_spec()]
         obj_row = self.params.row_to_objs(row)
         for obj in obj_row:
-            if isinstance(obj, File):
-                obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+            self._set_stream_recursive(obj, catalog, cache, download_cb)
         return obj_row
 
+    def _set_stream_recursive(
+        self, obj: Any, catalog: "Catalog", cache: bool, download_cb: Callback
+    ) -> None:
+        """Recursively set the catalog stream on all File objects within an object."""
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+
+        # Check all fields for nested File objects, but only for DataModel objects
+        if isinstance(obj, DataModel):
+            for field_name in obj.model_fields:
+                field_value = getattr(obj, field_name, None)
+                if isinstance(field_value, DataModel):
+                    self._set_stream_recursive(field_value, catalog, cache, download_cb)
+
     def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
         return self._parse_row(row_dict, catalog, cache, download_cb)
datachain/query/dataset.py CHANGED
@@ -1031,16 +1031,22 @@ class SQLGroupBy(SQLClause):
            c.get_column() if isinstance(c, Function) else c for c in self.group_by
        ]
 
-        cols = [
-            c.get_column()
-            if isinstance(c, Function)
-            else subquery.c[str(c)]
-            if isinstance(c, (str, C))
-            else c
-            for c in (*group_by, *self.cols)
-        ]
+        cols_dict: dict[str, Any] = {}
+        for c in (*group_by, *self.cols):
+            if isinstance(c, Function):
+                key = c.name
+                value = c.get_column()
+            elif isinstance(c, (str, C)):
+                key = str(c)
+                value = subquery.c[str(c)]
+            else:
+                key = c.name
+                value = c  # type: ignore[assignment]
+            cols_dict[key] = value
+
+        unique_cols = cols_dict.values()
 
-        return sqlalchemy.select(*cols).select_from(subquery).group_by(*group_by)
+        return sqlalchemy.select(*unique_cols).select_from(subquery).group_by(*group_by)
 
 
 def _validate_columns(
datachain-0.26.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.25.1
+Version: 0.26.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -63,6 +63,9 @@ Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: audio
+Requires-Dist: torchaudio; extra == "audio"
+Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -78,7 +81,7 @@ Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
 Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
+Requires-Dist: datachain[audio,hf,remote,torch,vector,video]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -108,6 +111,7 @@ Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
+Requires-Dist: openai; extra == "examples"
 Dynamic: license-file
 
 ================
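
On the packaging side, the new `audio` extra mirrors the existing `video` and `torch` extras: it pulls in `torchaudio` and `soundfile`, matching the hint raised by `datachain/lib/audio.py` when the dependencies are missing, i.e. `pip install 'datachain[audio]'`.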
datachain-0.26.0.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-datachain/__init__.py,sha256=ofXacfzLKYzTqU1oyHz5xZi1L4skQCoJdUMC4YARenk,1616
+datachain/__init__.py,sha256=2TZ8ptSB9BtnYF31mDEhWG9N16EQ5pf9vNqQaFr2txs,1712
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
@@ -71,24 +71,25 @@ datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
+datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
-datachain/lib/file.py,sha256=gTzJXaGIyFOrw_B4yiOEs7U23n4oAQuWDI2v9KWwp2o,33889
+datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
 datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
-datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
+datachain/lib/model_store.py,sha256=dkL2rcT5ag-kbgkhQPL_byEs-TCYr29qvdltroL5NxM,2734
 datachain/lib/namespaces.py,sha256=it52UbbwB8dzhesO2pMs_nThXiPQ1Ph9sD9I3GQkg5s,2099
 datachain/lib/projects.py,sha256=8lN0qV8czX1LGtWURCUvRlSJk-RpO9w9Rra_pOZus6g,2595
 datachain/lib/pytorch.py,sha256=oBBd6cxYrcwaFz7IQajKqhGqDdNnwUZWs0wJPRizrjk,7712
 datachain/lib/settings.py,sha256=9wi0FoHxRxNiyn99pR28IYsMkoo47jQxeXuObQr2Ar0,2929
-datachain/lib/signal_schema.py,sha256=dVEqqrQQ_BS3yzU_49-Gari7IjVyMl1UT8h1WIsZabs,36489
+datachain/lib/signal_schema.py,sha256=UGbjG6yJKIU2i4B6z9AK1rqaPWtxRjsPnCV6GYbNqGg,38329
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=3uITkhO8IZnX49aePheObzd5ORYi2DIDYZVMQlBAJ-s,16687
+datachain/lib/udf.py,sha256=nkcB3HNtSteUspwsGmOKyy3mH2F-Sfo6iW64-Ep47-I,17299
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
 datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
@@ -97,14 +98,14 @@ datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0E
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
 datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
-datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
+datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
 datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
-datachain/lib/dc/datasets.py,sha256=eBhcybEeXHcQ_7RweRCh5uJyF5Ym1EEDPmD0YWYDPHw,15097
+datachain/lib/dc/datachain.py,sha256=YJYHp94yTWjd_ZmBXEUOHVeEvOb5jOhjIxgtqu1dnW4,91746
+datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -125,7 +126,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
 datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=t9EWZkJGPRPcBvKOsFO7ZiaTeUXc8YuTZydRbcv83_E,61350
+datachain/query/dataset.py,sha256=bhJpm53tNLQzGECuR1nC1tg2Vd6foq6AKST5h1rb41U,61606
 datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -157,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.25.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.25.1.dist-info/METADATA,sha256=NaMV5K1wxCrOI7zW8agwmNfDMMkJJgaQ2fNX2PsuHnc,13385
-datachain-0.25.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.25.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.25.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.25.1.dist-info/RECORD,,
+datachain-0.26.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.26.0.dist-info/METADATA,sha256=4-DhUSU6ciIc8iUiB4UAh1ZKyFczvN5rHZnvd1x2Y9U,13543
+datachain-0.26.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.26.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.26.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.26.0.dist-info/RECORD,,