datachain 0.28.0__py3-none-any.whl → 0.28.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



datachain/lib/dc/datachain.py CHANGED
@@ -2419,9 +2419,11 @@ class DataChain:
             ds.to_storage("gs://mybucket", placement="filename")
             ```
         """
+        chain = self.persist()
+        count = chain.count()
+
         if placement == "filename" and (
-            self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
-            != self._query.count()
+            chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
         ):
             raise ValueError("Files with the same name found")
 
@@ -2433,7 +2435,7 @@ class DataChain:
             unit=" files",
             unit_scale=True,
             unit_divisor=10,
-            total=self.count(),
+            total=count,
             leave=False,
         )
         file_exporter = FileExporter(
@@ -2444,7 +2446,10 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.to_values(signal), progress_bar)
+        file_exporter.run(
+            (rows[0] for rows in chain.to_iter(signal)),
+            progress_bar,
+        )
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
datachain/lib/file.py CHANGED
@@ -23,7 +23,7 @@ from pydantic import Field, field_validator
 
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
-from datachain.lib.utils import DataChainError
+from datachain.lib.utils import DataChainError, rebase_path
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
@@ -634,6 +634,40 @@ class File(DataModel):
             location=self.location,
         )
 
+    def rebase(
+        self,
+        old_base: str,
+        new_base: str,
+        suffix: str = "",
+        extension: str = "",
+    ) -> str:
+        """
+        Rebase the file's URI from one base directory to another.
+
+        Args:
+            old_base: Base directory to remove from the file's URI
+            new_base: New base directory to prepend
+            suffix: Optional suffix to add before file extension
+            extension: Optional new file extension (without dot)
+
+        Returns:
+            str: Rebased URI with new base directory
+
+        Raises:
+            ValueError: If old_base is not found in the file's URI
+
+        Examples:
+            >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
+            >>> file.rebase("s3://bucket/data", "s3://output-bucket/processed", \
+                    extension="mp3")
+            's3://output-bucket/processed/2025-05-27/file.mp3'
+
+            >>> file.rebase("data/audio", "/local/output", suffix="_ch1",
+                    extension="npy")
+            '/local/output/file_ch1.npy'
+        """
+        return rebase_path(self.get_uri(), old_base, new_base, suffix, extension)
+
 
 def resolve(file: File) -> File:
     """
@@ -1219,6 +1253,24 @@ class Audio(DataModel):
     codec: str = Field(default="")
     bit_rate: int = Field(default=-1)
 
+    @staticmethod
+    def get_channel_name(num_channels: int, channel_idx: int) -> str:
+        """Map channel index to meaningful name based on common audio formats"""
+        channel_mappings = {
+            1: ["Mono"],
+            2: ["Left", "Right"],
+            4: ["W", "X", "Y", "Z"],  # First-order Ambisonics
+            6: ["FL", "FR", "FC", "LFE", "BL", "BR"],  # 5.1 surround
+            8: ["FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"],  # 7.1 surround
+        }
+
+        if num_channels in channel_mappings:
+            channels = channel_mappings[num_channels]
+            if 0 <= channel_idx < len(channels):
+                return channels[channel_idx]
+
+        return f"Ch{channel_idx + 1}"
+
 
 class ArrowRow(DataModel):
     """`DataModel` for reading row from Arrow-supported file."""
datachain/lib/utils.py CHANGED
@@ -1,6 +1,8 @@
 import re
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
+from pathlib import PurePosixPath
+from urllib.parse import urlparse
 
 
 class AbstractUDF(ABC):
@@ -57,3 +59,97 @@ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
         new_col_names[generated_column] = org_column
 
     return new_col_names
+
+
+def rebase_path(
+    src_path: str,
+    old_base: str,
+    new_base: str,
+    suffix: str = "",
+    extension: str = "",
+) -> str:
+    """
+    Rebase a file path from one base directory to another.
+
+    Args:
+        src_path: Source file path (can include URI scheme like s3://)
+        old_base: Base directory to remove from src_path
+        new_base: New base directory to prepend
+        suffix: Optional suffix to add before file extension
+        extension: Optional new file extension (without dot)
+
+    Returns:
+        str: Rebased path with new base directory
+
+    Raises:
+        ValueError: If old_base is not found in src_path
+    """
+    # Parse URIs to handle schemes properly
+    src_parsed = urlparse(src_path)
+    old_base_parsed = urlparse(old_base)
+    new_base_parsed = urlparse(new_base)
+
+    # Get the path component (without scheme)
+    if src_parsed.scheme:
+        src_path_only = src_parsed.netloc + src_parsed.path
+    else:
+        src_path_only = src_path
+
+    if old_base_parsed.scheme:
+        old_base_only = old_base_parsed.netloc + old_base_parsed.path
+    else:
+        old_base_only = old_base
+
+    # Normalize paths
+    src_path_norm = PurePosixPath(src_path_only).as_posix()
+    old_base_norm = PurePosixPath(old_base_only).as_posix()
+
+    # Find where old_base appears in src_path
+    if old_base_norm in src_path_norm:
+        # Find the index where old_base appears
+        idx = src_path_norm.find(old_base_norm)
+        if idx == -1:
+            raise ValueError(f"old_base '{old_base}' not found in src_path")
+
+        # Extract the relative path after old_base
+        relative_start = idx + len(old_base_norm)
+        # Skip leading slash if present
+        if relative_start < len(src_path_norm) and src_path_norm[relative_start] == "/":
+            relative_start += 1
+        relative_path = src_path_norm[relative_start:]
+    else:
+        raise ValueError(f"old_base '{old_base}' not found in src_path")
+
+    # Parse the filename
+    path_obj = PurePosixPath(relative_path)
+    stem = path_obj.stem
+    current_ext = path_obj.suffix
+
+    # Apply suffix and extension changes
+    new_stem = stem + suffix if suffix else stem
+    if extension:
+        new_ext = f".{extension}"
+    elif current_ext:
+        new_ext = current_ext
+    else:
+        new_ext = ""
+
+    # Build new filename
+    new_name = new_stem + new_ext
+
+    # Reconstruct path with new base
+    parent = str(path_obj.parent)
+    if parent == ".":
+        new_relative_path = new_name
+    else:
+        new_relative_path = str(PurePosixPath(parent) / new_name)
+
+    # Handle new_base URI scheme
+    if new_base_parsed.scheme:
+        # Has schema like s3://
+        base_path = new_base_parsed.netloc + new_base_parsed.path
+        base_path = PurePosixPath(base_path).as_posix()
+        full_path = str(PurePosixPath(base_path) / new_relative_path)
+        return f"{new_base_parsed.scheme}://{full_path}"
+    # Regular path
+    return str(PurePosixPath(new_base) / new_relative_path)
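Tracing the implementation above, rebase_path handles both URI-style and plain paths; the following calls (paths are illustrative) reproduce the results documented in File.rebase:

    from datachain.lib.utils import rebase_path

    rebase_path(
        "s3://bucket/data/2025-05-27/file.wav",
        "s3://bucket/data",
        "s3://output-bucket/processed",
        extension="mp3",
    )
    # -> 's3://output-bucket/processed/2025-05-27/file.mp3'

    rebase_path("data/audio/file.wav", "data/audio", "/local/output",
                suffix="_ch1", extension="npy")
    # -> '/local/output/file_ch1.npy'

    rebase_path("s3://bucket/other/file.wav", "s3://bucket/data", "/out")
    # raises ValueError: old_base 's3://bucket/data' not found in src_path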
datachain-0.28.0.dist-info/METADATA → datachain-0.28.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.28.0
+Version: 0.28.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
datachain-0.28.0.dist-info/RECORD → datachain-0.28.1.dist-info/RECORD
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
-datachain/lib/file.py,sha256=_ch7xYcpl0kzImgEwccbQ-a5qb9rbEvx1vcuWerOn9k,42608
+datachain/lib/file.py,sha256=IGwpCwjsSOpZXlRsatcMKToMmuvYiX6_UtaTjUKAAdg,44511
 datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
@@ -91,7 +91,7 @@ datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=SUnJWRDC3TlLhvpi8iqqJbeZGn5DChot7DyH-0Q-z20,17305
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
-datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
+datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
 datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7214
 datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
@@ -104,7 +104,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=mLE5v4KhzEQm7HVWBTxY6EwJ2J-YeFVcLUY4I21216c,93212
+datachain/lib/dc/datachain.py,sha256=U2CV8-ewfu-sW1D2BysdqCtbnEA7uNL1ZhYLWPAFB1o,93298
 datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
 datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.28.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.28.0.dist-info/METADATA,sha256=lA3lv9RX2NeQPobrEjoEbAwg5K3zmnAnbDJ_hjR8KLw,13766
-datachain-0.28.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.28.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.28.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.28.0.dist-info/RECORD,,
+datachain-0.28.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.28.1.dist-info/METADATA,sha256=9rZc1mFjNj6S3v6FjgrhM7bUdi6kO_5606CB7HQCfeo,13766
+datachain-0.28.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.28.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.28.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.28.1.dist-info/RECORD,,