datachain 0.28.0__py3-none-any.whl → 0.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/dc/datachain.py +9 -4
- datachain/lib/file.py +53 -1
- datachain/lib/utils.py +96 -0
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/METADATA +1 -1
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/RECORD +9 -9
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/WHEEL +0 -0
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.28.0.dist-info → datachain-0.28.1.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
|
@@ -2419,9 +2419,11 @@ class DataChain:
|
|
|
2419
2419
|
ds.to_storage("gs://mybucket", placement="filename")
|
|
2420
2420
|
```
|
|
2421
2421
|
"""
|
|
2422
|
+
chain = self.persist()
|
|
2423
|
+
count = chain.count()
|
|
2424
|
+
|
|
2422
2425
|
if placement == "filename" and (
|
|
2423
|
-
|
|
2424
|
-
!= self._query.count()
|
|
2426
|
+
chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
|
|
2425
2427
|
):
|
|
2426
2428
|
raise ValueError("Files with the same name found")
|
|
2427
2429
|
|
|
@@ -2433,7 +2435,7 @@ class DataChain:
|
|
|
2433
2435
|
unit=" files",
|
|
2434
2436
|
unit_scale=True,
|
|
2435
2437
|
unit_divisor=10,
|
|
2436
|
-
total=
|
|
2438
|
+
total=count,
|
|
2437
2439
|
leave=False,
|
|
2438
2440
|
)
|
|
2439
2441
|
file_exporter = FileExporter(
|
|
@@ -2444,7 +2446,10 @@ class DataChain:
|
|
|
2444
2446
|
max_threads=num_threads or 1,
|
|
2445
2447
|
client_config=client_config,
|
|
2446
2448
|
)
|
|
2447
|
-
file_exporter.run(
|
|
2449
|
+
file_exporter.run(
|
|
2450
|
+
(rows[0] for rows in chain.to_iter(signal)),
|
|
2451
|
+
progress_bar,
|
|
2452
|
+
)
|
|
2448
2453
|
|
|
2449
2454
|
def shuffle(self) -> "Self":
|
|
2450
2455
|
"""Shuffle the rows of the chain deterministically."""
|
datachain/lib/file.py
CHANGED
|
@@ -23,7 +23,7 @@ from pydantic import Field, field_validator
|
|
|
23
23
|
|
|
24
24
|
from datachain.client.fileslice import FileSlice
|
|
25
25
|
from datachain.lib.data_model import DataModel
|
|
26
|
-
from datachain.lib.utils import DataChainError
|
|
26
|
+
from datachain.lib.utils import DataChainError, rebase_path
|
|
27
27
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
28
28
|
from datachain.sql.types import JSON, Boolean, DateTime, Int, String
|
|
29
29
|
from datachain.utils import TIME_ZERO
|
|
@@ -634,6 +634,40 @@ class File(DataModel):
|
|
|
634
634
|
location=self.location,
|
|
635
635
|
)
|
|
636
636
|
|
|
637
|
+
def rebase(
    self,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Return this file's URI moved from one base directory to another.

    The URI is taken from `self.get_uri()` and handed to `rebase_path`
    together with the arguments below; the file object itself is not
    modified.

    Args:
        old_base: Base directory to strip from the file's URI
        new_base: Base directory to put in its place
        suffix: Optional text appended to the file name, before the extension
        extension: Optional replacement file extension (without dot)

    Returns:
        str: The rebased URI

    Raises:
        ValueError: If old_base is not found in the file's URI

    Examples:
        >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
        >>> file.rebase("s3://bucket/data", "s3://output-bucket/processed",
        ...             extension="mp3")
        's3://output-bucket/processed/2025-05-27/file.mp3'

        >>> file = File(source="s3://bucket", path="data/audio/file.wav")
        >>> file.rebase("data/audio", "/local/output", suffix="_ch1",
        ...             extension="npy")
        '/local/output/file_ch1.npy'
    """
    current_uri = self.get_uri()
    return rebase_path(current_uri, old_base, new_base, suffix, extension)
|
|
670
|
+
|
|
637
671
|
|
|
638
672
|
def resolve(file: File) -> File:
|
|
639
673
|
"""
|
|
@@ -1219,6 +1253,24 @@ class Audio(DataModel):
|
|
|
1219
1253
|
codec: str = Field(default="")
|
|
1220
1254
|
bit_rate: int = Field(default=-1)
|
|
1221
1255
|
|
|
1256
|
+
@staticmethod
def get_channel_name(num_channels: int, channel_idx: int) -> str:
    """
    Return a conventional label for one channel of an audio layout.

    Known layouts (1, 2, 4, 6 and 8 channels) map each index to a named
    channel. For any other channel count, or an index outside the layout,
    the generic 1-based label ``Ch<channel_idx + 1>`` is returned.

    Args:
        num_channels: Total number of channels in the audio stream
        channel_idx: Zero-based index of the channel to name

    Returns:
        str: The channel label
    """
    layouts = {
        1: ("Mono",),
        2: ("Left", "Right"),
        # First-order Ambisonics
        4: ("W", "X", "Y", "Z"),
        # 5.1 surround
        6: ("FL", "FR", "FC", "LFE", "BL", "BR"),
        # 7.1 surround
        8: ("FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"),
    }

    # An unknown layout yields an empty tuple, so the range check below
    # fails and the generic label is used.
    names = layouts.get(num_channels, ())
    if 0 <= channel_idx < len(names):
        return names[channel_idx]

    return f"Ch{channel_idx + 1}"
|
|
1273
|
+
|
|
1222
1274
|
|
|
1223
1275
|
class ArrowRow(DataModel):
|
|
1224
1276
|
"""`DataModel` for reading row from Arrow-supported file."""
|
datachain/lib/utils.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from collections.abc import Sequence
|
|
4
|
+
from pathlib import PurePosixPath
|
|
5
|
+
from urllib.parse import urlparse
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
class AbstractUDF(ABC):
|
|
@@ -57,3 +59,97 @@ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
|
|
|
57
59
|
new_col_names[generated_column] = org_column
|
|
58
60
|
|
|
59
61
|
return new_col_names
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def rebase_path(
    src_path: str,
    old_base: str,
    new_base: str,
    suffix: str = "",
    extension: str = "",
) -> str:
    """
    Rebase a file path from one base directory to another.

    Arguments that carry a URI scheme (e.g. ``s3://``) are reduced to
    ``netloc + path`` before matching; arguments without a scheme are used
    as given. Only those two URL components are used, so a query string or
    fragment recognised by ``urlparse`` is not carried over.

    ``old_base`` is located as a plain substring of the normalized
    ``src_path`` and the first occurrence wins. It may therefore start in
    the middle of the path, and the match is not required to fall on
    path-component boundaries.

    Args:
        src_path: Source file path (can include URI scheme like s3://)
        old_base: Base directory to remove from src_path
        new_base: New base directory to prepend
        suffix: Optional suffix to add before file extension
        extension: Optional new file extension (without dot)

    Returns:
        str: Rebased path with new base directory. When new_base has a URI
            scheme the result is ``<scheme>://<netloc and path>``, otherwise
            it is a plain POSIX-style path.

    Raises:
        ValueError: If old_base is not found in src_path
    """
    # Parse URIs to handle schemes properly
    src_parsed = urlparse(src_path)
    old_base_parsed = urlparse(old_base)
    new_base_parsed = urlparse(new_base)

    # Drop the scheme (if any) but keep the bucket/host as the first segment
    if src_parsed.scheme:
        src_path_only = src_parsed.netloc + src_parsed.path
    else:
        src_path_only = src_path

    if old_base_parsed.scheme:
        old_base_only = old_base_parsed.netloc + old_base_parsed.path
    else:
        old_base_only = old_base

    # Normalize so trailing or duplicate slashes do not affect matching
    src_path_norm = PurePosixPath(src_path_only).as_posix()
    old_base_norm = PurePosixPath(old_base_only).as_posix()

    # A single find() both tests for presence and yields the position
    idx = src_path_norm.find(old_base_norm)
    if idx == -1:
        raise ValueError(f"old_base '{old_base}' not found in src_path")

    # Everything after old_base, minus one separating slash, is kept
    relative_start = idx + len(old_base_norm)
    if relative_start < len(src_path_norm) and src_path_norm[relative_start] == "/":
        relative_start += 1
    path_obj = PurePosixPath(src_path_norm[relative_start:])

    # Build the new file name: stem + suffix, then the new or current extension
    new_ext = f".{extension}" if extension else path_obj.suffix
    new_name = path_obj.stem + suffix + new_ext

    # Re-attach the relative directory, if there is one
    parent = str(path_obj.parent)
    if parent == ".":
        new_relative_path = new_name
    else:
        new_relative_path = str(PurePosixPath(parent) / new_name)

    if new_base_parsed.scheme:
        # Join on netloc + path, then put the scheme back in front
        base_path = PurePosixPath(new_base_parsed.netloc + new_base_parsed.path)
        return f"{new_base_parsed.scheme}://{base_path / new_relative_path}"

    # Regular path
    return str(PurePosixPath(new_base) / new_relative_path)
|
|
@@ -75,7 +75,7 @@ datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
|
|
|
75
75
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
76
76
|
datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
|
|
77
77
|
datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
|
|
78
|
-
datachain/lib/file.py,sha256=
|
|
78
|
+
datachain/lib/file.py,sha256=IGwpCwjsSOpZXlRsatcMKToMmuvYiX6_UtaTjUKAAdg,44511
|
|
79
79
|
datachain/lib/hf.py,sha256=3xdvPQPilnJiGv3H4S4bTGqvrGGlZgZmqjE1n_SMJZg,7293
|
|
80
80
|
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
81
81
|
datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
|
|
@@ -91,7 +91,7 @@ datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
|
|
|
91
91
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
92
92
|
datachain/lib/udf.py,sha256=SUnJWRDC3TlLhvpi8iqqJbeZGn5DChot7DyH-0Q-z20,17305
|
|
93
93
|
datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
|
|
94
|
-
datachain/lib/utils.py,sha256=
|
|
94
|
+
datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
|
|
95
95
|
datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
|
|
96
96
|
datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7214
|
|
97
97
|
datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
|
|
@@ -104,7 +104,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
|
|
|
104
104
|
datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
|
|
105
105
|
datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
|
|
106
106
|
datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
|
|
107
|
-
datachain/lib/dc/datachain.py,sha256=
|
|
107
|
+
datachain/lib/dc/datachain.py,sha256=U2CV8-ewfu-sW1D2BysdqCtbnEA7uNL1ZhYLWPAFB1o,93298
|
|
108
108
|
datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
|
|
109
109
|
datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
|
|
110
110
|
datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
|
|
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
158
158
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
159
159
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
160
160
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
161
|
-
datachain-0.28.
|
|
162
|
-
datachain-0.28.
|
|
163
|
-
datachain-0.28.
|
|
164
|
-
datachain-0.28.
|
|
165
|
-
datachain-0.28.
|
|
166
|
-
datachain-0.28.
|
|
161
|
+
datachain-0.28.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
162
|
+
datachain-0.28.1.dist-info/METADATA,sha256=9rZc1mFjNj6S3v6FjgrhM7bUdi6kO_5606CB7HQCfeo,13766
|
|
163
|
+
datachain-0.28.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
164
|
+
datachain-0.28.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
165
|
+
datachain-0.28.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
166
|
+
datachain-0.28.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|