datachain 0.10.0__py3-none-any.whl → 0.11.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/cli/__init__.py +1 -0
- datachain/cli/commands/show.py +12 -1
- datachain/cli/parser/utils.py +6 -0
- datachain/lib/data_model.py +6 -0
- datachain/lib/dc.py +91 -20
- datachain/lib/file.py +52 -11
- datachain/lib/signal_schema.py +194 -15
- datachain/nodes_thread_pool.py +32 -11
- datachain/script_meta.py +147 -0
- datachain/utils.py +3 -0
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/METADATA +5 -4
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/RECORD +16 -15
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/WHEEL +1 -1
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/LICENSE +0 -0
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/top_level.txt +0 -0
datachain/cli/__init__.py
CHANGED
datachain/cli/commands/show.py
CHANGED
@@ -1,6 +1,8 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional

+from datachain.lib.signal_schema import SignalSchema
+
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

@@ -14,6 +16,7 @@ def show(
     columns: Sequence[str] = (),
     no_collapse: bool = False,
     schema: bool = False,
+    include_hidden: bool = False,
 ) -> None:
     from datachain import Session
     from datachain.lib.dc import DataChain
@@ -23,6 +26,13 @@ def show(
     dataset = catalog.get_dataset(name)
     dataset_version = dataset.get_version(version or dataset.latest_version)

+    if include_hidden:
+        hidden_fields = []
+    else:
+        hidden_fields = SignalSchema.get_flatten_hidden_fields(
+            dataset_version.feature_schema
+        )
+
     query = (
         DatasetQuery(name=name, version=version, catalog=catalog)
         .select(*columns)
@@ -30,7 +40,8 @@ def show(
         .offset(offset)
     )
     records = query.to_db_records()
-    show_records(records, collapse_columns=not no_collapse)
+    show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
+
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
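The `include_hidden` flag works by computing the flattened DB names of hidden fields from the dataset's serialized feature schema and handing them to `show_records`, which drops those columns from the preview. A minimal sketch of that flow; the `feature_schema` dict here is an illustrative hand-written stand-in (real values come from `dataset_version.feature_schema`):

```python
from datachain.lib.signal_schema import SignalSchema
from datachain.utils import show_records

# Illustrative serialized schema; the "File@v1" entry mirrors the
# _hidden_fields = ["version", "source"] default added in file.py below.
feature_schema = {
    "file": "File@v1",
    "_custom_types": {
        "File@v1": {
            "name": "File@v1",
            "fields": {"source": "str", "path": "str", "version": "str"},
            "bases": [],
            "hidden_fields": ["version", "source"],
        }
    },
}

hidden = SignalSchema.get_flatten_hidden_fields(feature_schema)
print(hidden)  # ['file__version', 'file__source']

records = [{"file__path": "a.jpg", "file__version": "abc", "file__source": "s3://bkt"}]
show_records(records, hidden_fields=hidden)  # hidden columns are dropped
```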
datachain/cli/parser/utils.py
CHANGED
datachain/lib/data_model.py
CHANGED
@@ -26,6 +26,7 @@ class DataModel(BaseModel):
     """Pydantic model wrapper that registers model with `DataChain`."""

     _version: ClassVar[int] = 1
+    _hidden_fields: ClassVar[list[str]] = []

     @classmethod
     def __pydantic_init_subclass__(cls):
@@ -41,6 +42,11 @@ class DataModel(BaseModel):
         for val in models:
             ModelStore.register(val)

+    @classmethod
+    def hidden_fields(cls) -> list[str]:
+        """Returns a list of fields that should be hidden from the user."""
+        return cls._hidden_fields
+

 def is_chain_type(t: type) -> bool:
     """Return true if type is supported by `DataChain`."""
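`_hidden_fields` gives model authors a declarative way to mark signals that display paths skip by default; the fields are still stored and queryable. A minimal sketch, assuming only the API shown above (the `Embedding` model is hypothetical):

```python
from typing import ClassVar

from datachain.lib.data_model import DataModel


class Embedding(DataModel):  # hypothetical model, for illustration only
    value: list[float]
    checksum: str

    # Skipped by default in show()/to_pandas(); include_hidden=True restores them.
    _hidden_fields: ClassVar[list[str]] = ["checksum"]


print(Embedding.hidden_fields())  # ['checksum']
```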
datachain/lib/dc.py
CHANGED
@@ -23,6 +23,7 @@ import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
+from tqdm import tqdm

 from datachain.dataset import DatasetRecord
 from datachain.func import literal
@@ -32,7 +33,14 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import
+from datachain.lib.file import (
+    EXPORT_FILES_MAX_THREADS,
+    ArrowRow,
+    File,
+    FileExporter,
+    FileType,
+    get_file_type,
+)
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
@@ -65,7 +73,6 @@ _T = TypeVar("_T")
 D = TypeVar("D", bound="DataChain")
 UDFObjT = TypeVar("UDFObjT", bound=UDFBase)

-
 DEFAULT_PARQUET_CHUNK_SIZE = 100_000


@@ -1050,7 +1057,7 @@ class DataChain:
     def select(self, *args: str, _sys: bool = True) -> "Self":
         """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
-        if _sys:
+        if self._sys and _sys:
             new_schema = SignalSchema({"sys": Sys}) | new_schema
         columns = new_schema.db_signals()
         return self._evolve(
@@ -1093,6 +1100,7 @@
         partition_by_columns: list[Column] = []
         signal_columns: list[Column] = []
         schema_fields: dict[str, DataType] = {}
+        keep_columns: list[str] = []

         # validate partition_by columns and add them to the schema
         for col in partition_by:
@@ -1100,10 +1108,13 @@
                 col_db_name = ColumnMeta.to_db_name(col)
                 col_type = self.signals_schema.get_column_type(col_db_name)
                 column = Column(col_db_name, python_to_sql(col_type))
+                if col not in keep_columns:
+                    keep_columns.append(col)
             elif isinstance(col, Function):
                 column = col.get_column(self.signals_schema)
                 col_db_name = column.name
                 col_type = column.type.python_type
+                schema_fields[col_db_name] = col_type
             else:
                 raise DataChainColumnError(
                     col,
@@ -1113,7 +1124,6 @@
                     ),
                 )
             partition_by_columns.append(column)
-            schema_fields[col_db_name] = col_type

         # validate signal columns and add them to the schema
         if not kwargs:
@@ -1128,9 +1138,13 @@
             signal_columns.append(column)
             schema_fields[col_name] = func.get_result_type(self.signals_schema)

+        signal_schema = SignalSchema(schema_fields)
+        if keep_columns:
+            signal_schema |= self.signals_schema.to_partial(*keep_columns)
+
         return self._evolve(
             query=self._query.group_by(signal_columns, partition_by_columns),
-            signal_schema=
+            signal_schema=signal_schema,
         )

     def mutate(self, **kwargs) -> "Self":
@@ -1225,23 +1239,37 @@
     @overload
     def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...

+    @overload
+    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+
     @overload
     def collect_flatten(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...

-
+    @overload
+    def collect_flatten(
+        self,
+        *,
+        row_factory: Callable[[list[str], tuple[Any, ...]], _T],
+        include_hidden: bool,
+    ) -> Iterator[_T]: ...
+
+    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.

         Args:
             row_factory : A callable to convert row to a custom format.
                 It should accept two arguments: a list of column names and
                 a tuple of row values.
+            include_hidden: Whether to include hidden signals from the schema.
         """
-        db_signals = self._effective_signals_schema.db_signals(
+        db_signals = self._effective_signals_schema.db_signals(
+            include_hidden=include_hidden
+        )
         with self._query.ordered_select(*db_signals).as_iterable() as rows:
             if row_factory:
-                rows = (row_factory(db_signals, r) for r in rows)
+                rows = (row_factory(db_signals, r) for r in rows)  # type: ignore[assignment]
             yield from rows

     def to_columnar_data_with_names(
@@ -1275,10 +1303,23 @@
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> list[_T]: ...

-
+    @overload
+    def results(
+        self,
+        *,
+        row_factory: Callable[[list[str], tuple[Any, ...]], _T],
+        include_hidden: bool,
+    ) -> list[_T]: ...
+
+    @overload
+    def results(self, *, include_hidden: bool) -> list[tuple[Any, ...]]: ...
+
+    def results(self, *, row_factory=None, include_hidden=True):  # noqa: D102
         if row_factory is None:
-            return list(self.collect_flatten())
-        return list(
+            return list(self.collect_flatten(include_hidden=include_hidden))
+        return list(
+            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+        )

     def to_records(self) -> list[dict[str, Any]]:
         """Convert every row to a dictionary."""
@@ -1788,21 +1829,25 @@
             **fr_map,
         )

-    def to_pandas(self, flatten=False) -> "pd.DataFrame":
+    def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.

         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
+            include_hidden : Whether to include hidden columns.
         """
         import pandas as pd

-        headers, max_length = self._effective_signals_schema.get_headers_with_length(
+        headers, max_length = self._effective_signals_schema.get_headers_with_length(
+            include_hidden=include_hidden
+        )
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
         else:
             columns = pd.MultiIndex.from_tuples(map(tuple, headers))

-
+        results = self.results(include_hidden=include_hidden)
+        return pd.DataFrame.from_records(results, columns=columns)

     def show(
         self,
@@ -1810,6 +1855,7 @@
         flatten=False,
         transpose=False,
         truncate=True,
+        include_hidden=False,
     ) -> None:
         """Show a preview of the chain results.

@@ -1818,11 +1864,12 @@
             flatten : Whether to use a multiindex or flatten column names.
             transpose : Whether to transpose rows and columns.
             truncate : Whether or not to truncate the contents of columns.
+            include_hidden : Whether to include hidden columns.
         """
         import pandas as pd

         dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
-        df = dc.to_pandas(flatten)
+        df = dc.to_pandas(flatten, include_hidden=include_hidden)

         if df.empty:
             print("Empty result")
@@ -2498,19 +2545,25 @@
         output: str,
         signal: str = "file",
         placement: FileExportPlacement = "fullpath",
-        use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
+        num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
+        anon: bool = False,
+        client_config: Optional[dict] = None,
     ) -> None:
-        """Export files from a specified signal to a directory.
+        """Export files from a specified signal to a directory. Files can be
+        exported to a local or cloud directory.

         Args:
             output: Path to the target directory for exporting files.
             signal: Name of the signal to export files from.
             placement: The method to use for naming exported files.
                 The possible values are: "filename", "etag", "fullpath", and "checksum".
-            use_cache: If `True`, cache the files before exporting.
             link_type: Method to use for exporting files.
                 Falls back to `'copy'` if symlinking fails.
+            num_threads : number of threads to use for exporting files.
+                By default it uses 5 threads.
+            anon: If true, we will treat cloud bucket as public one
+            client_config: Optional configuration for the destination storage client

         Example:
             Cross cloud transfer
@@ -2525,8 +2578,26 @@
         ):
             raise ValueError("Files with the same name found")

-
-
+        if anon:
+            client_config = (client_config or {}) | {"anon": True}
+
+        progress_bar = tqdm(
+            desc=f"Exporting files to {output}: ",
+            unit=" files",
+            unit_scale=True,
+            unit_divisor=10,
+            total=self.count(),
+            leave=False,
+        )
+        file_exporter = FileExporter(
+            output,
+            placement,
+            self._settings.cache if self._settings else False,
+            link_type,
+            max_threads=num_threads or 1,
+            client_config=client_config,
+        )
+        file_exporter.run(self.collect(signal), progress_bar)

     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
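`export_files` now wraps the copy work in a `tqdm` progress bar and delegates to the thread-pooled `FileExporter` from `file.py`; the removed `use_cache` parameter is replaced by the chain's own settings (`self._settings.cache`). A hedged usage sketch of the new parameters (the bucket URIs are hypothetical):

```python
from datachain.lib.dc import DataChain

chain = DataChain.from_storage("s3://sample-bucket/images/")
chain.export_files(
    "s3://sample-backup/images/",  # destination may be local or cloud
    signal="file",
    placement="filename",
    num_threads=5,  # EXPORT_FILES_MAX_THREADS, the default, is also 5
    anon=True,      # merges {"anon": True} into client_config
)
```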
datachain/lib/file.py
CHANGED
@@ -24,6 +24,7 @@ from pydantic import Field, field_validator
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
+from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO

@@ -43,6 +44,41 @@ logger = logging.getLogger("datachain")
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

 FileType = Literal["binary", "text", "image", "video"]
+EXPORT_FILES_MAX_THREADS = 5
+
+
+class FileExporter(NodesThreadPool):
+    """Class that does file exporting concurrently with thread pool"""
+
+    def __init__(
+        self,
+        output: str,
+        placement: ExportPlacement,
+        use_cache: bool,
+        link_type: Literal["copy", "symlink"],
+        max_threads: int = EXPORT_FILES_MAX_THREADS,
+        client_config: Optional[dict] = None,
+    ):
+        super().__init__(max_threads)
+        self.output = output
+        self.placement = placement
+        self.use_cache = use_cache
+        self.link_type = link_type
+        self.client_config = client_config
+
+    def done_task(self, done):
+        for task in done:
+            task.result()
+
+    def do_task(self, file):
+        file.export(
+            self.output,
+            self.placement,
+            self.use_cache,
+            link_type=self.link_type,
+            client_config=self.client_config,
+        )
+        self.increase_counter(1)


 class VFileError(DataChainError):
@@ -158,6 +194,7 @@ class File(DataModel):
         "last_modified": DateTime,
         "location": JSON,
     }
+    _hidden_fields: ClassVar[list[str]] = ["version", "source"]

     _unique_id_keys: ClassVar[list[str]] = [
         "source",
@@ -269,11 +306,15 @@
         with self.open(mode="r") as stream:
             return stream.read()

-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
-        client: Client = self._catalog.get_client(
-
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
+
+        if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
+            destination = Path(destination).absolute().as_uri()
+
+        client.upload(self.read(), destination)

     def _symlink_to(self, destination: str):
         if self.location:
@@ -296,13 +337,13 @@
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
+        client_config: Optional[dict] = None,
     ) -> None:
         """Export file to new location."""
-
-        self._caching_enabled = use_cache
+        self._caching_enabled = use_cache
         dst = self.get_destination_path(output, placement)
         dst_dir = os.path.dirname(dst)
-        client: Client = self._catalog.get_client(dst_dir)
+        client: Client = self._catalog.get_client(dst_dir, **(client_config or {}))
         client.fs.makedirs(dst_dir, exist_ok=True)

         if link_type == "symlink":
@@ -312,7 +353,7 @@
             if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                 raise

-        self.save(dst)
+        self.save(dst, client_config=client_config)

     def _set_stream(
         self,
@@ -498,11 +539,11 @@ class TextFile(File):
         with self.open() as stream:
             return stream.read()

-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)

-        client: Client = self._catalog.get_client(destination)
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
         with client.fs.open(destination, mode="w") as f:
             f.write(self.read_text())

@@ -515,11 +556,11 @@ class ImageFile(File):
         fobj = super().read()
         return PilImage.open(BytesIO(fobj))

-    def save(self, destination: str):
+    def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)

-        client: Client = self._catalog.get_client(destination)
+        client: Client = self._catalog.get_client(destination, **(client_config or {}))
         with client.fs.open(destination, mode="wb") as f:
             self.read().save(f)
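`FileExporter` maps each collected `File` to `File.export()` on a worker thread and bumps the shared counter that drives the progress bar. It is normally constructed by `DataChain.export_files()`; a direct-construction sketch with illustrative values:

```python
from datachain.lib.file import EXPORT_FILES_MAX_THREADS, FileExporter

exporter = FileExporter(
    "s3://sample-backup/",  # output directory (illustrative)
    "fullpath",             # placement
    False,                  # use_cache
    "copy",                 # link_type
    max_threads=EXPORT_FILES_MAX_THREADS,
    client_config={"anon": True},  # forwarded to File.export() / File.save()
)
# exporter.run(files, progress_bar) then exports each yielded File concurrently.
```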
datachain/lib/signal_schema.py
CHANGED
@@ -91,6 +91,7 @@ class CustomType(BaseModel):
     name: str
     fields: dict[str, str]
     bases: list[tuple[str, str, Optional[str]]]
+    hidden_fields: Optional[list[str]] = None

     @classmethod
     def deserialize(cls, data: dict[str, Any], type_name: str) -> "CustomType":
@@ -102,6 +103,7 @@ class CustomType(BaseModel):
                 "name": type_name,
                 "fields": data,
                 "bases": [],
+                "hidden_fields": [],
             }

         return cls(**data)
@@ -179,6 +181,16 @@ class SignalSchema:
             )
         return SignalSchema(signals)

+    @staticmethod
+    def _get_bases(fr: type) -> list[tuple[str, str, Optional[str]]]:
+        bases: list[tuple[str, str, Optional[str]]] = []
+        for base in fr.__mro__:
+            model_store_name = (
+                ModelStore.get_name(base) if issubclass(base, DataModel) else None
+            )
+            bases.append((base.__name__, base.__module__, model_store_name))
+        return bases
+
     @staticmethod
     def _serialize_custom_model(
         version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
@@ -196,14 +208,15 @@ class SignalSchema:
             assert field_type
             fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)

-        bases
-        for type_ in fr.__mro__:
-            model_store_name = (
-                ModelStore.get_name(type_) if issubclass(type_, DataModel) else None
-            )
-            bases.append((type_.__name__, type_.__module__, model_store_name))
+        bases = SignalSchema._get_bases(fr)

-        ct = CustomType(
+        ct = CustomType(
+            schema_version=2,
+            name=version_name,
+            fields=fields,
+            bases=bases,
+            hidden_fields=getattr(fr, "_hidden_fields", []),
+        )
         custom_types[version_name] = ct.model_dump()

         return version_name
@@ -384,6 +397,37 @@ class SignalSchema:

         return SignalSchema(signals)

+    @staticmethod
+    def get_flatten_hidden_fields(schema):
+        custom_types = schema.get("_custom_types", {})
+        if not custom_types:
+            return []
+
+        hidden_by_types = {
+            name: schema.get("hidden_fields", [])
+            for name, schema in custom_types.items()
+        }
+
+        hidden_fields = []
+
+        def traverse(prefix, schema_info):
+            for field, field_type in schema_info.items():
+                if field == "_custom_types":
+                    continue
+
+                if field_type in custom_types:
+                    hidden_fields.extend(
+                        f"{prefix}{field}__{f}" for f in hidden_by_types[field_type]
+                    )
+                    traverse(
+                        prefix + field + "__",
+                        custom_types[field_type].get("fields", {}),
+                    )
+
+        traverse("", schema)
+
+        return hidden_fields
+
     def to_udf_spec(self) -> dict[str, type]:
         res = {}
         for path, type_, has_subtree, _ in self.get_flat_tree():
@@ -479,7 +523,7 @@ class SignalSchema:
             raise SignalResolvingError([col_name], "is not found")

     def db_signals(
-        self, name: Optional[str] = None, as_columns=False
+        self, name: Optional[str] = None, as_columns=False, include_hidden: bool = True
     ) -> Union[list[str], list[Column]]:
         """
         Returns DB columns as strings or Column objects with proper types
@@ -489,7 +533,9 @@ class SignalSchema:
             DEFAULT_DELIMITER.join(path)
             if not as_columns
            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
-            for path, _type, has_subtree, _ in self.get_flat_tree(
+            for path, _type, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
             if not has_subtree
         ]

@@ -624,19 +670,31 @@ class SignalSchema:
             for name, val in values.items()
         }

-    def get_flat_tree(
-
+    def get_flat_tree(
+        self, include_hidden: bool = True
+    ) -> Iterator[tuple[list[str], DataType, bool, int]]:
+        yield from self._get_flat_tree(self.tree, [], 0, include_hidden)

     def _get_flat_tree(
-        self, tree: dict, prefix: list[str], depth: int
+        self, tree: dict, prefix: list[str], depth: int, include_hidden: bool
     ) -> Iterator[tuple[list[str], DataType, bool, int]]:
         for name, (type_, substree) in tree.items():
             suffix = name.split(".")
             new_prefix = prefix + suffix
+            hidden_fields = getattr(type_, "_hidden_fields", None)
+            if hidden_fields and substree and not include_hidden:
+                substree = {
+                    field: info
+                    for field, info in substree.items()
+                    if field not in hidden_fields
+                }
+
             has_subtree = substree is not None
             yield new_prefix, type_, has_subtree, depth
             if substree is not None:
-                yield from self._get_flat_tree(
+                yield from self._get_flat_tree(
+                    substree, new_prefix, depth + 1, include_hidden
+                )

     def print_tree(self, indent: int = 4, start_at: int = 0):
         for path, type_, _, depth in self.get_flat_tree():
@@ -649,9 +707,13 @@ class SignalSchema:
             sub_schema = SignalSchema({"* list of": args[0]})
             sub_schema.print_tree(indent=indent, start_at=total_indent + indent)

-    def get_headers_with_length(self):
+    def get_headers_with_length(self, include_hidden: bool = True):
         paths = [
-            path
+            path
+            for path, _, has_subtree, _ in self.get_flat_tree(
+                include_hidden=include_hidden
+            )
+            if not has_subtree
         ]
         max_length = max([len(path) for path in paths], default=0)
         return [
@@ -749,3 +811,120 @@ class SignalSchema:
             res[name] = (anno, subtree)  # type: ignore[assignment]

         return res
+
+    def to_partial(self, *columns: str) -> "SignalSchema":
+        """
+        Convert the schema to a partial schema with only the specified columns.
+
+        E.g. if original schema is:
+
+        ```
+        signal: Foo@v1
+            name: str
+            value: float
+        count: int
+        ```
+
+        Then `to_partial("signal.name", "count")` will return a partial schema:
+
+        ```
+        signal: FooPartial@v1
+            name: str
+        count: int
+        ```
+
+        Note that partial schema will have a different name for the custom types
+        (e.g. `FooPartial@v1` instead of `Foo@v1`) to avoid conflicts
+        with the original schema.
+
+        Args:
+            *columns (str): The columns to include in the partial schema.
+
+        Returns:
+            SignalSchema: The new partial schema.
+        """
+        serialized = self.serialize()
+        custom_types = serialized.get("_custom_types", {})
+
+        schema: dict[str, Any] = {}
+        schema_custom_types: dict[str, CustomType] = {}
+
+        data_model_bases: Optional[list[tuple[str, str, Optional[str]]]] = None
+
+        signal_partials: dict[str, str] = {}
+        partial_versions: dict[str, int] = {}
+
+        def _type_name_to_partial(signal_name: str, type_name: str) -> str:
+            if "@" not in type_name:
+                return type_name
+            model_name, _ = ModelStore.parse_name_version(type_name)
+
+            if signal_name not in signal_partials:
+                partial_versions.setdefault(model_name, 0)
+                partial_versions[model_name] += 1
+                version = partial_versions[model_name]
+                signal_partials[signal_name] = f"{model_name}Partial{version}"
+
+            return signal_partials[signal_name]
+
+        for column in columns:
+            parent_type, parent_type_partial = "", ""
+            column_parts = column.split(".")
+            for i, signal in enumerate(column_parts):
+                if i == 0:
+                    if signal not in serialized:
+                        raise SignalSchemaError(
+                            f"Column {column} not found in the schema"
+                        )
+
+                    parent_type = serialized[signal]
+                    parent_type_partial = _type_name_to_partial(signal, parent_type)
+
+                    schema[signal] = parent_type_partial
+                    continue
+
+                if parent_type not in custom_types:
+                    raise SignalSchemaError(
+                        f"Custom type {parent_type} not found in the schema"
+                    )
+
+                custom_type = custom_types[parent_type]
+                signal_type = custom_type["fields"].get(signal)
+                if not signal_type:
+                    raise SignalSchemaError(
+                        f"Field {signal} not found in custom type {parent_type}"
+                    )
+
+                partial_type = _type_name_to_partial(
+                    ".".join(column_parts[: i + 1]),
+                    signal_type,
+                )
+
+                if parent_type_partial in schema_custom_types:
+                    schema_custom_types[parent_type_partial].fields[signal] = (
+                        partial_type
+                    )
+                else:
+                    if data_model_bases is None:
+                        data_model_bases = SignalSchema._get_bases(DataModel)
+
+                    partial_type_name, _ = ModelStore.parse_name_version(partial_type)
+                    schema_custom_types[parent_type_partial] = CustomType(
+                        schema_version=2,
+                        name=partial_type_name,
+                        fields={signal: partial_type},
+                        bases=[
+                            (partial_type_name, "__main__", partial_type),
+                            *data_model_bases,
+                        ],
+                    )
+
+                parent_type, parent_type_partial = signal_type, partial_type
+
+        if schema_custom_types:
+            schema["_custom_types"] = {
+                type_name: ct.model_dump()
+                for type_name, ct in schema_custom_types.items()
+            }
+
+        return SignalSchema.deserialize(schema)
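Two of these additions are the backbone of the hidden-fields feature and the `group_by` fix in `dc.py`: `get_flat_tree(include_hidden=False)` prunes `_hidden_fields` leaves while flattening, and `to_partial()` builds a reduced schema so partition columns keep their types. A small sketch of `to_partial` against the `File` model; the output is the expected shape per the docstring above, not verified on every version:

```python
from datachain.lib.file import File
from datachain.lib.signal_schema import SignalSchema

schema = SignalSchema({"file": File, "count": int})
partial = schema.to_partial("file.path", "count")

# "file" now resolves to a generated FilePartial type carrying only "path";
# plain columns such as "count" pass through unchanged.
print(partial.db_signals())  # expected: ['file__path', 'count']
```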
datachain/nodes_thread_pool.py
CHANGED
@@ -57,6 +57,9 @@ class NodesThreadPool(ABC):
         self._max_threads = max_threads
         self._thread_counter = 0
         self._thread_lock = threading.Lock()
+        self.tasks = set()
+        self.canceled = False
+        self.th_pool = None

     def run(
         self,
@@ -64,37 +67,55 @@
         progress_bar=None,
     ):
         results = []
-
-
+        self.th_pool = concurrent.futures.ThreadPoolExecutor(self._max_threads)
+        try:
             self._thread_counter = 0
             for chunk in chunk_gen:
-
+                if self.canceled:
+                    break
+                while len(self.tasks) >= self._max_threads:
                     done, _ = concurrent.futures.wait(
-                        tasks, timeout=1, return_when="FIRST_COMPLETED"
+                        self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                     )
                     self.done_task(done)

-                    tasks = tasks - done
+                    self.tasks = self.tasks - done
                     self.update_progress_bar(progress_bar)

-                tasks.add(th_pool.submit(self.do_task, chunk))
+                self.tasks.add(self.th_pool.submit(self.do_task, chunk))
                 self.update_progress_bar(progress_bar)

-            while tasks:
+            while self.tasks:
+                if self.canceled:
+                    break
                 done, _ = concurrent.futures.wait(
-                    tasks, timeout=1, return_when="FIRST_COMPLETED"
+                    self.tasks, timeout=1, return_when="FIRST_COMPLETED"
                 )
                 task_results = self.done_task(done)
                 if task_results:
                     results.extend(task_results)

-                tasks = tasks - done
+                self.tasks = self.tasks - done
                 self.update_progress_bar(progress_bar)
-
-
+        except:
+            self.cancel_all()
+            raise
+        else:
+            self.th_pool.shutdown()

         return results

+    def cancel_all(self):
+        self.cancel = True
+        # Canceling tasks just in case any of them is scheduled to run.
+        # Note that running tasks cannot be canceled, instead we will wait for
+        # them to finish when shutting down thread loop executor by calling
+        # shutdown() method.
+        for task in self.tasks:
+            task.cancel()
+        if self.th_pool:
+            self.th_pool.shutdown()  # this will wait for running tasks to finish
+
     def update_progress_bar(self, progress_bar):
         if progress_bar is not None:
             with self._thread_lock:
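`run()` now keeps the task set and executor on `self`, so a failure surfaced by `done_task()` triggers `cancel_all()`, which cancels pending futures and waits for running ones via `shutdown()`. (Note the run loop checks `self.canceled` while `cancel_all()` assigns `self.cancel`.) Subclasses implement `do_task`/`done_task`; a minimal hypothetical sketch:

```python
from datachain.nodes_thread_pool import NodesThreadPool


class Printer(NodesThreadPool):  # hypothetical subclass, for illustration
    def do_task(self, chunk):
        # Runs on a worker thread for each item the input generator yields.
        print(chunk)
        self.increase_counter(1)

    def done_task(self, done):
        for task in done:
            task.result()  # re-raises worker errors so run() cancels the rest


pool = Printer(2)
pool.run(["a", "b", "c"])
```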
datachain/script_meta.py
ADDED
@@ -0,0 +1,147 @@
+import re
+from dataclasses import dataclass
+from typing import Any, Optional
+
+try:
+    import tomllib
+except ModuleNotFoundError:
+    # tomllib is in standard library from python 3.11 so for earlier versions
+    # we need tomli
+    import tomli as tomllib  # type: ignore[no-redef]
+
+
+class ScriptConfigParsingError(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+@dataclass
+class ScriptConfig:
+    """
+    Class that is parsing inline script metadata to get some basic information for
+    running datachain script like python version, dependencies, attachments etc.
+    Inline script metadata must follow the format described in https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.
+    Example of script with inline metadata:
+        # /// script
+        # requires-python = ">=3.12"
+        #
+        # dependencies = [
+        #     "pandas < 2.1.0",
+        #     "numpy == 1.26.4"
+        # ]
+        #
+        # [tools.datachain.workers]
+        # num_workers = 3
+        #
+        # [tools.datachain.attachments]
+        # image1 = "s3://ldb-public/image1.jpg"
+        # file1 = "s3://ldb-public/file.pdf"
+        #
+        # [tools.datachain.params]
+        # min_length_sec = 1
+        # cache = false
+        #
+        # [tools.datachain.inputs]
+        # threshold = 0.5
+        # start_ds_name = "ds://start"
+        #
+        # [tools.datachain.outputs]
+        # result_dataset = "ds://res"
+        # result_dir = "/temp"
+        #
+        # ///
+
+        import sys
+        import pandas as pd
+
+        print(f"Python version: {sys.version_info}")
+        print(f"Pandas version: {pd.__version__}")
+
+    """
+
+    python_version: Optional[str]
+    dependencies: list[str]
+    attachments: dict[str, str]
+    params: dict[str, Any]
+    inputs: dict[str, Any]
+    outputs: dict[str, Any]
+    num_workers: Optional[int] = None
+
+    def __init__(
+        self,
+        python_version: Optional[str] = None,
+        dependencies: Optional[list[str]] = None,
+        attachments: Optional[dict[str, str]] = None,
+        params: Optional[dict[str, Any]] = None,
+        inputs: Optional[dict[str, Any]] = None,
+        outputs: Optional[dict[str, Any]] = None,
+        num_workers: Optional[int] = None,
+    ):
+        self.python_version = python_version
+        self.dependencies = dependencies or []
+        self.attachments = attachments or {}
+        self.params = params or {}
+        self.inputs = inputs or {}
+        self.outputs = outputs or {}
+        self.num_workers = num_workers
+
+    def get_param(self, name: str, default: Any) -> Any:
+        return self.params.get(name, default)
+
+    def get_input(self, name: str, default: Any) -> Any:
+        return self.inputs.get(name, default)
+
+    def get_output(self, name: str, default: Any) -> Any:
+        return self.outputs.get(name, default)
+
+    def get_attachment(self, name: str, default: Any) -> Any:
+        return self.attachments.get(name, default)
+
+    @staticmethod
+    def read(script: str) -> Optional[dict]:
+        """Converts inline script metadata to dict with all found data"""
+        regex = (
+            r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
+            "(?P<content>(?:^#(?:| .*)$[\r\n|\r|\n])+)^# \\/\\/\\/[ \t]*$"
+        )
+        name = "script"
+        matches = list(
+            filter(lambda m: m.group("type") == name, re.finditer(regex, script))
+        )
+        if len(matches) > 1:
+            raise ValueError(f"Multiple {name} blocks found")
+        if len(matches) == 1:
+            content = "".join(
+                line[2:] if line.startswith("# ") else line[1:]
+                for line in matches[0].group("content").splitlines(keepends=True)
+            )
+            return tomllib.loads(content)
+        return None
+
+    @staticmethod
+    def parse(script: str) -> Optional["ScriptConfig"]:
+        """
+        Method that is parsing inline script metadata from datachain script and
+        instantiating ScriptConfig class with found data. If no inline metadata is
+        found, it returns None
+        """
+        try:
+            meta = ScriptConfig.read(script)
+            if not meta:
+                return None
+            custom = meta.get("tools", {}).get("datachain", {})
+            return ScriptConfig(
+                python_version=meta.get("requires-python"),
+                dependencies=meta.get("dependencies"),
+                num_workers=custom.get("workers", {}).get("num_workers"),
+                attachments=custom.get("attachments"),
+                params={k: str(v) for k, v in custom.get("params").items()}
+                if custom.get("params")
+                else None,
+                inputs=custom.get("inputs"),
+                outputs=custom.get("outputs"),
+            )
+        except Exception as e:
+            raise ScriptConfigParsingError(
+                f"Error when parsing script meta: {e}"
+            ) from e
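`ScriptConfig.read()` locates the `# /// script … # ///` block with a regex, strips the leading `#` from each line, and parses the remainder as TOML; `parse()` then maps the recognized keys onto the dataclass (note that params values are stringified). A runnable sketch against the documented format:

```python
from datachain.script_meta import ScriptConfig

script = '''\
# /// script
# requires-python = ">=3.12"
# dependencies = ["pandas < 2.1.0"]
#
# [tools.datachain.params]
# min_length_sec = 1
# ///
print("hello")
'''

config = ScriptConfig.parse(script)
assert config is not None
print(config.python_version)                     # >=3.12
print(config.dependencies)                       # ['pandas < 2.1.0']
print(config.get_param("min_length_sec", None))  # '1' (params are stringified)
```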
datachain/utils.py
CHANGED
@@ -362,6 +362,7 @@ def show_records(
     records: Optional[list[dict]],
     collapse_columns: bool = False,
     system_columns: bool = False,
+    hidden_fields: Optional[list[str]] = None,
 ) -> None:
     import pandas as pd

@@ -369,6 +370,8 @@ def show_records(
         return

     df = pd.DataFrame.from_records(records)
+    if hidden_fields:
+        df = df.drop(columns=hidden_fields, errors="ignore")
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)

{datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: datachain
-Version: 0.
+Version: 0.11.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
 Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
@@ -49,6 +50,7 @@ Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
 Requires-Dist: tabulate
 Requires-Dist: websockets
+Requires-Dist: tomli; python_version < "3.11"
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -70,9 +72,8 @@ Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
 Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
 Provides-Extra: video
-Requires-Dist: av<14; extra == "video"
 Requires-Dist: ffmpeg-python; extra == "video"
-Requires-Dist: imageio[ffmpeg]; extra == "video"
+Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
 Provides-Extra: tests
 Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
-Requires-Dist: ultralytics==8.3.
+Requires-Dist: ultralytics==8.3.82; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"

 ================
{datachain-0.10.0.dist-info → datachain-0.11.11.dist-info}/RECORD
CHANGED
@@ -9,17 +9,18 @@ datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=HNB-xeKA6aUA-HTWr--H22S6jVOxP2OVQ-3d07ISqAk,7109
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
 datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
-datachain/nodes_thread_pool.py,sha256=
+datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2SmM,3989
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/studio.py,sha256=Coo_6murSjh-RypiHDWNsVXGmfsopyMPCpPS1sA6uUc,9844
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256
+datachain/utils.py,sha256=-vhV9LMUcUxDSBmyeJH4WJcfLTO416usD6igXS8c49k,14563
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
 datachain/catalog/catalog.py,sha256=xZC6drw4opoYcxTTiAFv6nbhNOzBb-UZZ_VqY9dqdIs,59458
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
-datachain/cli/__init__.py,sha256=
+datachain/cli/__init__.py,sha256=OLoDOYm7M23bLdMJhw3_GsJDGPl8pWYzcjpwgxEdFDs,8326
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
 datachain/cli/commands/datasets.py,sha256=865ui6q4UVPbL_-jk18C-lYi_bGMlh7XhfRaHbbNyhk,5796
@@ -28,11 +29,11 @@ datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR
 datachain/cli/commands/ls.py,sha256=Wb8hXyBwyhb62Zk6ZhNFPFrj2lJhdbRcnBQQkgL_qyw,5174
 datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
 datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
-datachain/cli/commands/show.py,sha256=
+datachain/cli/commands/show.py,sha256=d-DDw4hA3TWA2vqIS-FkEXrzqvttcTdh2QPaahtLdy0,1445
 datachain/cli/parser/__init__.py,sha256=rtjlqSsDd4LZH9WdgvluO27M4sID1wD7YkQ4cKhNXzw,15721
 datachain/cli/parser/job.py,sha256=kvQkSfieyUmvJpOK8p78UgS8sygHhQXztRlOtVcgtaU,3449
 datachain/cli/parser/studio.py,sha256=4HEE1K93WDJxMLfgqAA4mHdigpSzC7SLUx-qPF0NgYQ,3254
-datachain/cli/parser/utils.py,sha256=
+datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -66,10 +67,10 @@ datachain/func/window.py,sha256=0MB1yjpVbwOrl_WNLZ8V3jkJz3o0XlYinpAcZQJuxiA,1688
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
-datachain/lib/data_model.py,sha256=
+datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=XU4VmRjm7CR37YuEKMhtU_DGxb1a7agXoNVU5WsaLRc,97772
+datachain/lib/file.py,sha256=LwpRWsDvO3ZvUBAtS29mFotp_arfEy-HhPQ0jaL_2Rc,29006
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
@@ -78,7 +79,7 @@ datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=QxXBhrn2-D0RiFA2rdxZ7wKMxyuQ0WWHKfiFEWAA760,7710
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=WyVTXUsa4DVTIZRAX2-MdjOe4deat_Fufsd9n8ycrXQ,33629
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=TlvikKTFvkIKaqqSkSriOyXhQ0rwRHV2ZRs1LHZOCmo,16107
@@ -135,9 +136,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.11.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.11.11.dist-info/METADATA,sha256=iF194pmsP-vh7ITTJG62w-VbTQbWGDckY-GJfempDBg,11267
+datachain-0.11.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+datachain-0.11.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.11.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.11.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|