datachain 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- datachain/catalog/catalog.py +13 -91
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/lib/arrow.py +22 -6
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +149 -35
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +4 -4
- datachain/lib/signal_schema.py +5 -2
- datachain/lib/webdataset.py +1 -1
- datachain/node.py +13 -0
- datachain/query/dataset.py +25 -87
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/METADATA +14 -14
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/RECORD +28 -26
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/WHEEL +1 -1
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/LICENSE +0 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.8.dist-info → datachain-0.3.10.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -156,8 +156,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]


 class DatasetRowsFetcher(NodesThreadPool):
@@ -1020,20 +1018,6 @@ class Catalog:

         return node_groups

-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1281,6 @@ class Catalog:

         return self.get_dataset(name)

-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1393,18 @@ class Catalog:

         return direct_dependencies

-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d

     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))

         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
@@ -1560,17 +1532,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)

         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name

-        schema = SignalSchema.deserialize(
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1641,15 +1604,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1883,10 +1837,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1914,9 +1864,8 @@ class Catalog:
             C.size > 1000
         )
         """
-        from datachain.query.dataset import ExecutionResult

-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
            dir=os.getcwd(), suffix=".py", delete=False
        )
        _, feature_module = os.path.split(feature_file.name)
@@ -1931,11 +1880,7 @@ class Catalog:
                feature_module,
                output_hook,
                params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
                save,
-                save_as,
                job_id,
            )
        finally:
@@ -1964,25 +1909,18 @@ class Catalog:
        )

        try:
-
+            result = json.loads(response_text)
        except ValueError:
-
-            exec_result = ExecutionResult(**response)
+            result = None

        dataset: Optional[DatasetRecord] = None
        version: Optional[int] = None
-        if save
+        if save:
            dataset, version = self.save_result(
-                query_script,
+                query_script, result, output, version, job_id
            )

-        return QueryResult(
-            dataset=dataset,
-            version=version,
-            output=output,
-            preview=exec_result.preview,
-            metrics=exec_result.metrics,
-        )
+        return QueryResult(dataset=dataset, version=version, output=output)

    def run_query(
        self,
@@ -1994,11 +1932,7 @@ class Catalog:
        feature_module: str,
        output_hook: Callable[[str], None],
        params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
        save: bool,
-        save_as: Optional[str],
        job_id: Optional[str],
    ) -> tuple[list[str], subprocess.Popen, str]:
        try:
@@ -2013,10 +1947,6 @@ class Catalog:
            raise QueryScriptCompileError(
                f"Query script failed to compile, reason: {exc}"
            ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
        r, w = os.pipe()
        if os.name == "nt":
            import msvcrt
@@ -2039,15 +1969,7 @@ class Catalog:
            {
                "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                "PYTHONUNBUFFERED": "1",
                "DATACHAIN_OUTPUT_FD": str(handle),
                "DATACHAIN_JOB_ID": job_id or "",
@@ -2077,12 +1999,12 @@ class Catalog:
        return lines, proc, response_text

    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result
+        if not exec_result:
            raise QueryScriptDatasetNotFound(
                "No dataset found after running Query script",
                output=output,
            )
-        name, version = exec_result
+        name, version = exec_result
        # finding returning dataset
        try:
            dataset = self.get_dataset(name)
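To illustrate the new include_listing flag on Catalog.ls_datasets, here is a hedged sketch (not part of the release diff; get_catalog is assumed to be the usual entry point for obtaining a Catalog in a local setup):

# Sketch: how include_listing changes what ls_datasets() yields.
from datachain.catalog import get_catalog

catalog = get_catalog()

# Default behaviour: implicit bucket-listing datasets are filtered out.
user_datasets = [d.name for d in catalog.ls_datasets()]

# With the new flag, listing datasets (lst__... names) are yielded as well.
all_datasets = [d.name for d in catalog.ls_datasets(include_listing=True)]

print(sorted(set(all_datasets) - set(user_datasets)))  # listing-only datasets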
datachain/cli.py
CHANGED
@@ -14,6 +14,7 @@ import shtab

 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")


-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-
-
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


 def format_ls_entry(entry: str) -> str:
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records

     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@ def query(
     )

     try:
-
+        catalog.query(
            script_content,
            python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
            capture_output=False,
            params=params,
            job_id=job_id,
@@ -864,10 +840,7 @@ def query(
            error_stack=error_stack,
        )
        raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)


 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
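The `datachain ls` path with no sources now goes through DataChain.listings() instead of the removed ls_indexed_storages helper. A minimal standalone sketch mirroring the new else-branch of ls_local() above (the printed output format is illustrative):

from datachain.lib.dc import DataChain

for ls in DataChain.listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")  # e.g. "s3://my-bucket@v1"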
datachain/client/fsspec.py
CHANGED
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

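With the new import and branch in place, Client.get_implementation() can resolve hf:// URLs. A small hedged check (the URL is illustrative, not a real dataset):

from datachain.client.fsspec import Client

impl = Client.get_implementation("hf://datasets/some-org/some-dataset/train.csv")
print(impl.__name__)  # expected: "HfClient" after this change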
datachain/client/hf.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
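A hedged usage sketch of the new client: HF_TOKEN is picked up by HfClient.create_fs(), and hf:// URIs become usable wherever other storage URIs are accepted. DataChain.from_storage is assumed to be the usual entry point; the dataset path and token value are placeholders:

import os

from datachain.lib.dc import DataChain

# Optional: private repos need a token; create_fs() reads it from the environment.
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder value

chain = DataChain.from_storage("hf://datasets/some-org/some-dataset/")
for file in chain.limit(3).collect("file"):
    print(file.path)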
datachain/data_storage/metastore.py
CHANGED
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
         This method should be called when index operation is finished.
         """

-    @abstractmethod
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-
     @abstractmethod
     def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
         """Updates last inserted datetime in bucket with current time."""

-    @abstractmethod
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-
     @abstractmethod
     def get_storage(self, uri: StorageURI) -> Storage:
         """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
         E.g. if s3 is used as storage this would be s3 bucket data.
         """

-    @abstractmethod
-    def list_storages(self) -> list[Storage]:
-        """Returns all storages."""
-
     @abstractmethod
     def mark_storage_pending(self, storage: Storage) -> Storage:
         """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
             self.add_dataset_dependency(
                 source_dataset_name,
                 source_dataset_version,
-                dependency.
+                dependency.dataset_name,
                 int(dependency.version),
             )
         else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
             self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
         )

-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-        s = self._storages
-        yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
-
     def get_storage(self, uri: StorageURI, conn=None) -> Storage:
         """
         Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):

         return self.storage_class._make(result)

-    def list_storages(self) -> list[Storage]:
-        result = self.db.execute(self._storages_select())
-        if not result:
-            return []
-
-        return [self.storage_class._make(r) for r in result]
-
     def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
         # Update status to pending and dates
         updates = {
@@ -1503,7 +1476,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return self._jobs.update().where(*where)

     def _parse_job(self, rows) -> Job:
-        return
+        return self.job_class.parse(*rows)

     def _parse_jobs(self, rows) -> Iterator["Job"]:
         for _, g in groupby(rows, lambda r: r[0]):
datachain/data_storage/sqlite.py
CHANGED
@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         db.execute("PRAGMA synchronous = NORMAL")
         db.execute("PRAGMA case_sensitive_like = ON")
         if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
-
+            import sys
+
+            db.set_trace_callback(sys.stderr.write)

         load_usearch_extension(db)

@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _datasets_dependencies_insert(self) -> "Insert":
         return sqlite.insert(self._datasets_dependencies)

-    #
-    # Storages
-    #
-
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-        self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
-
     #
     # Dataset dependencies
     #
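The DEBUG_SHOW_SQL_QUERIES branch now installs a real trace callback. The same standard-library mechanism, shown in isolation: every executed statement is written to stderr.

import sqlite3
import sys

db = sqlite3.connect(":memory:")
db.set_trace_callback(sys.stderr.write)  # same callback the diff installs
db.execute("CREATE TABLE t (x INTEGER)")
db.execute("INSERT INTO t VALUES (1)")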
datachain/data_storage/warehouse.py
CHANGED
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
         results = None
         offset = 0
         num_yielded = 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        finally:
-            # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
-            # finalized or reset) to prevent database table is locked error when an
-            # exception is raised in the middle of processing the results (e.g.
-            # https://github.com/iterative/dvcx/issues/924). Connections close
-            # apparently is not enough in some cases, at least on sqlite
-            # https://www.sqlite.org/c3ref/close.html
-            if results and hasattr(results, "close"):
-                results.close()
+
+        while True:
+            if limit is not None:
+                limit -= num_yielded
+                if limit == 0:
+                    break
+                if limit < page_size:
+                    paginated_query = paginated_query.limit(None).limit(limit)
+
+            results = self.dataset_rows_select(paginated_query.offset(offset))
+
+            processed = False
+            for row in results:
+                processed = True
+                yield row
+                num_yielded += 1
+
+            if not processed:
+                break  # no more results
+            offset += page_size

     #
     # Table Name Internal Functions
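The rewritten loop replaces the old try/finally cursor handling with explicit pagination: fetch a page, stop when an optional limit is exhausted or a page comes back empty. A simplified, self-contained sketch of the same pattern (fetch_page stands in for dataset_rows_select; this is not the exact method):

from typing import Callable, Iterator, Optional, Sequence

def paginate(
    fetch_page: Callable[[int, int], Sequence],
    page_size: int,
    limit: Optional[int] = None,
) -> Iterator:
    offset = 0
    num_yielded = 0
    while True:
        # shrink the page when an overall limit is about to be reached
        remaining = page_size if limit is None else min(page_size, limit - num_yielded)
        if remaining <= 0:
            break
        rows = fetch_page(offset, remaining)
        if not rows:
            break  # no more results
        for row in rows:
            yield row
            num_yielded += 1
        offset += page_size

data = list(range(23))
print(list(paginate(lambda off, n: data[off : off + n], page_size=10, limit=15)))
# -> the first 15 items, fetched as pages of 10 and 5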
datachain/dataset.py
CHANGED
@@ -11,8 +11,6 @@ from typing import (
 )
 from urllib.parse import urlparse

-from dateutil.parser import isoparse
-
 from datachain.client import Client
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

@@ -25,6 +23,7 @@ DD = TypeVar("DD", bound="DatasetDependency")

 DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
+LISTING_PREFIX = "lst__"


 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
@@ -72,11 +71,22 @@ class DatasetDependencyType:
 class DatasetDependency:
     id: int
     type: str
-    name: str
-    version: str  #
+    name: str
+    version: str  # TODO change to int
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]

+    @property
+    def dataset_name(self) -> str:
+        """Returns clean dependency dataset name"""
+        from datachain.lib.listing import parse_listing_uri
+
+        if self.type == DatasetDependencyType.DATASET:
+            return self.name
+
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        return list_dataset_name
+
     @classmethod
     def parse(
         cls: builtins.type[DD],
@@ -91,33 +101,31 @@ class DatasetDependency:
         dataset_version_created_at: Optional[datetime],
         bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # removing them from tables so that we can still have references
-        return None
+        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+
+        if not dataset_id:
+            return None
+
+        assert dataset_name is not None
+        dependency_type = DatasetDependencyType.DATASET
+        dependency_name = dataset_name
+
+        if is_listing_dataset(dataset_name):
+            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
+            dependency_name = listing_uri_from_name(dataset_name)
+
+        return cls(
+            id,
+            dependency_type,
+            dependency_name,
+            (
+                str(dataset_version)  # type: ignore[arg-type]
+                if dataset_version
+                else None
+            ),
+            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            [],
+        )

     @property
     def is_dataset(self) -> bool:
@@ -443,7 +451,11 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
-
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )

     @property
     def versions_values(self) -> list[int]: