datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of datachain has been flagged as potentially problematic.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -9,7 +9,6 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -77,7 +76,6 @@ from datachain.utils import (
 )

 from .datasource import DataSource
-from .subclass import SubclassFinder

 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +90,6 @@ logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]

 TTL_INT = 4 * 60 * 60

@@ -156,8 +153,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]


 class DatasetRowsFetcher(NodesThreadPool):
@@ -571,12 +566,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""


-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -660,33 +649,12 @@ class Catalog:
             ),
         ]
         code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast

-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
+    def compile_query_script(self, script: str) -> str:
         code_ast = ast.parse(script)
         code_ast = self.attach_query_wrapper(code_ast)
-
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
+        return ast.unparse(code_ast)

     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
@@ -1020,20 +988,6 @@ class Catalog:

         return node_groups

-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1251,6 @@ class Catalog:

         return self.get_dataset(name)

-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1363,18 @@ class Catalog:

         return direct_dependencies

-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d

     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))

         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
@@ -1444,7 +1386,8 @@ class Catalog:

         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None)
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )

     def ls_dataset_rows(
@@ -1632,15 +1575,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1874,10 +1808,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1905,34 +1835,25 @@ class Catalog:
             C.size > 1000
         )
         """
-
-
-
-
-
-
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
-                save,
-                save_as,
-                job_id,
+        if not job_id:
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            job_id = self.metastore.create_job(
+                name="",
+                query=query_script,
+                params=params,
+                python_version=python_version,
             )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)

+        lines, proc = self.run_query(
+            python_executable or sys.executable,
+            query_script,
+            envs,
+            capture_output,
+            output_hook,
+            params,
+            save,
+            job_id,
+        )
         output = "".join(lines)

         if proc.returncode:
@@ -1942,105 +1863,69 @@ class Catalog:
                 return_code=proc.returncode,
                 output=output,
             )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
             raise QueryScriptRunError(
                 f"Query script exited with error code {proc.returncode}",
                 return_code=proc.returncode,
                 output=output,
             )

+        def _get_dataset_versions_by_job_id():
+            for dr, dv, job in self.list_datasets_versions():
+                if job and str(job.id) == job_id:
+                    yield dr, dv
+
         try:
-
-
-            response = {}
-            exec_result = ExecutionResult(**response)
-
-            dataset: Optional[DatasetRecord] = None
-            version: Optional[int] = None
-            if save or save_as:
-                dataset, version = self.save_result(
-                    query_script, exec_result, output, version, job_id
+            dr, dv = max(
+                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
             )
+        except ValueError as e:
+            if not save:
+                return QueryResult(dataset=None, version=None, output=output)

-
-
-
-
-
-
+            raise QueryScriptDatasetNotFound(
+                "No dataset found after running Query script",
+                output=output,
+            ) from e
+
+        dr = self.update_dataset(
+            dr,
+            script_output=output,
+            query_script=query_script,
         )
+        self.update_dataset_version_with_warehouse_info(
+            dr,
+            dv.version,
+            script_output=output,
+            query_script=query_script,
+            job_id=job_id,
+            is_job_result=True,
+        )
+        return QueryResult(dataset=dr, version=dv.version, output=output)

     def run_query(
         self,
         python_executable: str,
         query_script: str,
         envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
         capture_output: bool,
-        feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen
+    ) -> tuple[list[str], subprocess.Popen]:
         try:
-
-            query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
+            query_script_compiled = self.compile_query_script(query_script)
         except Exception as exc:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
-        else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
         envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
         envs.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
@@ -2051,52 +1936,12 @@ class Catalog:
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
             text=False,
-            **kwargs,
         ) as proc:
-            os.close(w)
-
             out = proc.stdout
             _lines: list[str] = []
             ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-
-            response_text = ""
-            while proc.poll() is None:
-                response_text += f.readline()
-                time.sleep(0.1)
-            response_text += f.readline()
-            return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            )
-        name, version = exec_result.dataset
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
+            with ctx as lines:
+                return lines, proc

     def cp(
         self,
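Taken together, the catalog.py changes drop the feature-module and preview plumbing: QueryResult now carries only dataset, version, and output; a job record is created up front when no job_id is passed; and the resulting dataset is resolved through that job id. The following is only a usage sketch under those assumptions (the get_catalog entry point and the example script are illustrative, not taken from this diff):

from datachain.catalog import get_catalog  # assumed helper for obtaining a Catalog

# Illustrative query script: it ends by saving a dataset, which the reworked
# Catalog.query() then looks up by the job id it created for this run.
script = """
from datachain.lib.dc import DataChain

DataChain.from_values(num=[1, 2, 3]).save("my-dataset")
"""

catalog = get_catalog()
result = catalog.query(script, save=True)
print(result.dataset.name if result.dataset else None, result.version)
print(result.output)  # combined stdout/stderr captured from the query subprocess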
datachain/cli.py
CHANGED
@@ -14,6 +14,7 @@ import shtab

 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")


-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-
-
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


 def format_ls_entry(entry: str) -> str:
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records

     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@ def query(
     )

     try:
-
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)


 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
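The ls_local branch above now goes through DataChain.listings() instead of the removed ls_indexed_storages() helper. A hedged sketch of the same lookup outside the CLI, assuming (as the new code does) that the collected listing objects expose uri and version attributes:

from datachain.lib.dc import DataChain

# Enumerate indexed storages ("listings") the way the new `ls` code path does.
for listing in DataChain.listings().collect("listing"):
    print(f"{listing.uri}@v{listing.version}")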
datachain/client/fsspec.py
CHANGED
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

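With the extra branch above, protocol dispatch should now resolve hf:// URLs to the new client. A quick, hypothetical check (the repository path is only an example):

from datachain.client.fsspec import Client

# "hf" is now a recognized protocol; unknown schemes still raise NotImplementedError.
impl = Client.get_implementation("hf://datasets/example-user/example-repo")
print(impl.__name__)  # expected: "HfClient"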
datachain/client/hf.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
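A hedged end-to-end sketch of what the new client enables, assuming huggingface_hub is installed, HF_TOKEN is exported for private repos, and the repository name below is a placeholder:

from datachain.lib.dc import DataChain

# HfClient.create_fs() picks up HF_TOKEN from the environment if it is set.
chain = DataChain.from_storage("hf://datasets/example-user/example-repo")
for file in chain.collect("file"):
    print(file.path, file.size)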
datachain/data_storage/metastore.py
CHANGED
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
         This method should be called when index operation is finished.
         """

-    @abstractmethod
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-
     @abstractmethod
     def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
         """Updates last inserted datetime in bucket with current time."""

-    @abstractmethod
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-
     @abstractmethod
     def get_storage(self, uri: StorageURI) -> Storage:
         """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
         E.g. if s3 is used as storage this would be s3 bucket data.
         """

-    @abstractmethod
-    def list_storages(self) -> list[Storage]:
-        """Returns all storages."""
-
     @abstractmethod
     def mark_storage_pending(self, storage: Storage) -> Storage:
         """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
             self.add_dataset_dependency(
                 source_dataset_name,
                 source_dataset_version,
-                dependency.
+                dependency.dataset_name,
                 int(dependency.version),
             )
         else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
             self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
         )

-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-        s = self._storages
-        yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
-
     def get_storage(self, uri: StorageURI, conn=None) -> Storage:
         """
         Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):

         return self.storage_class._make(result)

-    def list_storages(self) -> list[Storage]:
-        result = self.db.execute(self._storages_select())
-        if not result:
-            return []
-
-        return [self.storage_class._make(r) for r in result]
-
     def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
         # Update status to pending and dates
         updates = {
@@ -1503,7 +1476,7 @@
         return self._jobs.update().where(*where)

     def _parse_job(self, rows) -> Job:
-        return
+        return self.job_class.parse(*rows)

     def _parse_jobs(self, rows) -> Iterator["Job"]:
         for _, g in groupby(rows, lambda r: r[0]):