datachain 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +50 -230
- datachain/error.py +0 -4
- datachain/job.py +4 -3
- datachain/lib/clip.py +1 -1
- datachain/lib/dc.py +92 -38
- datachain/lib/file.py +9 -8
- datachain/lib/image.py +1 -1
- datachain/lib/meta_formats.py +38 -59
- datachain/lib/model_store.py +6 -1
- datachain/lib/text.py +1 -1
- datachain/lib/webdataset.py +13 -0
- datachain/lib/webdataset_laion.py +13 -0
- datachain/query/dataset.py +9 -32
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/METADATA +7 -5
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/RECORD +19 -20
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/LICENSE +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/WHEEL +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -9,11 +9,9 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from contextlib import contextmanager, nullcontext
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
@@ -24,7 +22,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    NamedTuple,
     NoReturn,
     Optional,
     Union,
@@ -59,7 +56,6 @@ from datachain.error import (
     PendingIndexingError,
     QueryScriptCancelError,
     QueryScriptCompileError,
-    QueryScriptDatasetNotFound,
     QueryScriptRunError,
 )
 from datachain.listing import Listing
@@ -77,7 +73,6 @@ from datachain.utils import (
 )
 
 from .datasource import DataSource
-from .subclass import SubclassFinder
 
 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +87,6 @@ logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]
 
 TTL_INT = 4 * 60 * 60
 
@@ -118,44 +112,19 @@ def noop(_: str):
     pass
 
 
-
-
-    stream:
-
-    lines: list[str] = []
-    append = lines.append
+def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+    buffer = b""
+    while byt := stream.read(1):  # Read one byte at a time
+        buffer += byt
 
-
-        buffer = b""
-        while byt := stream.read(1):  # Read one byte at a time
-            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
-
-            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-                line = buffer.decode("utf-8")
-                print(line, end="")
-                callback(line)
-                append(line)
-                buffer = b""  # Clear buffer for next line
-
-        if buffer:  # Handle any remaining data in the buffer
+        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
             line = buffer.decode("utf-8")
-            print(line, end="")
             callback(line)
-
-
-    thread = Thread(target=loop, daemon=True)
-    thread.start()
-
-    try:
-        yield lines
-    finally:
-        thread.join()
+            buffer = b""  # Clear buffer for next line
 
-
-
-
-    version: Optional[int]
-    output: str
+    if buffer:  # Handle any remaining data in the buffer
+        line = buffer.decode("utf-8")
+        callback(line)
 
 
 class DatasetRowsFetcher(NodesThreadPool):
@@ -569,12 +538,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -658,34 +621,8 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast
 
-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
-        code_ast = ast.parse(script)
-        code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
-
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
         return Client.parse_url(uri, self.cache, **config)
@@ -1416,7 +1353,8 @@ class Catalog:
 
         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None)
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
            )
 
     def ls_dataset_rows(
@@ -1834,14 +1772,15 @@ class Catalog:
     def query(
         self,
         query_script: str,
-
-        python_executable:
+        env: Optional[Mapping[str, str]] = None,
+        python_executable: str = sys.executable,
         save: bool = False,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
-
+        _execute_last_expression: bool = False,
+    ) -> None:
        """
        Method to run custom user Python script to run a query and, as result,
        creates new dataset from the results of a query.
@@ -1864,170 +1803,51 @@ class Catalog:
            C.size > 1000
        )
        """
-
-
-
-
-
-
-
-
-
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                save,
-                job_id,
-            )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)
-
-        output = "".join(lines)
-
-        if proc.returncode:
-            if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
-                raise QueryScriptCancelError(
-                    "Query script was canceled by user",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
-            raise QueryScriptRunError(
-                f"Query script exited with error code {proc.returncode}",
-                return_code=proc.returncode,
-                output=output,
-            )
-
-        try:
-            result = json.loads(response_text)
-        except ValueError:
-            result = None
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save:
-            dataset, version = self.save_result(
-                query_script, result, output, version, job_id
-            )
-
-        return QueryResult(dataset=dataset, version=version, output=output)
-
-    def run_query(
-        self,
-        python_executable: str,
-        query_script: str,
-        envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
-        capture_output: bool,
-        feature_module: str,
-        output_hook: Callable[[str], None],
-        params: Optional[dict[str, str]],
-        save: bool,
-        job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
-        try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
-        except Exception as exc:
-            raise QueryScriptCompileError(
-                f"Query script failed to compile, reason: {exc}"
-            ) from exc
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
+        if _execute_last_expression:
+            try:
+                code_ast = ast.parse(query_script)
+                code_ast = self.attach_query_wrapper(code_ast)
+                query_script_compiled = ast.unparse(code_ast)
+            except Exception as exc:
+                raise QueryScriptCompileError(
+                    f"Query script failed to compile, reason: {exc}"
+                ) from exc
         else:
-
-
-
-
-
-                {feature_module: feature_code}
-            )
-            envs.update(
+            query_script_compiled = query_script
+            assert not save
+
+        env = dict(env or os.environ)
+        env.update(
            {
                "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                "PYTHONPATH": os.getcwd(),  # For local imports
                "DATACHAIN_QUERY_SAVE": "1" if save else "",
                "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                "DATACHAIN_JOB_ID": job_id or "",
            },
        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
+        popen_kwargs = {}
+        if capture_output:
+            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        cmd = [python_executable, "-c", query_script_compiled]
+        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # type: ignore[call-overload] # noqa: S603
+            if capture_output:
+                args = (proc.stdout, output_hook)
+                thread = Thread(target=_process_stream, args=args, daemon=True)
+                thread.start()
+                thread.join()  # wait for the reader thread
+
+        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+            raise QueryScriptCancelError(
+                "Query script was canceled by user",
+                return_code=proc.returncode,
+            )
+        if proc.returncode:
+            raise QueryScriptRunError(
+                f"Query script exited with error code {proc.returncode}",
+                return_code=proc.returncode,
            )
-        name, version = exec_result
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
 
     def cp(
         self,
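Note (editorial example, not part of the diff): with these changes `Catalog.query()` compiles the user script itself (optionally wrapping the last expression), builds the child-process environment from `env` (or `os.environ` when omitted), and streams output through `_process_stream` into `output_hook`; it now returns `None` instead of a `QueryResult`. A minimal sketch of a call against the new signature — the catalog setup and script text are hypothetical:

    from datachain.catalog import get_catalog

    catalog = get_catalog()
    script = "from datachain.lib.dc import DataChain\n"  # any user query script

    catalog.query(
        script,
        env={"MY_FLAG": "1"},   # used as the base environment; os.environ is used when omitted
        capture_output=True,
        output_hook=print,      # receives decoded subprocess output line by line
    )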
datachain/error.py
CHANGED
datachain/job.py
CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
            status,
            created_at,
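Note (editorial example, not part of the diff): the practical effect is that `Job.parse()` now tolerates UUID objects coming from the metastore and always stores a string id:

    import uuid

    raw_id = uuid.uuid4()     # what a metastore backend may hand back
    stored_id = str(raw_id)   # the coercion Job.parse() now applies to Job.id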
datachain/lib/clip.py
CHANGED
@@ -18,7 +18,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
         hasattr(model, method_name) and inspect.ismethod(getattr(model, method_name))
     ):
         method = getattr(model, method_name)
-        return lambda x: method(torch.
+        return lambda x: method(torch.as_tensor(x).clone().detach())
 
     # Check for model from clip or open_clip library
     method_name = f"encode_{type}"
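Note (editorial example, not part of the diff): the same conversion fix appears in `datachain/lib/text.py` and `datachain/lib/image.py` below. Inputs are now converted with `torch.as_tensor(...).clone().detach()`, presumably replacing a plain `torch.tensor(...)` call (the removed line is truncated in this view); `as_tensor` accepts lists, arrays, and existing tensors, and the `clone().detach()` form avoids PyTorch's copy-construction warning for tensor inputs. A small illustration of the pattern, with made-up data:

    import torch

    data = [[0.1, 0.2], [0.3, 0.4]]
    x = torch.as_tensor(data).clone().detach()  # works for lists and arrays
    y = torch.as_tensor(x).clone().detach()     # no UserWarning for an existing tensor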
datachain/lib/dc.py
CHANGED
@@ -56,7 +56,7 @@ from datachain.query.dataset import (
     PartitionByType,
     detach,
 )
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook
 
@@ -112,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
 class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(
-
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
         right_on_str = (
-            ", right_on='" +
+            ", right_on='" + _get_str(right_on) + "'"
             if right_on and isinstance(right_on, Sequence)
             else ""
         )
@@ -139,7 +159,7 @@ class Sys(DataModel):
 
 
 class DataChain(DatasetQuery):
-    """
+    """DataChain - a data structure for batch data processing and evaluation.
 
     It represents a sequence of data manipulation steps such as reading data from
     storages, running AI or LLM models or calling external services API to validate or
@@ -252,13 +272,24 @@ class DataChain(DatasetQuery):
         """Returns Column instance with a type if name is found in current schema,
         otherwise raises an exception.
         """
-
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
         for path, type_, _, _ in self.signals_schema.get_flat_tree():
             if path == name_path:
                 return Column(name, python_to_sql(type_))
 
         raise ValueError(f"Column with name {name} not found in the schema")
 
+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
@@ -384,7 +415,7 @@ class DataChain(DatasetQuery):
             .save(list_dataset_name, listing=True)
         )
 
-        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc = cls.from_dataset(list_dataset_name, session=session, settings=settings)
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
         return ls(dc, list_path, recursive=recursive, object_name=object_name)
@@ -395,6 +426,7 @@ class DataChain(DatasetQuery):
         name: str,
         version: Optional[int] = None,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
     ) -> "DataChain":
         """Get data from a saved Dataset. It returns the chain itself.
 
@@ -407,7 +439,7 @@ class DataChain(DatasetQuery):
             chain = DataChain.from_dataset("my_cats")
             ```
         """
-        return DataChain(name=name, version=version, session=session)
+        return DataChain(name=name, version=version, session=session, settings=settings)
 
     @classmethod
     def from_json(
@@ -1140,8 +1172,17 @@ class DataChain(DatasetQuery):
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[
-
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
         inner=False,
         rname="right_",
     ) -> "Self":
@@ -1166,7 +1207,7 @@ class DataChain(DatasetQuery):
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
 
-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
             on = [on]
         elif not isinstance(on, Sequence):
             raise DatasetMergeError(
@@ -1175,19 +1216,15 @@ class DataChain(DatasetQuery):
                 f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
             )
 
-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                 right_on = [right_on]
             elif not isinstance(right_on, Sequence):
                 raise DatasetMergeError(
                     on,
                     right_on,
                     "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                 )
 
             if len(right_on) != len(on):
@@ -1195,34 +1232,39 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )
 
-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
         if self == right_ds:
             right_ds = right_ds.clone(new_table=True)
 
+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
         ops = [
-            self
-
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
         ]
 
+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
         ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")
 
         ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
             right_signals_schema, rname
         )
@@ -1581,6 +1623,8 @@ class DataChain(DatasetQuery):
         model_name: str = "",
         source: bool = True,
         nrows=None,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1597,6 +1641,8 @@ class DataChain(DatasetQuery):
             model_name : Generated model name.
             source : Whether to include info about the source file.
             nrows : Optional row limit.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.
 
         Example:
             Reading a csv file:
@@ -1613,7 +1659,9 @@ class DataChain(DatasetQuery):
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
 
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
 
         column_names = None
         if not header:
@@ -1660,6 +1708,8 @@ class DataChain(DatasetQuery):
         object_name: str = "",
         model_name: str = "",
         source: bool = True,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from parquet files.
@@ -1672,6 +1722,8 @@ class DataChain(DatasetQuery):
         object_name : Created object column name.
         model_name : Generated model name.
         source : Whether to include info about the source file.
+        session : Session to use for the chain.
+        settings : Settings to use for the chain.
 
         Example:
             Reading a single file:
@@ -1684,7 +1736,9 @@ class DataChain(DatasetQuery):
             dc = DataChain.from_parquet("s3://mybucket/dir")
             ```
         """
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(
+            path, session=session, settings=settings, **kwargs
+        )
         return chain.parse_tabular(
             output=output,
            object_name=object_name,
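Note (editorial example, not part of the diff): taken together, the dc.py changes let merge keys be given as column expressions and let `from_dataset`, `from_csv`, and `from_parquet` carry an explicit session/settings. A brief sketch of the new surface — dataset, column, and settings names are hypothetical:

    from datachain.lib.dc import DataChain

    orders = DataChain.from_dataset("orders")
    customers = DataChain.from_dataset("customers", settings={"cache": True})

    # c() returns a Column bound to the chain's table, so it can be used
    # directly as a merge key alongside plain string names:
    merged = orders.merge(
        customers,
        on=orders.c("customer.id"),
        right_on=customers.c("id"),
    )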
datachain/lib/file.py
CHANGED
@@ -195,14 +195,15 @@ class File(DataModel):
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
-
-
-
-
-
-
-
-
+        else:
+            uid = self.get_uid()
+            client = self._catalog.get_client(self.source)
+            if self._caching_enabled:
+                client.download(uid, callback=self._download_cb)
+            with client.open_object(
+                uid, use_cache=self._caching_enabled, cb=self._download_cb
+            ) as f:
+                yield io.TextIOWrapper(f) if mode == "r" else f
 
     def read(self, length: int = -1):
         """Returns file contents."""
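Note (editorial example, not part of the diff): the restored `else` branch means `File.open()` streams the object through the storage client, pre-downloading it only when caching is enabled, and wraps the stream in `io.TextIOWrapper` when `mode == "r"`. A minimal usage sketch under that assumption, with a hypothetical helper:

    def line_count(file) -> int:
        # text mode ("r") yields an io.TextIOWrapper; binary mode yields the raw stream
        with file.open(mode="r") as f:
            return sum(1 for _ in f)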
datachain/lib/image.py
CHANGED
@@ -34,7 +34,7 @@ def convert_image(
         from transformers.image_processing_utils import BaseImageProcessor
 
         if isinstance(transform, BaseImageProcessor):
-            img = torch.
+            img = torch.as_tensor(img.pixel_values[0]).clone().detach()  # type: ignore[assignment,attr-defined]
     except ImportError:
         pass
     if device:
datachain/lib/meta_formats.py
CHANGED
@@ -1,15 +1,12 @@
-# pip install datamodel-code-generator
-# pip install jmespath
-#
 import csv
-import io
 import json
-import
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
-from
+from pathlib import Path
+from typing import Callable
 
+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -47,9 +44,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-
-
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
@@ -70,33 +66,26 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
-
-
-
-
-    "
-
-
-
-
-
-
-
-
-
-        capture_output=True,
-        check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-
-
-
-
-
-    print(f"{model_output}")
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+        epilogue = f"""
+DataModel.register({model_name})
+spec = {model_name}
+"""
+        return output.read_text() + epilogue
 
 
 #
@@ -113,35 +102,25 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-
-
-
-
-
-
-
-
-            meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
            )
-
-
-        sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
-
-        exec(model_output,
-        spec =
+        gl = globals()
+        exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+        spec = gl["spec"]
 
     if not (spec) and not (schema_from):
         raise ValueError(
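Note (editorial example, not part of the diff): the new `read_schema()` path is essentially a direct, in-process call to datamodel-code-generator instead of shelling out and capturing stdout. A standalone sketch of the same call pattern, with a made-up sample record:

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    data = '{"name": "cat", "size": 5}'
    with tempfile.TemporaryDirectory() as tmpdir:
        out = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            data,
            input_file_type=datamodel_code_generator.InputFileType.Json,
            output=out,
            class_name="SampleModel",
        )
        print(out.read_text())  # generated Pydantic model source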
datachain/lib/model_store.py
CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import ClassVar, Optional
 
@@ -69,7 +70,11 @@ class ModelStore:
 
     @staticmethod
     def is_pydantic(val):
-        return
+        return (
+            not hasattr(val, "__origin__")
+            and inspect.isclass(val)
+            and issubclass(val, BaseModel)
+        )
 
     @staticmethod
     def to_pydantic(val) -> Optional[type[BaseModel]]:
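Note (editorial example, not part of the diff): with this check, `ModelStore.is_pydantic()` only accepts real classes and rejects generic aliases. A short sketch with a made-up model:

    from pydantic import BaseModel

    from datachain.lib.model_store import ModelStore

    class Pet(BaseModel):
        name: str

    ModelStore.is_pydantic(Pet)        # True: a plain BaseModel subclass
    ModelStore.is_pydantic(list[Pet])  # False: generic alias has __origin__
    ModelStore.is_pydantic("pet")      # False: not a class at all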
datachain/lib/text.py
CHANGED
@@ -33,7 +33,7 @@ def convert_text(
     res = tokenizer(text)
 
     tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-    tokens = torch.
+    tokens = torch.as_tensor(tokens).clone().detach()
     if device:
         tokens = tokens.to(device)
 
datachain/lib/webdataset.py
CHANGED
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import tarfile
+import warnings
 from collections.abc import Iterator, Sequence
 from pathlib import Path
 from typing import (
@@ -19,6 +20,18 @@ from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError
 
+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSAllFile" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+
 
 class WDSError(DataChainError):
     def __init__(self, tar_stream, message: str):
datachain/lib/webdataset_laion.py
CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Iterator
 from typing import Optional
 
@@ -7,6 +8,18 @@ from pydantic import BaseModel, Field
 from datachain.lib.file import File
 from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass
 
+# The `json` method of the Pydantic `BaseModel` class has been deprecated
+# and will be removed in Pydantic v3. For more details, see:
+# https://github.com/pydantic/pydantic/issues/10033
+# Until then, we can ignore the warning.
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message=(
+        'Field name "json" in "WDSLaion" shadows an attribute in parent "WDSBasic"'
+    ),
+)
+
 
 class Laion(WDSReadableSubclass):
     uid: str = Field(default="")
datachain/query/dataset.py
CHANGED
@@ -1,6 +1,5 @@
 import contextlib
 import inspect
-import json
 import logging
 import os
 import random
@@ -37,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -1173,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None
 
-    def c(self,
-        col
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col
 
@@ -1710,27 +1709,14 @@ class DatasetQuery:
         return self.__class__(name=name, version=version, catalog=self.catalog)
 
 
-def
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-
+        return dataset_query
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
@@ -1742,13 +1728,4 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(dataset, f)
     return dataset_query
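Note (editorial example, not part of the diff): the typed `DatasetQuery.c()` mirrors the new `DataChain.c()` above — a plain string becomes an untyped `sqlalchemy.column()`, a `C` instance keeps its type, and either way the result is bound to the query's table. A short sketch with a hypothetical dataset name:

    from datachain.query import DatasetQuery

    dq = DatasetQuery(name="my_dataset")
    size = dq.c("size")            # ColumnClause attached to dq.table
    big = dq.filter(size > 1000)   # usable directly in SQL expressions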
{datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.10
+Version: 0.3.12
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -80,7 +80,6 @@ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
 Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
 Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
 Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
-Requires-Dist: pytest-asyncio >=0.23.2 ; extra == 'tests'
 Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
 Requires-Dist: virtualenv ; extra == 'tests'
 Requires-Dist: dulwich ; extra == 'tests'
@@ -96,8 +95,14 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'
 
+================
+|logo| DataChain
+================
+
 |PyPI| |Python Version| |Codecov| |Tests|
 
+.. |logo| image:: docs/assets/datachain.svg
+   :height: 24
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
    :target: https://pypi.org/project/datachain/
    :alt: PyPI
@@ -111,9 +116,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
 
-AI 🔗 DataChain
-----------------
-
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
 It is made to organize your unstructured data into datasets and wrangle it at scale on
 your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
{datachain-0.3.10.dist-info → datachain-0.3.12.dist-info}/RECORD
CHANGED
@@ -6,8 +6,8 @@ datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
-datachain/error.py,sha256=
-datachain/job.py,sha256=
+datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
 datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
@@ -17,10 +17,9 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=xVFNUZ339u2l58ZyPaiJ6GsRRpwqq0LYUbdOHC-Otog,69654
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -40,27 +39,27 @@ datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9m
 datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
-datachain/lib/clip.py,sha256=
+datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dc.py,sha256=gYRkrriG5RJxgLpOUccDU8DFRSoeWZjgmJwHfUo_z7w,68731
+datachain/lib/file.py,sha256=tNb3rJyRYGxpOc6XxcZjIQ9yVHKc7WLAOKoTYqp2TB0,11475
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
-datachain/lib/image.py,sha256=
+datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=
-datachain/lib/model_store.py,sha256=
+datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
+datachain/lib/model_store.py,sha256=xcrQ69-jcQs716U4UFOSoSKM7EvFIWqxlPhIcE4X7oI,2497
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
-datachain/lib/text.py,sha256=
+datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=
-datachain/lib/webdataset_laion.py,sha256=
+datachain/lib/webdataset.py,sha256=ZzGLtOUA-QjP4kttGgNqhrioDuDnomWFlsow4fLdezQ,8717
+datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
 datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
@@ -70,7 +69,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.12.dist-info/METADATA,sha256=I_Yz0lbiCk4KWv026U7zpDGrU72G575Hd_OnE_seb1k,17073
+datachain-0.3.12.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.12.dist-info/RECORD,,
datachain/catalog/subclass.py
DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes