datachain 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic.
- datachain/catalog/catalog.py +54 -140
- datachain/job.py +4 -3
- datachain/lib/dc.py +75 -34
- datachain/lib/meta_formats.py +36 -53
- datachain/query/dataset.py +9 -32
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/METADATA +5 -2
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/RECORD +11 -12
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/WHEEL +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -9,7 +9,6 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -77,7 +76,6 @@ from datachain.utils import (
 )
 
 from .datasource import DataSource
-from .subclass import SubclassFinder
 
 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +90,6 @@ logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]
 
 TTL_INT = 4 * 60 * 60
 
@@ -569,12 +566,6 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -658,33 +649,12 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast
 
-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
+    def compile_query_script(self, script: str) -> str:
         code_ast = ast.parse(script)
         code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
+        return ast.unparse(code_ast)
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
@@ -1416,7 +1386,8 @@ class Catalog:
 
         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None) for v in d.versions
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )
 
     def ls_dataset_rows(
@@ -1864,29 +1835,25 @@ class Catalog:
             C.size > 1000
         )
         """
-
-        feature_file = tempfile.NamedTemporaryFile(
-            dir=os.getcwd(), suffix=".py", delete=False
-        )
-        _, feature_module = os.path.split(feature_file.name)
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                save,
-                job_id,
+        if not job_id:
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            job_id = self.metastore.create_job(
+                name="",
+                query=query_script,
+                params=params,
+                python_version=python_version,
             )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)
 
+        lines, proc = self.run_query(
+            python_executable or sys.executable,
+            query_script,
+            envs,
+            capture_output,
+            output_hook,
+            params,
+            save,
+            job_id,
+        )
         output = "".join(lines)
 
         if proc.returncode:
@@ -1896,82 +1863,69 @@ class Catalog:
                 return_code=proc.returncode,
                 output=output,
             )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
             raise QueryScriptRunError(
                 f"Query script exited with error code {proc.returncode}",
                 return_code=proc.returncode,
                 output=output,
             )
 
+        def _get_dataset_versions_by_job_id():
+            for dr, dv, job in self.list_datasets_versions():
+                if job and str(job.id) == job_id:
+                    yield dr, dv
+
         try:
-            result = json.loads(response_text)
-        except ValueError:
-            result = None
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save:
-            dataset, version = self.save_result(
-                query_script, result, output, version, job_id
+            dr, dv = max(
+                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
             )
+        except ValueError as e:
+            if not save:
+                return QueryResult(dataset=None, version=None, output=output)
+
+            raise QueryScriptDatasetNotFound(
+                "No dataset found after running Query script",
+                output=output,
+            ) from e
 
-        return QueryResult(dataset=dataset, version=version, output=output)
+        dr = self.update_dataset(
+            dr,
+            script_output=output,
+            query_script=query_script,
+        )
+        self.update_dataset_version_with_warehouse_info(
+            dr,
+            dv.version,
+            script_output=output,
+            query_script=query_script,
+            job_id=job_id,
+            is_job_result=True,
+        )
+        return QueryResult(dataset=dr, version=dv.version, output=output)
 
     def run_query(
         self,
         python_executable: str,
        query_script: str,
         envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
         capture_output: bool,
-        feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
         save: bool,
         job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
+    ) -> tuple[list[str], subprocess.Popen]:
         try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
+            query_script_compiled = self.compile_query_script(query_script)
         except Exception as exc:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
-        else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
         envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
         envs.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
             },
         )
@@ -1982,52 +1936,12 @@ class Catalog:
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
             text=False,
-            **kwargs,
         ) as proc:
-            os.close(w)
-
             out = proc.stdout
             _lines: list[str] = []
             ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-            with ctx as lines, os.fdopen(r, mode="r") as f:
-                response_text = ""
-                while proc.poll() is None:
-                    response_text += f.readline()
-                    time.sleep(0.1)
-                response_text += f.readline()
-                return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            )
-        name, version = exec_result
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
+            with ctx as lines:
+                return lines, proc
 
     def cp(
         self,
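
Note: with the SubclassFinder/feature-module split gone, compile_query_script reduces to parse, wrap the last statement, unparse. A minimal sketch of that wrap-and-unparse flow; the query_wrapper name mirrors datachain.query.dataset.query_wrapper, and the sample script is invented:

    import ast

    script = "import datachain\n1 + 1"
    code_ast = ast.parse(script)

    # wrap the last expression in a call, roughly what attach_query_wrapper builds
    last = code_ast.body[-1]
    if isinstance(last, ast.Expr):
        code_ast.body[-1] = ast.Expr(
            ast.Call(
                func=ast.Name(id="query_wrapper", ctx=ast.Load()),
                args=[last.value],
                keywords=[],
            )
        )
    print(ast.unparse(ast.fix_missing_locations(code_ast)))
    # -> import datachain
    #    query_wrapper(1 + 1)
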
datachain/job.py
CHANGED
@@ -1,7 +1,8 @@
 import json
+import uuid
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional, TypeVar
+from typing import Any, Optional, TypeVar, Union
 
 J = TypeVar("J", bound="Job")
 
@@ -25,7 +26,7 @@ class Job:
     @classmethod
     def parse(
         cls: type[J],
-        id: str,
+        id: Union[str, uuid.UUID],
         name: str,
         status: int,
         created_at: datetime,
@@ -40,7 +41,7 @@ class Job:
         metrics: str,
     ) -> "Job":
         return cls(
-            id,
+            str(id),
             name,
             status,
             created_at,
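
Note: the only behavioral change here is id normalization. A cut-down sketch of what Job.parse now tolerates (the real class carries many more fields; MiniJob is a stand-in for illustration):

    import uuid
    from dataclasses import dataclass

    @dataclass
    class MiniJob:
        id: str

        @classmethod
        def parse(cls, id):  # id may be a str or a uuid.UUID
            return cls(str(id))

    job = MiniJob.parse(uuid.uuid4())
    assert isinstance(job.id, str)  # matches the str(job.id) == job_id checks in catalog.py
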
datachain/lib/dc.py
CHANGED
@@ -56,7 +56,7 @@ from datachain.query.dataset import (
     PartitionByType,
     detach,
 )
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook
 
@@ -112,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
 class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(self, on: Sequence[str], right_on: Optional[Sequence[str]], msg: str):
-        on_str = ", ".join(on) if isinstance(on, Sequence) else ""
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
         right_on_str = (
-            ", right_on='" + ", ".join(right_on) + "'"
+            ", right_on='" + _get_str(right_on) + "'"
             if right_on and isinstance(right_on, Sequence)
             else ""
         )
@@ -139,7 +159,7 @@ class Sys(DataModel):
 
 
 class DataChain(DatasetQuery):
-    """
+    """DataChain - a data structure for batch data processing and evaluation.
 
     It represents a sequence of data manipulation steps such as reading data from
     storages, running AI or LLM models or calling external services API to validate or
@@ -252,13 +272,24 @@ class DataChain(DatasetQuery):
         """Returns Column instance with a type if name is found in current schema,
         otherwise raises an exception.
         """
-        name_path = name.split(".")
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
         for path, type_, _, _ in self.signals_schema.get_flat_tree():
             if path == name_path:
                 return Column(name, python_to_sql(type_))
 
         raise ValueError(f"Column with name {name} not found in the schema")
 
+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
     def print_schema(self) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree()
@@ -1140,8 +1171,17 @@ class DataChain(DatasetQuery):
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[str, Sequence[str]],
-        right_on: Union[str, Sequence[str], None] = None,
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
         inner=False,
         rname="right_",
     ) -> "Self":
@@ -1166,7 +1206,7 @@ class DataChain(DatasetQuery):
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
 
-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
             on = [on]
         elif not isinstance(on, Sequence):
             raise DatasetMergeError(
@@ -1175,19 +1215,15 @@ class DataChain(DatasetQuery):
             f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
             )
 
-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                 right_on = [right_on]
             elif not isinstance(right_on, Sequence):
                 raise DatasetMergeError(
                     on,
                     right_on,
                     "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                 )
@@ -1195,34 +1231,39 @@ class DataChain(DatasetQuery):
                     on, right_on, "'on' and 'right_on' must have the same length'"
                 )
 
-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
         if self == right_ds:
             right_ds = right_ds.clone(new_table=True)
 
+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
         ops = [
-            self.c(left) == right_ds.c(right)
-            for left, right in zip(on_columns, right_on_columns)
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
         ]
 
+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
         ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")
 
         ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
         ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
             right_signals_schema, rname
         )
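
Note on the merge changes: 'on'/'right_on' now also accept sqlalchemy column expressions, the new DataChain.c() returns a Column bound to the chain's table, and unresolvable columns are collected per side into a single DatasetMergeError. A hedged sketch under assumed dataset and signal names:

    from datachain.lib.dc import DataChain

    orders = DataChain.from_dataset("orders")        # hypothetical dataset
    customers = DataChain.from_dataset("customers")  # hypothetical dataset

    # plain signal names still work as before
    merged = orders.merge(
        customers,
        on="customer_id",
        right_on="id",
        inner=True,
    )

    # the new c() helper builds a chain-bound column expression
    expr = orders.c("customer_id")  # Column attached to orders' table

If a name passed in 'on' does not resolve on the left chain, the error message now reads roughly "Could not resolve customer_id in left" instead of failing inside schema resolution.
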
datachain/lib/meta_formats.py
CHANGED
@@ -2,14 +2,14 @@
 # pip install jmespath
 #
 import csv
-import io
 import json
-import subprocess
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
+from pathlib import Path
 from typing import Any, Callable
 
+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -47,9 +47,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-        uid_str = str(generate_uuid()).replace(
-            "-", ""
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
@@ -70,33 +69,27 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
     data_string = json.dumps(json_object)
-
-
-
-
-        "
-
-
-
-
-
-
-
-
-
-        capture_output=True,
-        check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-
-
-
-
-
-
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+    epilogue = f"""
+{model_name}.model_rebuild()
+DataModel.register({model_name})
+spec = {model_name}
+"""
+    return output.read_text() + epilogue
 
 
 #
@@ -113,34 +106,24 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-
-
-
-
-
-
-
-
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
             )
-
-
-        sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         local_vars: dict[str, Any] = {}
-        exec(model_output, globals(), local_vars)  # noqa: S102
+        exec(model_output, globals(), local_vars)  # type: ignore[arg-type] # noqa: S102
         spec = local_vars["spec"]
 
     if not (spec) and not (schema_from):
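
Note: schema inference now calls datamodel-code-generator in process instead of shelling out to the CLI and scraping stdout. A minimal standalone sketch of the generate() call the new read_schema relies on (the sample JSON and class name are made up):

    import tempfile
    from pathlib import Path

    import datamodel_code_generator

    data = '{"name": "datachain", "stars": 1000}'
    with tempfile.TemporaryDirectory() as tmpdir:
        output = Path(tmpdir) / "model.py"
        datamodel_code_generator.generate(
            data,
            input_file_type=datamodel_code_generator.InputFileType.Json,
            output=output,
            class_name="Repo",
        )
        print(output.read_text())  # pydantic model source for Repo
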
datachain/query/dataset.py
CHANGED
@@ -1,6 +1,5 @@
 import contextlib
 import inspect
-import json
 import logging
 import os
 import random
@@ -37,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -1173,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None
 
-    def c(self,
-        col
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col
@@ -1710,27 +1709,14 @@ class DatasetQuery:
         return self.__class__(name=name, version=version, catalog=self.catalog)
 
 
-def _get_output_fd_for_write() -> Union[str, int]:
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-        sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE)
+        return dataset_query
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
@@ -1742,13 +1728,4 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(dataset, f)
     return dataset_query
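
Note: with the JSON-over-pipe handshake gone, the parent process now discovers the result dataset through the job id, and query_wrapper simply hands back non-DatasetQuery values. The reworked c() also builds a fresh ColumnClause instead of rebinding the column it was given; a standalone sketch of that construction (table and column names are illustrative):

    import sqlalchemy

    base = sqlalchemy.column("size", sqlalchemy.Integer())
    rebuilt = sqlalchemy.column(base.name, base.type)  # what c() now does for Column inputs
    rebuilt.table = sqlalchemy.table("files")
    assert base.table is None  # the original clause stays unbound
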
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.10
+Version: 0.3.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -96,6 +96,10 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'
 
+.. image:: docs/assets/datachain_logotype.svg
+  :height: 48
+  :alt: DataChain logo
+
 |PyPI| |Python Version| |Codecov| |Tests|
 
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
@@ -111,7 +115,6 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
 
-AI 🔗 DataChain
 ----------------
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/RECORD
CHANGED

@@ -7,7 +7,7 @@ datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
-datachain/job.py,sha256=
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
 datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
@@ -17,10 +17,9 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=NgS7_SlmpJdUSp1v8KdCuLTjFklmYvT_jOLdzTyyK5I,72313
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -43,13 +42,13 @@ datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=s4E-bD6_T6JFJ7TEa5Y9RS705lIfcV9OUJwDD6RNCX0,68156
 datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
 datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
 datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
 datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=67uF9trQ2II6xFvN0u6eo5NNRf5xvCkpMHj7ThiG41Y,6777
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
@@ -70,7 +69,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.10.dist-info/LICENSE,sha256=
-datachain-0.3.10.dist-info/METADATA,sha256=
-datachain-0.3.10.dist-info/WHEEL,sha256=
-datachain-0.3.10.dist-info/entry_points.txt,sha256=
-datachain-0.3.10.dist-info/top_level.txt,sha256=
-datachain-0.3.10.dist-info/RECORD,,
+datachain-0.3.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.11.dist-info/METADATA,sha256=iSdfjWpVT1Iqzlg82eN5QzJ-icaYxkG7TUKEpEOi5sk,17124
+datachain-0.3.11.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.11.dist-info/RECORD,,
datachain/catalog/subclass.py
DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes
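
Note: this visitor split user query scripts into a feature-class module and a main module so DataModel subclasses could be shipped to workers separately; 0.3.11 keeps scripts whole, so the class is dropped. For context, a small usage sketch of the removed helper, assuming the SubclassFinder class shown above is in scope (the script contents are invented):

    import ast
    import textwrap

    script = textwrap.dedent("""
        from datachain.lib.data_model import DataModel

        class Pet(DataModel):
            name: str

        result = 42
    """)
    finder = SubclassFinder(["DataModel"])  # the class deleted above
    finder.visit(ast.parse(script))
    print([node.name for node in finder.feature_class])  # -> ['Pet']
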
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/LICENSE: File without changes
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/WHEEL: File without changes
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/entry_points.txt: File without changes
{datachain-0.3.10.dist-info → datachain-0.3.11.dist-info}/top_level.txt: File without changes