datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/lib/listing_info.py
ADDED
@@ -0,0 +1,32 @@
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+from datachain.client import Client
+from datachain.lib.dataset_info import DatasetInfo
+from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL
+
+
+class ListingInfo(DatasetInfo):
+    @property
+    def uri(self) -> str:
+        return self.name.removeprefix(LISTING_PREFIX)
+
+    @property
+    def storage_uri(self) -> str:
+        client, _ = Client.parse_url(self.uri, None)  # type: ignore[arg-type]
+        return client.uri
+
+    @property
+    def expires(self) -> Optional[datetime]:
+        if not self.finished_at:
+            return None
+        return self.finished_at + timedelta(seconds=LISTING_TTL)
+
+    @property
+    def is_expired(self) -> bool:
+        return datetime.now(timezone.utc) > self.expires if self.expires else False
+
+    @property
+    def last_inserted_at(self):
+        # TODO we need to add updated_at to dataset version or explicit last_inserted_at
+        raise NotImplementedError
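A quick sanity check of the TTL logic introduced above, as a stand-alone sketch. The real LISTING_TTL constant lives in datachain.lib.listing; the 4-hour value below is an assumption for illustration only:

from datetime import datetime, timedelta, timezone
from typing import Optional

LISTING_TTL = 4 * 60 * 60  # seconds; assumed value for this sketch

def expires_at(finished_at: Optional[datetime]) -> Optional[datetime]:
    # a listing that never finished has no expiry
    if not finished_at:
        return None
    return finished_at + timedelta(seconds=LISTING_TTL)

def is_expired(finished_at: Optional[datetime]) -> bool:
    exp = expires_at(finished_at)
    return datetime.now(timezone.utc) > exp if exp else False

# a listing that finished five hours ago is stale and would be re-listed
print(is_expired(datetime.now(timezone.utc) - timedelta(hours=5)))  # True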
datachain/lib/meta_formats.py
CHANGED
@@ -2,14 +2,14 @@
 # pip install jmespath
 #
 import csv
-import io
 import json
-import subprocess
-import sys
+import tempfile
 import uuid
 from collections.abc import Iterator
+from pathlib import Path
 from typing import Any, Callable
 
+import datamodel_code_generator
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
@@ -47,17 +47,16 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
-        uid_str = str(generate_uuid()).replace(
-            "-", ""
-        )  # comply with Python class names
+        # comply with Python class names
+        uid_str = str(generate_uuid()).replace("-", "")
         model_name = f"Model{data_type}{uid_str}"
     try:
        with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
-                data_string += fd.readline().
-                data_string += fd.readline().
+                data_string += fd.readline().replace("\r", "")
+                data_string += fd.readline().replace("\r", "")
             elif data_type == "jsonl":
-                data_string = fd.readline().
+                data_string = fd.readline().replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
@@ -70,33 +69,27 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     if data_type == "jsonl":
         data_type = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
-
-
-
-
-        "
-
-
-
-
-
-
-
-
-            capture_output=True,
-            check=True,
+
+    input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
+    input_file_type = input_file_types[data_type]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output = Path(tmpdir) / "model.py"
+        datamodel_code_generator.generate(
+            data_string,
+            input_file_type=input_file_type,
+            output=output,
+            target_python_version=datamodel_code_generator.PythonVersion.PY_39,
+            base_class="datachain.lib.meta_formats.UserModel",
+            class_name=model_name,
+            additional_imports=["datachain.lib.data_model.DataModel"],
+            use_standard_collections=True,
         )
-
-
-
-
-
-
-    print("from datachain.lib.data_model import DataModel")
-    print("\n" + f"DataModel.register({model_name})" + "\n")
-    print("\n" + f"spec={model_name}" + "\n")
-    return model_output
+        epilogue = f"""
+{model_name}.model_rebuild()
+DataModel.register({model_name})
+spec = {model_name}
+"""
+        return output.read_text() + epilogue
 
 
 #
@@ -113,34 +106,24 @@ def read_meta(  # noqa: C901
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
-    # ugly hack: datachain is run redirecting printed outputs to a variable
     if schema_from:
-
-
-
-
-
-
-
-
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        chain = (
+            DataChain.from_storage(schema_from, type="text")
+            .limit(1)
+            .map(  # dummy column created (#1615)
+                meta_schema=lambda file: read_schema(
+                    file, data_type=meta_type, expr=jmespath, model_name=model_name
+                ),
+                output=str,
+            )
         )
-
-
-        sys.stdout = current_stdout
-        model_output = captured_output.getvalue()
-        captured_output.close()
-
+        )
+        (model_output,) = chain.collect("meta_schema")
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         local_vars: dict[str, Any] = {}
-        exec(model_output, globals(), local_vars)  # noqa: S102
+        exec(model_output, globals(), local_vars)  # type: ignore[arg-type] # noqa: S102
         spec = local_vars["spec"]
 
     if not (spec) and not (schema_from):
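Schema generation now calls the datamodel-code-generator library API and writes to a temporary file, instead of shelling out to the datamodel-codegen CLI and scraping redirected stdout. A minimal sketch of the new code path, mirroring the hunk above with a made-up JSON sample and only the kwargs shown there:

import tempfile
from pathlib import Path

import datamodel_code_generator

sample = '{"name": "cat.jpg", "size": 1024}'

with tempfile.TemporaryDirectory() as tmpdir:
    output = Path(tmpdir) / "model.py"
    datamodel_code_generator.generate(
        sample,
        input_file_type=datamodel_code_generator.InputFileType.Json,
        output=output,
        class_name="SampleModel",
    )
    # source code of the generated pydantic model; must be read before
    # the temporary directory is cleaned up
    print(output.read_text())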
datachain/lib/signal_schema.py
CHANGED
@@ -16,7 +16,6 @@ from typing import (
     get_origin,
 )
 
-import sqlalchemy as sa
 from pydantic import BaseModel, create_model
 from typing_extensions import Literal as LiteralEx
 
@@ -341,7 +340,7 @@ class SignalSchema:
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else sa.Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
             for path, _type, has_subtree, _ in self.get_flat_tree()
             if not has_subtree
         ]
@@ -415,6 +414,10 @@ class SignalSchema:
                 # renaming existing signal
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
+            elif name in self.values:
+                # changing the type of existing signal, e.g File -> ImageFile
+                del new_values[name]
+                new_values[name] = args_map[name]
             else:
                 # adding new signal
                 new_values.update(sql_to_python({name: value}))
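A dict-level sketch of the new elif branch above: when a mutation targets a name that already exists in the schema, the old entry is replaced by the new type rather than treated as an added signal. The type names are stand-in strings, not real schema values:

values = {"file": "File"}          # existing schema: signal name -> type
args_map = {"file": "ImageFile"}   # e.g. mutate(file=<ImageFile-typed expr>)

new_values = values.copy()
for name, value in args_map.items():
    if name in new_values:
        # changing the type of an existing signal, e.g. File -> ImageFile
        del new_values[name]
        new_values[name] = value
    else:
        new_values[name] = value   # adding a new signal

print(new_values)  # {'file': 'ImageFile'}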
datachain/node.py
CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Optional
 import attrs
 
 from datachain.cache import UniqueId
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -189,6 +190,18 @@ class Entry:
             return ""
         return split[0]
 
+    def to_file(self, source: str) -> File:
+        return File(
+            source=source,
+            path=self.path,
+            size=self.size,
+            version=self.version,
+            etag=self.etag,
+            is_latest=self.is_latest,
+            last_modified=self.last_modified,
+            location=self.location,
+        )
+
 
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
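The new Entry.to_file helper turns a low-level index entry into the File signal used by chains. To show which fields it carries over, here is a File constructed directly with made-up values (the keyword fields match the hunk above; the remaining fields fall back to File's defaults):

from datachain.lib.file import File

file = File(
    source="s3://my-bucket",  # supplied by the caller of to_file()
    path="images/cat.jpg",    # the remaining fields come from the Entry
    size=1024,
    etag="abc123",
    is_latest=True,
)
print(file.path, file.size)  # images/cat.jpg 1024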
datachain/query/dataset.py
CHANGED
@@ -1,7 +1,5 @@
 import contextlib
-import datetime
 import inspect
-import json
 import logging
 import os
 import random
@@ -38,11 +36,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm import tqdm
 
 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
-from datachain.catalog import (
-    QUERY_SCRIPT_CANCELED_EXIT_CODE,
-    QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
-    get_catalog,
-)
+from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -60,7 +54,6 @@ from datachain.utils import (
     get_datachain_executable,
 )
 
-from .metrics import metrics
 from .schema import C, UDFParamSpec, normalize_param
 from .session import Session
 from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
@@ -219,7 +212,7 @@ class IndexingStep(StartingStep):
             recursive=self.recursive,
         )
 
-        storage = self.catalog.get_storage(uri)
+        storage = self.catalog.metastore.get_storage(uri)
 
         return step_result(q, dataset_rows.c, dependencies=[storage.uri])
 
@@ -1175,8 +1168,12 @@ class DatasetQuery:
         """
         return self.name is not None and self.version is not None
 
-    def c(self,
-        col
+    def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+        col: sqlalchemy.ColumnClause = (
+            sqlalchemy.column(column)
+            if isinstance(column, str)
+            else sqlalchemy.column(column.name, column.type)
+        )
         col.table = self.table
         return col
 
@@ -1634,7 +1631,7 @@ class DatasetQuery:
             )
         else:
             # storage dependency - its name is a valid StorageURI
-            storage = self.catalog.get_storage(dependency)
+            storage = self.catalog.metastore.get_storage(dependency)
             self.catalog.metastore.add_storage_dependency(
                 StorageURI(dataset.name),
                 version,
@@ -1712,113 +1709,23 @@
         return self.__class__(name=name, version=version, catalog=self.catalog)
 
 
-def _get_output_fd_for_write() -> Union[str, int]:
-    handle = os.getenv("DATACHAIN_OUTPUT_FD")
-    if not handle:
-        return os.devnull
-
-    if os.name != "nt":
-        return int(handle)
-
-    import msvcrt
-
-    return msvcrt.open_osfhandle(int(handle), os.O_WRONLY)  # type: ignore[attr-defined]
-
-
-@attrs.define
-class ExecutionResult:
-    preview: list[dict] = attrs.field(factory=list)
-    dataset: Optional[tuple[str, int]] = None
-    metrics: dict[str, Any] = attrs.field(factory=dict)
-
-
-def _send_result(dataset_query: DatasetQuery) -> None:
-    class JSONSerialize(json.JSONEncoder):
-        def default(self, obj):
-            if isinstance(obj, (datetime.datetime, datetime.date)):
-                return obj.isoformat()
-            if isinstance(obj, bytes):
-                return list(obj[:1024])
-            return super().default(obj)
-
-    try:
-        preview_args: dict[str, Any] = json.loads(
-            os.getenv("DATACHAIN_QUERY_PREVIEW_ARGS", "")
-        )
-    except ValueError:
-        preview_args = {}
-
-    columns = preview_args.get("columns") or []
-
-    if type(dataset_query) is DatasetQuery:
-        preview_query = dataset_query.select(*columns)
-    else:
-        preview_query = dataset_query.select(*columns, _sys=False)
-
-    preview_query = preview_query.limit(preview_args.get("limit", 10)).offset(
-        preview_args.get("offset", 0)
-    )
-
-    dataset: Optional[tuple[str, int]] = None
-    if dataset_query.attached:
-        assert dataset_query.name, "Dataset name should be provided"
-        assert dataset_query.version, "Dataset version should be provided"
-        dataset = dataset_query.name, dataset_query.version
-
-    preview = preview_query.to_db_records()
-    result = ExecutionResult(preview, dataset, metrics)
-    data = attrs.asdict(result)
-
-    with open(_get_output_fd_for_write(), mode="w") as f:
-        json.dump(data, f, cls=JSONSerialize)
-
-
-def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
+def query_wrapper(dataset_query: Any) -> Any:
     """
     Wrapper function that wraps the last statement of user query script.
     Last statement MUST be instance of DatasetQuery, otherwise script exits with
     error code 10
     """
     if not isinstance(dataset_query, DatasetQuery):
-        sys.exit(QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE)
+        return dataset_query
 
     catalog = dataset_query.catalog
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
-    save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")
 
     is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
         dataset_query.session.get_temp_prefix()
     )
 
-    if save_as:
-        if dataset_query.attached:
-            dataset_name = dataset_query.name
-            version = dataset_query.version
-            assert dataset_name, "Dataset name should be provided in attached mode"
-            assert version, "Dataset version should be provided in attached mode"
-
-            dataset = catalog.get_dataset(dataset_name)
-
-            try:
-                target_dataset = catalog.get_dataset(save_as)
-            except DatasetNotFoundError:
-                target_dataset = None
-
-            if target_dataset:
-                dataset = catalog.register_dataset(dataset, version, target_dataset)
-            else:
-                dataset = catalog.register_new_dataset(dataset, version, save_as)
-
-            dataset_query = DatasetQuery(
-                name=dataset.name,
-                version=dataset.latest_version,
-                catalog=catalog,
-            )
-        else:
-            dataset_query = dataset_query.save(save_as)
-    elif save and (is_session_temp_dataset or not dataset_query.attached):
+    if save and (is_session_temp_dataset or not dataset_query.attached):
         name = catalog.generate_query_dataset_name()
         dataset_query = dataset_query.save(name)
-
-    _send_result(dataset_query)
     return dataset_query
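The behavioral change in query_wrapper, reduced to a stand-alone sketch (DatasetQuery is faked as a plain class here): a last statement that is not a DatasetQuery is now returned as-is instead of terminating the script with exit code 10, and auto-saving only happens when the runner sets DATACHAIN_QUERY_SAVE.

import os

class FakeDatasetQuery:  # stand-in for datachain's DatasetQuery
    pass

def wrapper(last_statement):
    if not isinstance(last_statement, FakeDatasetQuery):
        return last_statement  # new behavior: pass through, no sys.exit(10)
    if os.getenv("DATACHAIN_QUERY_SAVE"):
        pass  # would save under a generated dataset name
    return last_statement

print(wrapper(42))  # 42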
datachain/query/metrics.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Optional, Union
 
 metrics: dict[str, Union[str, int, float, bool, None]] = {}
@@ -13,6 +14,13 @@ def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: P
         raise TypeError("Value must be a string, int, float or bool")
     metrics[key] = value
 
+    if job_id := os.getenv("DATACHAIN_JOB_ID"):
+        from datachain.data_storage.job import JobStatus
+        from datachain.query.session import Session
+
+        metastore = Session.get().catalog.metastore
+        metastore.set_job_status(job_id, JobStatus.RUNNING, metrics=metrics)
+
 
 def get(key: str) -> Optional[Union[str, int, float, bool]]:
     """Get a metric value."""
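How the metrics API is used from a query script; with this change, set() also persists the metrics to the job record in the metastore, but only when the runner has exported DATACHAIN_JOB_ID:

from datachain.query import metrics

metrics.set("rows_processed", 1000)   # kept in-process; inside a job, also
                                      # written via metastore.set_job_status
print(metrics.get("rows_processed"))  # 1000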
datachain/utils.py
CHANGED
@@ -448,3 +448,8 @@ def get_datachain_executable() -> list[str]:
     if datachain_exec_path := os.getenv("DATACHAIN_EXEC_PATH"):
         return [datachain_exec_path]
     return [sys.executable, "-m", "datachain"]
+
+
+def uses_glob(path: str) -> bool:
+    """Checks if some URI path has glob syntax in it"""
+    return glob.has_magic(os.path.basename(os.path.normpath(path)))
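What the new helper evaluates to: glob.has_magic looks for *, ? or [ characters, and uses_glob applies it to the last path component only, so a wildcard in a parent directory does not count:

import glob
import os

def uses_glob(path: str) -> bool:
    """Checks if some URI path has glob syntax in it"""
    return glob.has_magic(os.path.basename(os.path.normpath(path)))

print(uses_glob("s3://bucket/images/*.jpg"))  # True
print(uses_glob("s3://bucket/images/"))       # False
print(uses_glob("s3://bucket/*/images"))      # False: wildcard not in basename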
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.9
+Version: 0.3.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -42,6 +42,7 @@ Requires-Dist: datamodel-code-generator >=0.25
 Requires-Dist: Pillow <11,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
 Requires-Dist: psutil
+Requires-Dist: huggingface-hub
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
@@ -67,7 +68,7 @@ Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
 Provides-Extra: hf
 Requires-Dist: numba >=0.60.0 ; extra == 'hf'
-Requires-Dist: datasets[audio,vision] ; extra == 'hf'
+Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
 Provides-Extra: remote
 Requires-Dist: lz4 ; extra == 'remote'
 Requires-Dist: requests >=2.22.0 ; extra == 'remote'
@@ -95,6 +96,10 @@ Requires-Dist: transformers >=4.36.0 ; extra == 'torch'
 Provides-Extra: vector
 Requires-Dist: usearch ; extra == 'vector'
 
+.. image:: docs/assets/datachain_logotype.svg
+   :height: 48
+   :alt: DataChain logo
+
 |PyPI| |Python Version| |Codecov| |Tests|
 
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
@@ -110,7 +115,6 @@
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests
 
-AI 🔗 DataChain
 ----------------
 
 DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD
CHANGED
@@ -2,56 +2,57 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
 datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=ECf_z5X8ILDJdUn2Cpb_z-ZjSRIzn7skiuMGfM-y9i0,30999
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=EcYjhHg1dxxPbDwSuIxc-mDRDo3v_pYf79fMy4re1oA,14740
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
-datachain/job.py,sha256=
+datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
-datachain/node.py,sha256=
+datachain/node.py,sha256=dcm_7dVurFHpI0EHV2K6SjpJyh-gN4PVWAB-20quk04,6382
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=HKUdVqreBTzcCULAYRw1sC6z33OaomVD1WoMSoFcPHg,13148
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=NgS7_SlmpJdUSp1v8KdCuLTjFklmYvT_jOLdzTyyK5I,72313
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
-datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=LXSahE0Z6r4dXqpBkKnq3J5fg7N7ymC1lSn-1SoILGc,2687
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=LQb5tr-pP9umCFYo3nGJR_dZxUyiSN7IDE8jhp1TXco,13333
 datachain/client/gcs.py,sha256=P_E3mhzhXR9mJ_wc3AYZuczzwOJ0-D3J5qhJXeSU-xk,4518
+datachain/client/hf.py,sha256=R-F6Ks6aVM9wSNkIXOkOnZFwsJlfdRwJjymRa78RLjM,1246
 datachain/client/local.py,sha256=H8TNY8pi2kA8y9_f_1XLUjJF66f229qC_b2y4xGkzdU,5300
 datachain/client/s3.py,sha256=aQxfMH8G8bUjmHF1-6P90MSkXsU5DgOPEVlKWLu459I,6568
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
 datachain/data_storage/schema.py,sha256=JKpSEz8igpwZ9zkpRPYVXZxEpiXuLKEs2WNhH0KqM6U,8552
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=Z4B2KDL4C8Uio2aLMxaKv0t2MoOtCV3bSqWg4X9mTFg,28048
+datachain/data_storage/warehouse.py,sha256=f7ETnYIXx5KMcPfwg_4bh_00QJiMLIliwE_41vmRGUo,33037
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=dV17oGiknqEW55ogGK_9T0ycNFwd2z-EFOW0AQiR6TU,5840
 datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
 datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
-datachain/lib/dataset_info.py,sha256=
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
-datachain/lib/hf.py,sha256=
+datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
+datachain/lib/dc.py,sha256=s4E-bD6_T6JFJ7TEa5Y9RS705lIfcV9OUJwDD6RNCX0,68156
+datachain/lib/file.py,sha256=WOOYw3LcGROA6wshJ_aZkSgcTqfB4UxTbZDTx9KqAOg,11429
+datachain/lib/hf.py,sha256=ZiMvgy3DYiklGKZv-w7gevrHOgn3bGfpTlpDPOHCNqs,5336
 datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
-datachain/lib/listing.py,sha256=
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/listing.py,sha256=S9Xn_Saxu4xk3K_01VexkfMZW0INQiATlidt2bzgWKY,3938
+datachain/lib/listing_info.py,sha256=sr5KzCXlCxlPuRmy_pVadD4miLpp5y0btvyaIPcluwI,996
+datachain/lib/meta_formats.py,sha256=67uF9trQ2II6xFvN0u6eo5NNRf5xvCkpMHj7ThiG41Y,6777
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=hqQLwUmt3w8RLa96MtubK9N2CBXqqTPrUkSRXc0ktt4,20275
 datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
 datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -68,9 +69,9 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=B2EmGOL8gjrdU_WhU88Dj7FsxvxrNeKwe2STXnU9T9E,58369
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
-datachain/query/metrics.py,sha256=
+datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
@@ -95,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.9.dist-info/LICENSE,sha256=
-datachain-0.3.9.dist-info/METADATA,sha256=
-datachain-0.3.9.dist-info/WHEEL,sha256=
-datachain-0.3.9.dist-info/entry_points.txt,sha256=
-datachain-0.3.9.dist-info/top_level.txt,sha256=
-datachain-0.3.9.dist-info/RECORD,,
+datachain-0.3.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.11.dist-info/METADATA,sha256=iSdfjWpVT1Iqzlg82eN5QzJ-icaYxkG7TUKEpEOi5sk,17124
+datachain-0.3.11.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+datachain-0.3.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.11.dist-info/RECORD,,
datachain/catalog/subclass.py
DELETED
@@ -1,60 +0,0 @@
-import ast
-
-
-class SubclassFinder(ast.NodeVisitor):
-    """Finds subclasses of a target class in an AST."""
-
-    def __init__(self, target_classes: list[str]):
-        self.imports: list[ast.AST] = []
-        self.main_body: list[ast.AST] = []
-
-        self.target_classes: list[str] = target_classes
-        self.aliases: dict[str, str] = {}
-        self.feature_class: list[ast.AST] = []
-
-    def visit_ImportFrom(self, node):  # noqa: N802
-        module = node.module
-        for alias in node.names:
-            full_name = f"{module}.{alias.name}"
-            self.aliases[alias.asname or alias.name] = full_name
-        self.imports.append(node)
-
-    def visit_Import(self, node):  # noqa: N802
-        for alias in node.names:
-            self.aliases[alias.asname or alias.name] = alias.name
-        self.imports.append(node)
-
-    def visit_ClassDef(self, node):  # noqa: N802
-        base_names = [self.get_base_name(base) for base in node.bases]
-        if any(self.is_subclass(name) for name in base_names):
-            self.feature_class.append(node)
-        else:
-            self.main_body.append(node)
-
-    def visit(self, node):
-        if isinstance(
-            node,
-            (ast.Import, ast.ImportFrom, ast.ClassDef, ast.Module),
-        ):
-            return super().visit(node)
-        self.main_body.append(node)
-        return node
-
-    def get_base_name(self, node):
-        if isinstance(node, ast.Name):
-            return self.aliases.get(node.id, node.id)
-        if isinstance(node, ast.Attribute):
-            return self.get_full_attr_name(node)
-        if isinstance(node, ast.Subscript):
-            return self.get_base_name(node.value)
-        return None
-
-    def get_full_attr_name(self, node):
-        if isinstance(node.value, ast.Name):
-            return f"{node.value.id}.{node.attr}"
-        if isinstance(node.value, ast.Attribute):
-            return f"{self.get_full_attr_name(node.value)}.{node.attr}"
-        return node.attr
-
-    def is_subclass(self, base_name):
-        return base_name and base_name.split(".")[-1] in self.target_classes
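For reference, the core trick the deleted helper relied on: walk a script's AST with ast.NodeVisitor and collect class definitions whose base matches a target name. A minimal stand-alone version:

import ast

source = """
from pydantic import BaseModel

class Embedding(BaseModel):
    value: list[float]

x = 1
"""

class Finder(ast.NodeVisitor):
    def __init__(self, targets):
        self.targets = targets
        self.found = []

    def visit_ClassDef(self, node):  # noqa: N802
        # record classes that directly subclass one of the target names
        for base in node.bases:
            if isinstance(base, ast.Name) and base.id in self.targets:
                self.found.append(node.name)

finder = Finder({"BaseModel"})
finder.visit(ast.parse(source))
print(finder.found)  # ['Embedding']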
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE
File without changes
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt
File without changes
{datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt
File without changes