datachain 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/file.py +6 -2
- datachain/script_meta.py +147 -0
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/METADATA +3 -2
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/RECORD +8 -7
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/WHEEL +1 -1
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/LICENSE +0 -0
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.10.0.dist-info → datachain-0.11.0.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
|
@@ -272,8 +272,12 @@ class File(DataModel):
|
|
|
272
272
|
def save(self, destination: str):
|
|
273
273
|
"""Writes it's content to destination"""
|
|
274
274
|
destination = stringify_path(destination)
|
|
275
|
-
client: Client = self._catalog.get_client(
|
|
276
|
-
|
|
275
|
+
client: Client = self._catalog.get_client(destination)
|
|
276
|
+
|
|
277
|
+
if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
|
|
278
|
+
destination = Path(destination).absolute().as_uri()
|
|
279
|
+
|
|
280
|
+
client.upload(self.read(), destination)
|
|
277
281
|
|
|
278
282
|
def _symlink_to(self, destination: str):
|
|
279
283
|
if self.location:
|
datachain/script_meta.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import tomllib
|
|
7
|
+
except ModuleNotFoundError:
|
|
8
|
+
# tomllib is in standard library from python 3.11 so for earlier versions
|
|
9
|
+
# we need tomli
|
|
10
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ScriptConfigParsingError(Exception):
    """Error raised when the inline metadata of a script cannot be parsed."""

    def __init__(self, message):
        # The message is handed to the base class untouched, so
        # ``str(error)`` returns exactly what the raiser supplied.
        Exception.__init__(self, message)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
|
|
19
|
+
class ScriptConfig:
|
|
20
|
+
"""
|
|
21
|
+
Class that is parsing inline script metadata to get some basic information for
|
|
22
|
+
running datachain script like python version, dependencies, attachments etc.
|
|
23
|
+
Inline script metadata must follow the format described in https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.
|
|
24
|
+
Example of script with inline metadata:
|
|
25
|
+
# /// script
|
|
26
|
+
# requires-python = ">=3.12"
|
|
27
|
+
#
|
|
28
|
+
# dependencies = [
|
|
29
|
+
# "pandas < 2.1.0",
|
|
30
|
+
# "numpy == 1.26.4"
|
|
31
|
+
# ]
|
|
32
|
+
#
|
|
33
|
+
# [tools.datachain.workers]
|
|
34
|
+
# num_workers = 3
|
|
35
|
+
#
|
|
36
|
+
# [tools.datachain.attachments]
|
|
37
|
+
# image1 = "s3://ldb-public/image1.jpg"
|
|
38
|
+
# file1 = "s3://ldb-public/file.pdf"
|
|
39
|
+
#
|
|
40
|
+
# [tools.datachain.params]
|
|
41
|
+
# min_length_sec = 1
|
|
42
|
+
# cache = false
|
|
43
|
+
#
|
|
44
|
+
# [tools.datachain.inputs]
|
|
45
|
+
# threshold = 0.5
|
|
46
|
+
# start_ds_name = "ds://start"
|
|
47
|
+
#
|
|
48
|
+
# [tools.datachain.outputs]
|
|
49
|
+
# result_dataset = "ds://res"
|
|
50
|
+
# result_dir = "/temp"
|
|
51
|
+
#
|
|
52
|
+
# ///
|
|
53
|
+
|
|
54
|
+
import sys
|
|
55
|
+
import pandas as pd
|
|
56
|
+
|
|
57
|
+
print(f"Python version: {sys.version_info}")
|
|
58
|
+
print(f"Pandas version: {pd.__version__}")
|
|
59
|
+
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
python_version: Optional[str]
|
|
63
|
+
dependencies: list[str]
|
|
64
|
+
attachments: dict[str, str]
|
|
65
|
+
params: dict[str, Any]
|
|
66
|
+
inputs: dict[str, Any]
|
|
67
|
+
outputs: dict[str, Any]
|
|
68
|
+
num_workers: Optional[int] = None
|
|
69
|
+
|
|
70
|
+
def __init__(
|
|
71
|
+
self,
|
|
72
|
+
python_version: Optional[str] = None,
|
|
73
|
+
dependencies: Optional[list[str]] = None,
|
|
74
|
+
attachments: Optional[dict[str, str]] = None,
|
|
75
|
+
params: Optional[dict[str, Any]] = None,
|
|
76
|
+
inputs: Optional[dict[str, Any]] = None,
|
|
77
|
+
outputs: Optional[dict[str, Any]] = None,
|
|
78
|
+
num_workers: Optional[int] = None,
|
|
79
|
+
):
|
|
80
|
+
self.python_version = python_version
|
|
81
|
+
self.dependencies = dependencies or []
|
|
82
|
+
self.attachments = attachments or {}
|
|
83
|
+
self.params = params or {}
|
|
84
|
+
self.inputs = inputs or {}
|
|
85
|
+
self.outputs = outputs or {}
|
|
86
|
+
self.num_workers = num_workers
|
|
87
|
+
|
|
88
|
+
def get_param(self, name: str, default: Any) -> Any:
|
|
89
|
+
return self.params.get(name, default)
|
|
90
|
+
|
|
91
|
+
def get_input(self, name: str, default: Any) -> Any:
|
|
92
|
+
return self.inputs.get(name, default)
|
|
93
|
+
|
|
94
|
+
def get_output(self, name: str, default: Any) -> Any:
|
|
95
|
+
return self.outputs.get(name, default)
|
|
96
|
+
|
|
97
|
+
def get_attachment(self, name: str, default: Any) -> Any:
|
|
98
|
+
return self.attachments.get(name, default)
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def read(script: str) -> Optional[dict]:
|
|
102
|
+
"""Converts inline script metadata to dict with all found data"""
|
|
103
|
+
regex = (
|
|
104
|
+
r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
|
|
105
|
+
"(?P<content>(?:^#(?:| .*)$[\r\n|\r|\n])+)^# \\/\\/\\/[ \t]*$"
|
|
106
|
+
)
|
|
107
|
+
name = "script"
|
|
108
|
+
matches = list(
|
|
109
|
+
filter(lambda m: m.group("type") == name, re.finditer(regex, script))
|
|
110
|
+
)
|
|
111
|
+
if len(matches) > 1:
|
|
112
|
+
raise ValueError(f"Multiple {name} blocks found")
|
|
113
|
+
if len(matches) == 1:
|
|
114
|
+
content = "".join(
|
|
115
|
+
line[2:] if line.startswith("# ") else line[1:]
|
|
116
|
+
for line in matches[0].group("content").splitlines(keepends=True)
|
|
117
|
+
)
|
|
118
|
+
return tomllib.loads(content)
|
|
119
|
+
return None
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def parse(script: str) -> Optional["ScriptConfig"]:
|
|
123
|
+
"""
|
|
124
|
+
Method that is parsing inline script metadata from datachain script and
|
|
125
|
+
instantiating ScriptConfig class with found data. If no inline metadata is
|
|
126
|
+
found, it returns None
|
|
127
|
+
"""
|
|
128
|
+
try:
|
|
129
|
+
meta = ScriptConfig.read(script)
|
|
130
|
+
if not meta:
|
|
131
|
+
return None
|
|
132
|
+
custom = meta.get("tools", {}).get("datachain", {})
|
|
133
|
+
return ScriptConfig(
|
|
134
|
+
python_version=meta.get("requires-python"),
|
|
135
|
+
dependencies=meta.get("dependencies"),
|
|
136
|
+
num_workers=custom.get("workers", {}).get("num_workers"),
|
|
137
|
+
attachments=custom.get("attachments"),
|
|
138
|
+
params={k: str(v) for k, v in custom.get("params").items()}
|
|
139
|
+
if custom.get("params")
|
|
140
|
+
else None,
|
|
141
|
+
inputs=custom.get("inputs"),
|
|
142
|
+
outputs=custom.get("outputs"),
|
|
143
|
+
)
|
|
144
|
+
except Exception as e:
|
|
145
|
+
raise ScriptConfigParsingError(
|
|
146
|
+
f"Error when parsing script meta: {e}"
|
|
147
|
+
) from e
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.10.0
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
|
|
|
49
49
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
50
50
|
Requires-Dist: tabulate
|
|
51
51
|
Requires-Dist: websockets
|
|
52
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
52
53
|
Provides-Extra: docs
|
|
53
54
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
54
55
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
102
103
|
Requires-Dist: defusedxml; extra == "examples"
|
|
103
104
|
Requires-Dist: accelerate; extra == "examples"
|
|
104
105
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
105
|
-
Requires-Dist: ultralytics==8.3.
|
|
106
|
+
Requires-Dist: ultralytics==8.3.78; extra == "examples"
|
|
106
107
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
107
108
|
|
|
108
109
|
================
|
|
@@ -12,6 +12,7 @@ datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,10
|
|
|
12
12
|
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
|
|
13
13
|
datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
|
|
14
14
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
|
|
15
16
|
datachain/studio.py,sha256=Coo_6murSjh-RypiHDWNsVXGmfsopyMPCpPS1sA6uUc,9844
|
|
16
17
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
17
18
|
datachain/utils.py,sha256=n8fcyOM8P_2CEFK4h8BZxCAwCkOpt8NAeJK5tm1gIOg,14433
|
|
@@ -69,7 +70,7 @@ datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
|
69
70
|
datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
|
|
70
71
|
datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
|
|
71
72
|
datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
|
|
72
|
-
datachain/lib/file.py,sha256=
|
|
73
|
+
datachain/lib/file.py,sha256=Bbnb7JBiAFRD1RsZwPdvoiWFKHkl7V3haDLh672xTZg,27658
|
|
73
74
|
datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
|
|
74
75
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
75
76
|
datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
|
|
@@ -135,9 +136,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
135
136
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
136
137
|
datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
|
|
137
138
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
138
|
-
datachain-0.
|
|
139
|
-
datachain-0.
|
|
140
|
-
datachain-0.
|
|
141
|
-
datachain-0.
|
|
142
|
-
datachain-0.
|
|
143
|
-
datachain-0.
|
|
139
|
+
datachain-0.11.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
140
|
+
datachain-0.11.0.dist-info/METADATA,sha256=ijLSRDc7IAZe6YxdX0ZRRNY2LOUlsFFib660U_upu20,11241
|
|
141
|
+
datachain-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
142
|
+
datachain-0.11.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
143
|
+
datachain-0.11.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
144
|
+
datachain-0.11.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|