datachain 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/file.py CHANGED
@@ -272,8 +272,12 @@ class File(DataModel):
272
272
  def save(self, destination: str):
273
273
  """Writes it's content to destination"""
274
274
  destination = stringify_path(destination)
275
- client: Client = self._catalog.get_client(str(destination))
276
- client.upload(self.read(), str(destination))
275
+ client: Client = self._catalog.get_client(destination)
276
+
277
+ if client.PREFIX == "file://" and not destination.startswith(client.PREFIX):
278
+ destination = Path(destination).absolute().as_uri()
279
+
280
+ client.upload(self.read(), destination)
277
281
 
278
282
  def _symlink_to(self, destination: str):
279
283
  if self.location:
@@ -0,0 +1,147 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional
4
+
5
+ try:
6
+ import tomllib
7
+ except ModuleNotFoundError:
8
+ # tomllib is in standard library from python 3.11 so for earlier versions
9
+ # we need tomli
10
+ import tomli as tomllib # type: ignore[no-redef]
11
+
12
+
13
class ScriptConfigParsingError(Exception):
    """Error raised when a script's inline metadata cannot be parsed.

    Args:
        message: Human-readable description of what went wrong; it becomes
            the exception's only positional argument.
    """

    def __init__(self, message):
        # Explicit base-class call; keeps `message` usable as a keyword
        # argument, which plain `Exception` would reject.
        Exception.__init__(self, message)
16
+
17
+
18
+ @dataclass
19
+ class ScriptConfig:
20
+ """
21
+ Class that is parsing inline script metadata to get some basic information for
22
+ running datachain script like python version, dependencies, attachments etc.
23
+ Inline script metadata must follow the format described in https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.
24
+ Example of script with inline metadata:
25
+ # /// script
26
+ # requires-python = ">=3.12"
27
+ #
28
+ # dependencies = [
29
+ # "pandas < 2.1.0",
30
+ # "numpy == 1.26.4"
31
+ # ]
32
+ #
33
+ # [tools.datachain.workers]
34
+ # num_workers = 3
35
+ #
36
+ # [tools.datachain.attachments]
37
+ # image1 = "s3://ldb-public/image1.jpg"
38
+ # file1 = "s3://ldb-public/file.pdf"
39
+ #
40
+ # [tools.datachain.params]
41
+ # min_length_sec = 1
42
+ # cache = false
43
+ #
44
+ # [tools.datachain.inputs]
45
+ # threshold = 0.5
46
+ # start_ds_name = "ds://start"
47
+ #
48
+ # [tools.datachain.outputs]
49
+ # result_dataset = "ds://res"
50
+ # result_dir = "/temp"
51
+ #
52
+ # ///
53
+
54
+ import sys
55
+ import pandas as pd
56
+
57
+ print(f"Python version: {sys.version_info}")
58
+ print(f"Pandas version: {pd.__version__}")
59
+
60
+ """
61
+
62
+ python_version: Optional[str]
63
+ dependencies: list[str]
64
+ attachments: dict[str, str]
65
+ params: dict[str, Any]
66
+ inputs: dict[str, Any]
67
+ outputs: dict[str, Any]
68
+ num_workers: Optional[int] = None
69
+
70
+ def __init__(
71
+ self,
72
+ python_version: Optional[str] = None,
73
+ dependencies: Optional[list[str]] = None,
74
+ attachments: Optional[dict[str, str]] = None,
75
+ params: Optional[dict[str, Any]] = None,
76
+ inputs: Optional[dict[str, Any]] = None,
77
+ outputs: Optional[dict[str, Any]] = None,
78
+ num_workers: Optional[int] = None,
79
+ ):
80
+ self.python_version = python_version
81
+ self.dependencies = dependencies or []
82
+ self.attachments = attachments or {}
83
+ self.params = params or {}
84
+ self.inputs = inputs or {}
85
+ self.outputs = outputs or {}
86
+ self.num_workers = num_workers
87
+
88
+ def get_param(self, name: str, default: Any) -> Any:
89
+ return self.params.get(name, default)
90
+
91
+ def get_input(self, name: str, default: Any) -> Any:
92
+ return self.inputs.get(name, default)
93
+
94
+ def get_output(self, name: str, default: Any) -> Any:
95
+ return self.outputs.get(name, default)
96
+
97
+ def get_attachment(self, name: str, default: Any) -> Any:
98
+ return self.attachments.get(name, default)
99
+
100
+ @staticmethod
101
+ def read(script: str) -> Optional[dict]:
102
+ """Converts inline script metadata to dict with all found data"""
103
+ regex = (
104
+ r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
105
+ "(?P<content>(?:^#(?:| .*)$[\r\n|\r|\n])+)^# \\/\\/\\/[ \t]*$"
106
+ )
107
+ name = "script"
108
+ matches = list(
109
+ filter(lambda m: m.group("type") == name, re.finditer(regex, script))
110
+ )
111
+ if len(matches) > 1:
112
+ raise ValueError(f"Multiple {name} blocks found")
113
+ if len(matches) == 1:
114
+ content = "".join(
115
+ line[2:] if line.startswith("# ") else line[1:]
116
+ for line in matches[0].group("content").splitlines(keepends=True)
117
+ )
118
+ return tomllib.loads(content)
119
+ return None
120
+
121
+ @staticmethod
122
+ def parse(script: str) -> Optional["ScriptConfig"]:
123
+ """
124
+ Method that is parsing inline script metadata from datachain script and
125
+ instantiating ScriptConfig class with found data. If no inline metadata is
126
+ found, it returns None
127
+ """
128
+ try:
129
+ meta = ScriptConfig.read(script)
130
+ if not meta:
131
+ return None
132
+ custom = meta.get("tools", {}).get("datachain", {})
133
+ return ScriptConfig(
134
+ python_version=meta.get("requires-python"),
135
+ dependencies=meta.get("dependencies"),
136
+ num_workers=custom.get("workers", {}).get("num_workers"),
137
+ attachments=custom.get("attachments"),
138
+ params={k: str(v) for k, v in custom.get("params").items()}
139
+ if custom.get("params")
140
+ else None,
141
+ inputs=custom.get("inputs"),
142
+ outputs=custom.get("outputs"),
143
+ )
144
+ except Exception as e:
145
+ raise ScriptConfigParsingError(
146
+ f"Error when parsing script meta: {e}"
147
+ ) from e
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.10.0
3
+ Version: 0.11.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -49,6 +49,7 @@ Requires-Dist: platformdirs
49
49
  Requires-Dist: dvc-studio-client<1,>=0.21
50
50
  Requires-Dist: tabulate
51
51
  Requires-Dist: websockets
52
+ Requires-Dist: tomli; python_version < "3.11"
52
53
  Provides-Extra: docs
53
54
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
54
55
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -102,7 +103,7 @@ Requires-Dist: datachain[tests]; extra == "examples"
102
103
  Requires-Dist: defusedxml; extra == "examples"
103
104
  Requires-Dist: accelerate; extra == "examples"
104
105
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
105
- Requires-Dist: ultralytics==8.3.74; extra == "examples"
106
+ Requires-Dist: ultralytics==8.3.78; extra == "examples"
106
107
  Requires-Dist: open_clip_torch; extra == "examples"
107
108
 
108
109
  ================
@@ -12,6 +12,7 @@ datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,10
12
12
  datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
13
13
  datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
14
14
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
15
16
  datachain/studio.py,sha256=Coo_6murSjh-RypiHDWNsVXGmfsopyMPCpPS1sA6uUc,9844
16
17
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
17
18
  datachain/utils.py,sha256=n8fcyOM8P_2CEFK4h8BZxCAwCkOpt8NAeJK5tm1gIOg,14433
@@ -69,7 +70,7 @@ datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
69
70
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
70
71
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
71
72
  datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
72
- datachain/lib/file.py,sha256=8OblP_hYJLh0z7MWGo3AiyO48eEJ13tzgla1UQf9A8I,27517
73
+ datachain/lib/file.py,sha256=Bbnb7JBiAFRD1RsZwPdvoiWFKHkl7V3haDLh672xTZg,27658
73
74
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
74
75
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
75
76
  datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
@@ -135,9 +136,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
135
136
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
136
137
  datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
137
138
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
138
- datachain-0.10.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
139
- datachain-0.10.0.dist-info/METADATA,sha256=4Eoe6lnoy_HBYtdzrAIjNnagKXagattQ_mluP9WC-ek,11195
140
- datachain-0.10.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
141
- datachain-0.10.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
142
- datachain-0.10.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
143
- datachain-0.10.0.dist-info/RECORD,,
139
+ datachain-0.11.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
140
+ datachain-0.11.0.dist-info/METADATA,sha256=ijLSRDc7IAZe6YxdX0ZRRNY2LOUlsFFib660U_upu20,11241
141
+ datachain-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
142
+ datachain-0.11.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
143
+ datachain-0.11.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
144
+ datachain-0.11.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5