datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/_version.py +2 -2
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +2 -1
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +2 -3
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/dc.py +377 -178
- datachain/lib/feature.py +41 -90
- datachain/lib/feature_registry.py +3 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +20 -20
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +3 -12
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +39 -110
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/utils.py +1 -122
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/feature.py
CHANGED

@@ -4,6 +4,7 @@ import re
 import warnings
 from collections.abc import Iterable, Sequence
 from datetime import datetime
+from functools import lru_cache
 from types import GenericAlias
 from typing import (
     Any,
@@ -22,7 +23,7 @@ from typing_extensions import Literal as LiteralEx

 from datachain.lib.feature_registry import Registry
 from datachain.query import C
-from datachain.query.
+from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.types import (
     JSON,
     Array,
@@ -62,6 +63,7 @@ TYPE_TO_DATACHAIN = {
     bool: Boolean,
     datetime: DateTime,  # Note, list of datetime is not supported yet
     bytes: Binary,  # Note, list of bytes is not supported yet
+    list: Array,
     dict: JSON,
 }

@@ -108,8 +110,6 @@ warnings.filterwarnings(
 # skipped within loops.
 feature_classes_lookup: dict[type, bool] = {}

-DEFAULT_DELIMITER = "__"
-

 class Feature(BaseModel):
     """A base class for defining data classes that serve as inputs and outputs for
@@ -117,9 +117,6 @@ class Feature(BaseModel):
     `pydantic`'s BaseModel.
     """

-    _is_shallow: ClassVar[bool] = False
-    _expand_class_name: ClassVar[bool] = False
-    _delimiter: ClassVar[str] = DEFAULT_DELIMITER
     _is_file: ClassVar[bool] = False
     _version: ClassVar[int] = 1

@@ -135,20 +132,6 @@ class Feature(BaseModel):
     def _name(cls) -> str:
         return f"{cls.__name__}@{cls._version}"

-    def _get_value_with_check(self, *args: Any, **kwargs: Any) -> Any:
-        signature = inspect.signature(self.get_value)
-        for i, (name, prm) in enumerate(signature.parameters.items()):
-            if prm.default == inspect.Parameter.empty:
-                if i < len(args):
-                    continue
-                if name not in kwargs:
-                    raise ValueError(
-                        f"unable to get value for class {self.__class__.__name__}"
-                        f" due to a missing parameter {name} in get_value()"
-                    )
-
-        return self.get_value(*args, **kwargs)
-
     @classmethod
     def __pydantic_init_subclass__(cls):
         Registry.add(cls)
@@ -162,9 +145,10 @@ class Feature(BaseModel):

     @classmethod
     def _normalize(cls, name: str) -> str:
-        if
+        if DEFAULT_DELIMITER in name:
             raise RuntimeError(
-                f"variable '{name}' cannot be used
+                f"variable '{name}' cannot be used "
+                f"because it contains {DEFAULT_DELIMITER}"
             )
         return Feature._to_snake_case(name)

@@ -187,35 +171,6 @@ class Feature(BaseModel):
             if Feature.is_feature(anno):
                 yield from anno.get_file_signals([*path, name])  # type: ignore[union-attr]

-    @classmethod
-    def _flatten_full_schema(cls, fields, name_path):
-        for name, f_info in fields.items():
-            anno = f_info.annotation
-            name = cls._normalize(name)
-
-            orig = get_origin(anno)
-            if orig == list:
-                anno = get_args(anno)
-                if isinstance(anno, tuple):
-                    anno = anno[0]
-                is_list = True
-            else:
-                is_list = False
-
-            if Feature.is_feature(anno):
-                lst = copy.copy(name_path)
-                lst = [] if anno._is_shallow else [*lst, name]
-
-                if is_list:
-                    yield anno._delimiter.join(lst), Array(JSON)
-                else:
-                    yield from cls._flatten_full_schema(anno.model_fields, lst)
-            else:
-                typ = convert_type_to_datachain(anno)
-                if is_list:
-                    typ = Array(typ)
-                yield cls._delimiter.join([*name_path, name]), typ
-
     @classmethod
     def is_feature(cls, anno) -> bool:
         if anno in feature_classes_lookup:
@@ -242,22 +197,10 @@ class Feature(BaseModel):
     def is_feature_type(cls, t: type) -> bool:
         if cls.is_standard_type(t):
             return True
-        if get_origin(t)
+        if get_origin(t) is list and len(get_args(t)) == 1:
             return cls.is_feature_type(get_args(t)[0])
         return cls.is_feature(t)

-    @classmethod
-    def _to_udf_spec(cls):
-        return list(cls._flatten_full_schema(cls.model_fields, []))
-
-    @staticmethod
-    def _features_to_udf_spec(fr_classes: Sequence[type["Feature"]]) -> UDFOutputSpec:
-        return dict(
-            item
-            for b in fr_classes
-            for item in b._to_udf_spec()  # type: ignore[attr-defined]
-        )
-
     def _flatten_fields_values(self, fields, model):
         for name, f_info in fields.items():
             anno = f_info.annotation
@@ -280,16 +223,15 @@
                 yield value

     def _flatten(self):
-        return tuple(self.
-
-    def _flatten_generator(self):
-        # Optimization: Use a generator instead of a tuple if all values are going to
-        # be used immediately in another comprehension or function call.
-        return self._flatten_fields_values(self.model_fields, self)
+        return tuple(self._flatten_fields_values(self.model_fields, self))

     @staticmethod
     def _flatten_list(objs):
-        return tuple(
+        return tuple(
+            val
+            for obj in objs
+            for val in obj._flatten_fields_values(obj.model_fields, obj)
+        )

     @classmethod
     def _unflatten_with_path(cls, dump, name_path: list[str]):
@@ -300,14 +242,12 @@
             lst = copy.copy(name_path)

             if inspect.isclass(anno) and issubclass(anno, Feature):
-
-                lst.append(name_norm)
-
+                lst.append(name_norm)
                 val = anno._unflatten_with_path(dump, lst)
                 res[name] = val
             else:
                 lst.append(name_norm)
-                curr_path =
+                curr_path = DEFAULT_DELIMITER.join(lst)
                 res[name] = dump[curr_path]
         return cls(**res)

@@ -336,6 +276,18 @@ class Feature(BaseModel):
             pos += 1
         return res, pos

+    @classmethod
+    @lru_cache(maxsize=1000)
+    def build_tree(cls):
+        res = {}
+
+        for name, f_info in cls.model_fields.items():
+            anno = f_info.annotation
+            subtree = anno.build_tree() if Feature.is_feature(anno) else None
+            res[name] = (anno, subtree)
+
+        return res
+

 class RestrictedAttribute:
     """Descriptor implementing an attribute that can only be accessed through
@@ -374,7 +326,7 @@ class FeatureAttributeWrapper:

     @property
     def name(self) -> str:
-        return
+        return DEFAULT_DELIMITER.join(self.prefix)

     def __getattr__(self, name):
         field_info = self.cls.model_fields.get(name)
@@ -401,22 +353,16 @@ def _resolve(cls, name, field_info, prefix: list[str]):
         except TypeError:
             anno_sql_class = NullType
         new_prefix = copy.copy(prefix)
-
-
-        return C(cls._delimiter.join(new_prefix), anno_sql_class)
+        new_prefix.append(norm_name)
+        return C(DEFAULT_DELIMITER.join(new_prefix), anno_sql_class)

-
-        return FeatureAttributeWrapper(anno, [*prefix, norm_name])
-
-    new_prefix_value = copy.copy(prefix)
-    if not cls._is_shallow:
-        new_prefix_value.append(norm_name)
-    return FeatureAttributeWrapper(anno, new_prefix_value)
+    return FeatureAttributeWrapper(anno, [*prefix, norm_name])


 def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(typ) and issubclass(typ, SQLType):
         return typ
+
     res = TYPE_TO_DATACHAIN.get(typ)
     if res:
         return res
@@ -430,7 +376,12 @@ def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
         if args is None or len(args) != 1:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
-
+
+        args0 = args[0]
+        if Feature.is_feature(args0):
+            return Array(JSON())
+
+        next_type = convert_type_to_datachain(args0)
         return Array(next_type)

     if inspect.isclass(orig) and issubclass(dict, orig):
@@ -443,10 +394,10 @@
     if orig == Union and len(args) >= 2:
         args_no_nones = [arg for arg in args if arg != type(None)]
         if len(args_no_nones) == 2:
-            args_no_dicts = [arg for arg in args_no_nones if arg
-            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0])
+            args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
+            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
                 arg = get_args(args_no_dicts[0])
-                if len(arg) == 1 and arg[0]
+                if len(arg) == 1 and arg[0] is dict:
                     return JSON

     raise TypeError(f"Cannot recognize type {typ}")
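The feature.py changes above fold the old `_is_shallow`/`_delimiter` class switches into a single `DEFAULT_DELIMITER` imported from `datachain.query.schema`, drop the `_flatten_full_schema`/`_to_udf_spec` helpers, and add a cached `build_tree()` that maps every model field to an `(annotation, subtree)` pair. A minimal standalone sketch of that tree-building pattern on plain pydantic models (the `Address`/`Person` classes are hypothetical examples, not datachain code):

from functools import lru_cache

from pydantic import BaseModel


class Address(BaseModel):  # hypothetical example model
    city: str
    zip_code: str


class Person(BaseModel):  # hypothetical example model
    name: str
    address: Address


@lru_cache(maxsize=1000)
def build_tree(cls: type) -> dict:
    # Same shape as the new Feature.build_tree(): each field maps to
    # (annotation, subtree), where subtree is None for plain types.
    res = {}
    for name, f_info in cls.model_fields.items():
        anno = f_info.annotation
        is_nested = isinstance(anno, type) and issubclass(anno, BaseModel)
        res[name] = (anno, build_tree(anno) if is_nested else None)
    return res


tree = build_tree(Person)
print(tree["address"][1])  # {'city': (<class 'str'>, None), 'zip_code': (<class 'str'>, None)}

Because the result is cached per class with `lru_cache`, repeated schema lookups for the same Feature class do not re-walk the field annotations.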
datachain/lib/feature_registry.py
CHANGED

@@ -1,5 +1,7 @@
 from typing import Any, ClassVar, Optional

+from datachain.cli import logger
+

 class Registry:
     reg: ClassVar[dict[str, dict[int, Any]]] = {}
@@ -14,7 +16,7 @@ class Registry:
         version = fr._version  # type: ignore[attr-defined]
         if version in cls.reg[name]:
             full_name = f"{name}@{version}"
-
+            logger.warning(f"Feature {full_name} is already registered")
         cls.reg[name][version] = fr

     @classmethod
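The registry change above only fills in the duplicate-registration branch: instead of silently overwriting an existing entry, it logs a warning (note the logger comes from datachain.cli). A rough sketch of the same pattern with a plain logging logger standing in for datachain's:

import logging

logger = logging.getLogger("datachain")

reg: dict[str, dict[int, type]] = {}


def add(name: str, version: int, fr: type) -> None:
    # Warn instead of silently replacing an already-registered version.
    versions = reg.setdefault(name, {})
    if version in versions:
        logger.warning(f"Feature {name}@{version} is already registered")
    versions[version] = fr


add("MyFeature", 1, object)
add("MyFeature", 1, object)  # second call triggers the warning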
datachain/lib/feature_utils.py
CHANGED

@@ -40,7 +40,7 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
         anno = field_info.annotation
         if anno not in TYPE_TO_DATACHAIN:
             orig = get_origin(anno)
-            if orig
+            if orig is list:
                 anno = get_args(anno)  # type: ignore[assignment]
                 if isinstance(anno, Sequence):
                     anno = anno[0]  # type: ignore[unreachable]
@@ -122,7 +122,7 @@ def features_to_tuples(
     if isinstance(output, dict):
         raise FeatureToTupleError(
             ds_name,
-
+            "output type must be dict[str, FeatureType] while "
             f"'{type(output).__name__}' is given",
         )
     else:
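The `orig == list` comparison above becomes an identity check against the builtin. For context, this is how `typing.get_origin`/`get_args` behave on a `list[...]` annotation (a minimal illustration, not datachain code):

from typing import get_args, get_origin

anno = list[int]
assert get_origin(anno) is list  # the origin of list[int] is the builtin list
assert get_args(anno) == (int,)  # the single item type is recovered via get_args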
datachain/lib/file.py
CHANGED

@@ -1,30 +1,22 @@
 import json
 from abc import ABC, abstractmethod
 from datetime import datetime
-from io import BytesIO
 from pathlib import Path
 from typing import Any, ClassVar, Literal, Optional, Union
+from urllib.parse import unquote, urlparse
+from urllib.request import url2pathname

 from fsspec import Callback
+from fsspec.implementations.local import LocalFileSystem
 from pydantic import Field, field_validator

-from datachain.lib.feature import Feature
-from datachain.utils import TIME_ZERO
-
-try:
-    from PIL import Image
-except ImportError as exc:
-    raise ImportError(
-        "Missing dependencies for computer vision:\n"
-        "To install run:\n\n"
-        "  pip install 'datachain[cv]'\n"
-    ) from exc
-
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.cached_stream import PreCachedStream, PreDownloadStream
+from datachain.lib.feature import Feature
 from datachain.lib.utils import DataChainError
 from datachain.sql.types import JSON, Int, String
+from datachain.utils import TIME_ZERO


 class FileFeature(Feature):
@@ -49,7 +41,7 @@ class VFileError(DataChainError):

 class FileError(DataChainError):
     def __init__(self, file: "File", message: str):
-        super().__init__(f"Error in file {file.
+        super().__init__(f"Error in file {file.get_uri()}: {message}")


 class VFile(ABC):
@@ -237,7 +229,7 @@ class File(FileFeature):
     def get_full_name(self):
         return (Path(self.parent) / self.name).as_posix()

-    def
+    def get_uri(self):
         return f"{self.source}/{self.get_full_name()}"

     def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
@@ -245,14 +237,20 @@
         uid = self.get_uid()
         return client.open_object(uid, use_cache=cache, cb=cb)

+    def get_path(self) -> str:
+        path = unquote(self.get_uri())
+        fs = self.get_fs()
+        if isinstance(fs, LocalFileSystem):
+            # Drop file:// protocol
+            path = urlparse(path).path
+            path = url2pathname(path)
+        return path

-
+    def get_fs(self):
+        return self._catalog.get_client(self.source).fs


-
-    def get_value(self):
-        value = super().get_value()
-        return Image.open(BytesIO(value))
+BinaryFile = File


 class TextFile(File):
@@ -272,6 +270,8 @@ def get_file(type: Literal["binary", "text", "image"] = "binary"):
     if type == "text":
         file = TextFile
     elif type == "image":
+        from datachain.lib.image import ImageFile
+
         file = ImageFile  # type: ignore[assignment]

     def get_file_type(
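File gains `get_uri()`, `get_fs()` and `get_path()` above, while the image-specific code moves out to datachain.lib.image. A small sketch of the local-path handling that `get_path()` performs for a `file://` URI (the URI literal is a made-up example):

from urllib.parse import unquote, urlparse
from urllib.request import url2pathname

uri = "file:///tmp/my%20data/cats.json"  # hypothetical local URI

path = unquote(uri)             # decode percent-escapes
path = urlparse(path).path      # drop the file:// scheme, keep the path
path = url2pathname(path)       # convert to an OS-specific path
print(path)                     # /tmp/my data/cats.json on POSIX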
datachain/lib/image.py
CHANGED

@@ -1,7 +1,8 @@
 import inspect
+from io import BytesIO
 from typing import Any, Callable, Optional

-from datachain.lib.file import
+from datachain.lib.file import File

 try:
     import torch
@@ -16,6 +17,12 @@ except ImportError as exc:
 from datachain.lib.reader import FeatureReader


+class ImageFile(File):
+    def get_value(self):
+        value = super().get_value()
+        return Image.open(BytesIO(value))
+
+
 def convert_image(
     img: Image.Image,
     mode: str = "RGB",
@@ -48,7 +55,7 @@ def convert_image(
         and inspect.ismethod(getattr(open_clip_model, method_name))
     ):
         raise ValueError(
-
+            "Unable to render Image: 'open_clip_model' doesn't support"
            f" '{method_name}()'"
         )
     img = open_clip_model.encode_image(img)
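`ImageFile` now lives here and decodes the raw bytes returned by `File.get_value()` with PIL. A self-contained sketch of that decode step using in-memory bytes (the 8x8 image is just a stand-in for real file contents):

from io import BytesIO

from PIL import Image

buf = BytesIO()
Image.new("RGB", (8, 8)).save(buf, format="PNG")  # stand-in for bytes read from a file
raw_bytes = buf.getvalue()

img = Image.open(BytesIO(raw_bytes))  # what ImageFile.get_value() does with the bytes
print(img.size, img.mode)             # (8, 8) RGB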
datachain/lib/meta_formats.py
CHANGED

@@ -11,6 +11,7 @@ from collections.abc import Iterator
 from typing import Any, Callable

 import jmespath as jsp
+from pydantic import ValidationError

 from datachain.lib.feature_utils import pydantic_to_feature  # noqa: F401
 from datachain.lib.file import File
@@ -25,46 +26,48 @@ def generate_uuid():
 # JSON decoder
 def load_json_from_string(json_string):
     try:
-
-        print("Successfully parsed JSON", file=sys.stderr)
-        return data
+        return json.loads(json_string)
     except json.JSONDecodeError:
-        print("Failed to decode JSON:
-
+        print(f"Failed to decode JSON: {json_string} is not formatted correctly.")
+        return None


-#
+# Validate and reduce JSON
 def process_json(data_string, jmespath):
     json_dict = load_json_from_string(data_string)
     if jmespath:
         json_dict = jsp.search(jmespath, json_dict)
-
-    # but if jmespath expression is given, we assume a list
-    if not isinstance(json_dict, list):
-        raise ValueError("JMESPATH expression must resolve to a list")
-        return None
-    json_dict = json_dict[0]  # sample the first object
-    return json.dumps(json_dict)
+    return json_dict


 # Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None):
+def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
-    uid_str = str(generate_uuid()).replace("-", "")  # comply with Python class names
     # using uiid to get around issue #1617
-
+    if not model_name:
+        uid_str = str(generate_uuid()).replace(
+            "-", ""
+        )  # comply with Python class names
+        model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+            elif data_type == "jsonl":
+                data_string = fd.readline().decode("utf-8", "ignore").replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
         print(f"An unexpected file error occurred: {e}")
         return
-    if data_type
-
+    if data_type in ("json", "jsonl"):
+        json_object = process_json(data_string, expr)
+        if data_type == "json" and isinstance(json_object, list):
+            json_object = json_object[0]  # sample the 1st object from JSON array
+        if data_type == "jsonl":
+            data_type = "json"  # treat json line as plain JSON in auto-schema
+        data_string = json.dumps(json_object)
     command = [
         "datamodel-codegen",
         "--input-file-type",
@@ -73,8 +76,8 @@ def read_schema(source_file, data_type="csv", expr=None):
         model_name,
     ]
     try:
-        result = subprocess.run(
-            command,
+        result = subprocess.run(  # noqa: S603
+            command,
             input=data_string,
             text=True,
             capture_output=True,
@@ -87,13 +90,19 @@
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
     print("\n" + f"spec=pydantic_to_feature({model_name})" + "\n")
+    return model_output


 #
 # UDF mapper which calls chain in the setup to infer the dynamic schema
 #
-def read_meta(
-    spec=None,
+def read_meta(  # noqa: C901
+    spec=None,
+    schema_from=None,
+    meta_type="json",
+    jmespath=None,
+    show_schema=False,
+    model_name=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain

@@ -108,7 +117,7 @@ def read_meta(
         .limit(1)
         .map(  # dummy column created (#1615)
             meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath
+                file, data_type=meta_type, expr=jmespath, model_name=model_name
             ),
             output=str,
         )
@@ -119,6 +128,7 @@
         sys.stdout = current_stdout
         model_output = captured_output.getvalue()
         captured_output.close()
+
     if show_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted Feature from Pydantic datamodel
@@ -135,30 +145,52 @@
     #
     # UDF mapper parsing a JSON or CSV file using schema spec
     #
+
     def parse_data(
-        file: File,
+        file: File,
+        DataModel=spec,  # noqa: N803
+        meta_type=meta_type,
+        jmespath=jmespath,
     ) -> Iterator[spec]:
+        def validator(json_object: dict) -> spec:
+            json_string = json.dumps(json_object)
+            try:
+                data_instance = DataModel.model_validate_json(json_string)
+                yield data_instance
+            except ValidationError as e:
+                print(f"Validation error occurred in file {file.name}:", e)
+
         if meta_type == "csv":
             with (
                 file.open() as fd
             ):  # TODO: if schema is statically given, should allow CSV without headers
                 reader = csv.DictReader(fd)
                 for row in reader:  # CSV can be larger than memory
-
-
+                    yield from validator(row)
+
         if meta_type == "json":
             try:
                 with file.open() as fd:  # JSON must fit into RAM
                     data_string = fd.read()
             except OSError as e:
-                print(f"An unexpected file error occurred: {e}")
-            json_object =
-            if jmespath:
-                json_object = jsp.search(jmespath, json_object)
+                print(f"An unexpected file error occurred in file {file.name}: {e}")
+            json_object = process_json(data_string, jmespath)
             if not isinstance(json_object, list):
-
-
-
-
+                yield from validator(json_object)
+
+            else:
+                for json_dict in json_object:
+                    yield from validator(json_dict)
+
+        if meta_type == "jsonl":
+            try:
+                with file.open() as fd:
+                    data_string = fd.readline().replace("\r", "")
+                    while data_string:
+                        json_object = process_json(data_string, jmespath)
+                        data_string = fd.readline()
+                        yield from validator(json_object)
+
+            except OSError as e:
+                print(f"An unexpected file error occurred in file {file.name}: {e}")

     return parse_data
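The reworked `read_meta`/`parse_data` above validate every parsed JSON, JSONL, or CSV row against the generated pydantic model via `model_validate_json`, printing and skipping rows that fail. A minimal sketch of that per-row validation (the `Record` model and sample rows are hypothetical, not part of datachain):

from pydantic import BaseModel, ValidationError


class Record(BaseModel):  # hypothetical schema, stands in for the generated model
    name: str
    score: float


rows = ['{"name": "a", "score": 1.5}', '{"name": "b"}']  # second row is missing "score"

for row in rows:
    try:
        print(Record.model_validate_json(row))
    except ValidationError as e:
        print("Validation error occurred:", e.errors()[0]["msg"])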
datachain/lib/settings.py
CHANGED

@@ -18,19 +18,19 @@ class Settings:

         if not isinstance(cache, bool) and cache is not None:
             raise SettingsError(
-
+                "'cache' argument must be bool"
                 f" while {cache.__class__.__name__} was given"
             )

         if not isinstance(batch, int) and batch is not None:
             raise SettingsError(
-
+                "'batch' argument must be int or None"
                 f" while {batch.__class__.__name__} was given"
             )

         if not isinstance(parallel, int) and parallel is not None:
             raise SettingsError(
-
+                "'parallel' argument must be int or None"
                 f" while {parallel.__class__.__name__} was given"
             )

@@ -40,13 +40,13 @@ class Settings:
             and workers is not None
         ):
             raise SettingsError(
-
+                "'workers' argument must be int or bool"
                 f" while {workers.__class__.__name__} was given"
             )

         if min_task_size is not None and not isinstance(min_task_size, int):
             raise SettingsError(
-
+                "'min_task_size' argument must be int or None"
                 f", {min_task_size.__class__.__name__} was given"
             )

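The settings changes above only restore the first half of each error message. A quick sketch of one of those checks with the completed message; here a plain `ValueError` stands in for datachain's `SettingsError`:

def check_cache(cache) -> None:
    # Reject anything that is neither a bool nor None, mirroring the check above.
    if not isinstance(cache, bool) and cache is not None:
        raise ValueError(
            "'cache' argument must be bool"
            f" while {cache.__class__.__name__} was given"
        )


check_cache(True)        # ok
check_cache(None)        # ok
try:
    check_cache("yes")   # wrong type
except ValueError as e:
    print(e)             # 'cache' argument must be bool while str was given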