datachain 0.1.12__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.
Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/feature.py CHANGED
@@ -4,6 +4,7 @@ import re
 import warnings
 from collections.abc import Iterable, Sequence
 from datetime import datetime
+from functools import lru_cache
 from types import GenericAlias
 from typing import (
     Any,
@@ -22,7 +23,7 @@ from typing_extensions import Literal as LiteralEx
 
 from datachain.lib.feature_registry import Registry
 from datachain.query import C
-from datachain.query.udf import UDFOutputSpec
+from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.types import (
     JSON,
     Array,
@@ -62,6 +63,7 @@ TYPE_TO_DATACHAIN = {
     bool: Boolean,
     datetime: DateTime,  # Note, list of datetime is not supported yet
     bytes: Binary,  # Note, list of bytes is not supported yet
+    list: Array,
     dict: JSON,
 }
 
@@ -108,8 +110,6 @@ warnings.filterwarnings(
 # skipped within loops.
 feature_classes_lookup: dict[type, bool] = {}
 
-DEFAULT_DELIMITER = "__"
-
 
 class Feature(BaseModel):
     """A base class for defining data classes that serve as inputs and outputs for
@@ -117,9 +117,6 @@ class Feature(BaseModel):
     `pydantic`'s BaseModel.
     """
 
-    _is_shallow: ClassVar[bool] = False
-    _expand_class_name: ClassVar[bool] = False
-    _delimiter: ClassVar[str] = DEFAULT_DELIMITER
     _is_file: ClassVar[bool] = False
     _version: ClassVar[int] = 1
 
@@ -135,20 +132,6 @@ class Feature(BaseModel):
     def _name(cls) -> str:
         return f"{cls.__name__}@{cls._version}"
 
-    def _get_value_with_check(self, *args: Any, **kwargs: Any) -> Any:
-        signature = inspect.signature(self.get_value)
-        for i, (name, prm) in enumerate(signature.parameters.items()):
-            if prm.default == inspect.Parameter.empty:
-                if i < len(args):
-                    continue
-                if name not in kwargs:
-                    raise ValueError(
-                        f"unable to get value for class {self.__class__.__name__}"
-                        f" due to a missing parameter {name} in get_value()"
-                    )
-
-        return self.get_value(*args, **kwargs)
-
     @classmethod
     def __pydantic_init_subclass__(cls):
         Registry.add(cls)
@@ -162,9 +145,10 @@ class Feature(BaseModel):
 
     @classmethod
     def _normalize(cls, name: str) -> str:
-        if cls._delimiter and cls._delimiter.lower() in name.lower():
+        if DEFAULT_DELIMITER in name:
             raise RuntimeError(
-                f"variable '{name}' cannot be used because it contains {cls._delimiter}"
+                f"variable '{name}' cannot be used "
+                f"because it contains {DEFAULT_DELIMITER}"
             )
         return Feature._to_snake_case(name)
 
@@ -187,35 +171,6 @@ class Feature(BaseModel):
             if Feature.is_feature(anno):
                 yield from anno.get_file_signals([*path, name])  # type: ignore[union-attr]
 
-    @classmethod
-    def _flatten_full_schema(cls, fields, name_path):
-        for name, f_info in fields.items():
-            anno = f_info.annotation
-            name = cls._normalize(name)
-
-            orig = get_origin(anno)
-            if orig == list:
-                anno = get_args(anno)
-                if isinstance(anno, tuple):
-                    anno = anno[0]
-                is_list = True
-            else:
-                is_list = False
-
-            if Feature.is_feature(anno):
-                lst = copy.copy(name_path)
-                lst = [] if anno._is_shallow else [*lst, name]
-
-                if is_list:
-                    yield anno._delimiter.join(lst), Array(JSON)
-                else:
-                    yield from cls._flatten_full_schema(anno.model_fields, lst)
-            else:
-                typ = convert_type_to_datachain(anno)
-                if is_list:
-                    typ = Array(typ)
-                yield cls._delimiter.join([*name_path, name]), typ
-
     @classmethod
     def is_feature(cls, anno) -> bool:
         if anno in feature_classes_lookup:
@@ -242,22 +197,10 @@ class Feature(BaseModel):
     def is_feature_type(cls, t: type) -> bool:
         if cls.is_standard_type(t):
             return True
-        if get_origin(t) == list and len(get_args(t)) == 1:
+        if get_origin(t) is list and len(get_args(t)) == 1:
             return cls.is_feature_type(get_args(t)[0])
         return cls.is_feature(t)
 
-    @classmethod
-    def _to_udf_spec(cls):
-        return list(cls._flatten_full_schema(cls.model_fields, []))
-
-    @staticmethod
-    def _features_to_udf_spec(fr_classes: Sequence[type["Feature"]]) -> UDFOutputSpec:
-        return dict(
-            item
-            for b in fr_classes
-            for item in b._to_udf_spec()  # type: ignore[attr-defined]
-        )
-
     def _flatten_fields_values(self, fields, model):
         for name, f_info in fields.items():
             anno = f_info.annotation
@@ -280,16 +223,15 @@ class Feature(BaseModel):
             yield value
 
     def _flatten(self):
-        return tuple(self._flatten_generator())
-
-    def _flatten_generator(self):
-        # Optimization: Use a generator instead of a tuple if all values are going to
-        # be used immediately in another comprehension or function call.
-        return self._flatten_fields_values(self.model_fields, self)
+        return tuple(self._flatten_fields_values(self.model_fields, self))
 
     @staticmethod
     def _flatten_list(objs):
-        return tuple(val for obj in objs for val in obj._flatten_generator())
+        return tuple(
+            val
+            for obj in objs
+            for val in obj._flatten_fields_values(obj.model_fields, obj)
+        )
 
     @classmethod
     def _unflatten_with_path(cls, dump, name_path: list[str]):
@@ -300,14 +242,12 @@ class Feature(BaseModel):
             lst = copy.copy(name_path)
 
             if inspect.isclass(anno) and issubclass(anno, Feature):
-                if not cls._is_shallow:
-                    lst.append(name_norm)
-
+                lst.append(name_norm)
                 val = anno._unflatten_with_path(dump, lst)
                 res[name] = val
             else:
                 lst.append(name_norm)
-                curr_path = cls._delimiter.join(lst)
+                curr_path = DEFAULT_DELIMITER.join(lst)
                 res[name] = dump[curr_path]
         return cls(**res)
 
@@ -336,6 +276,18 @@ class Feature(BaseModel):
             pos += 1
         return res, pos
 
+    @classmethod
+    @lru_cache(maxsize=1000)
+    def build_tree(cls):
+        res = {}
+
+        for name, f_info in cls.model_fields.items():
+            anno = f_info.annotation
+            subtree = anno.build_tree() if Feature.is_feature(anno) else None
+            res[name] = (anno, subtree)
+
+        return res
+
 
 class RestrictedAttribute:
     """Descriptor implementing an attribute that can only be accessed through
@@ -374,7 +326,7 @@ class FeatureAttributeWrapper:
 
     @property
     def name(self) -> str:
-        return self.cls._delimiter.join(self.prefix)
+        return DEFAULT_DELIMITER.join(self.prefix)
 
     def __getattr__(self, name):
         field_info = self.cls.model_fields.get(name)
@@ -401,22 +353,16 @@ def _resolve(cls, name, field_info, prefix: list[str]):
         except TypeError:
             anno_sql_class = NullType
         new_prefix = copy.copy(prefix)
-        if not cls._is_shallow:
-            new_prefix.append(norm_name)
-        return C(cls._delimiter.join(new_prefix), anno_sql_class)
+        new_prefix.append(norm_name)
+        return C(DEFAULT_DELIMITER.join(new_prefix), anno_sql_class)
 
-    if not cls._is_shallow:
-        return FeatureAttributeWrapper(anno, [*prefix, norm_name])
-
-    new_prefix_value = copy.copy(prefix)
-    if not cls._is_shallow:
-        new_prefix_value.append(norm_name)
-    return FeatureAttributeWrapper(anno, new_prefix_value)
+    return FeatureAttributeWrapper(anno, [*prefix, norm_name])
 
 
 def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(typ) and issubclass(typ, SQLType):
         return typ
+
     res = TYPE_TO_DATACHAIN.get(typ)
     if res:
         return res
@@ -430,7 +376,12 @@ def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
         if args is None or len(args) != 1:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
-        next_type = convert_type_to_datachain(args[0])
+
+        args0 = args[0]
+        if Feature.is_feature(args0):
+            return Array(JSON())
+
+        next_type = convert_type_to_datachain(args0)
         return Array(next_type)
 
     if inspect.isclass(orig) and issubclass(dict, orig):
@@ -443,10 +394,10 @@ def convert_type_to_datachain(typ):  # noqa: PLR0911
     if orig == Union and len(args) >= 2:
         args_no_nones = [arg for arg in args if arg != type(None)]
         if len(args_no_nones) == 2:
-            args_no_dicts = [arg for arg in args_no_nones if arg != dict]
-            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) == list:
+            args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
+            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
                 arg = get_args(args_no_dicts[0])
-                if len(arg) == 1 and arg[0] == dict:
+                if len(arg) == 1 and arg[0] is dict:
                     return JSON
 
     raise TypeError(f"Cannot recognize type {typ}")
datachain/lib/feature_registry.py CHANGED
@@ -1,5 +1,7 @@
 from typing import Any, ClassVar, Optional
 
+from datachain.cli import logger
+
 
 class Registry:
     reg: ClassVar[dict[str, dict[int, Any]]] = {}
@@ -14,7 +16,7 @@ class Registry:
         version = fr._version  # type: ignore[attr-defined]
         if version in cls.reg[name]:
             full_name = f"{name}@{version}"
-            raise ValueError(f"Feature {full_name} is already registered")
+            logger.warning(f"Feature {full_name} is already registered")
         cls.reg[name][version] = fr
 
     @classmethod
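With the Registry change above, re-registering a feature class under the same name and version (a common side effect of re-running a notebook cell or re-importing a module) no longer aborts. A small illustration with a hypothetical Embedding class:

from datachain.lib.feature import Feature

class Embedding(Feature):
    values: list[float]

# Redefining the class registers the same name/version pair again;
# 0.1.x raised ValueError("Feature ... is already registered"),
# 0.2.0 only logs a warning and overwrites the registry entry.
class Embedding(Feature):  # noqa: F811 - deliberate redefinition
    values: list[float]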
@@ -40,7 +40,7 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
40
40
  anno = field_info.annotation
41
41
  if anno not in TYPE_TO_DATACHAIN:
42
42
  orig = get_origin(anno)
43
- if orig == list:
43
+ if orig is list:
44
44
  anno = get_args(anno) # type: ignore[assignment]
45
45
  if isinstance(anno, Sequence):
46
46
  anno = anno[0] # type: ignore[unreachable]
@@ -122,7 +122,7 @@ def features_to_tuples(
122
122
  if isinstance(output, dict):
123
123
  raise FeatureToTupleError(
124
124
  ds_name,
125
- f"output type must be dict[str, FeatureType] while "
125
+ "output type must be dict[str, FeatureType] while "
126
126
  f"'{type(output).__name__}' is given",
127
127
  )
128
128
  else:
datachain/lib/file.py CHANGED
@@ -1,30 +1,22 @@
 import json
 from abc import ABC, abstractmethod
 from datetime import datetime
-from io import BytesIO
 from pathlib import Path
 from typing import Any, ClassVar, Literal, Optional, Union
+from urllib.parse import unquote, urlparse
+from urllib.request import url2pathname
 
 from fsspec import Callback
+from fsspec.implementations.local import LocalFileSystem
 from pydantic import Field, field_validator
 
-from datachain.lib.feature import Feature
-from datachain.utils import TIME_ZERO
-
-try:
-    from PIL import Image
-except ImportError as exc:
-    raise ImportError(
-        "Missing dependencies for computer vision:\n"
-        "To install run:\n\n"
-        "  pip install 'datachain[cv]'\n"
-    ) from exc
-
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.cached_stream import PreCachedStream, PreDownloadStream
+from datachain.lib.feature import Feature
 from datachain.lib.utils import DataChainError
 from datachain.sql.types import JSON, Int, String
+from datachain.utils import TIME_ZERO
 
 
 class FileFeature(Feature):
@@ -49,7 +41,7 @@ class VFileError(DataChainError):
 
 class FileError(DataChainError):
     def __init__(self, file: "File", message: str):
-        super().__init__(f"Error in file {file.get_full_path()}: {message}")
+        super().__init__(f"Error in file {file.get_uri()}: {message}")
 
 
 class VFile(ABC):
@@ -237,7 +229,7 @@ class File(FileFeature):
     def get_full_name(self):
         return (Path(self.parent) / self.name).as_posix()
 
-    def get_full_path(self):
+    def get_uri(self):
         return f"{self.source}/{self.get_full_name()}"
 
     def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
@@ -245,14 +237,20 @@ class File(FileFeature):
         uid = self.get_uid()
         return client.open_object(uid, use_cache=cache, cb=cb)
 
+    def get_path(self) -> str:
+        path = unquote(self.get_uri())
+        fs = self.get_fs()
+        if isinstance(fs, LocalFileSystem):
+            # Drop file:// protocol
+            path = urlparse(path).path
+            path = url2pathname(path)
+        return path
 
-BinaryFile = File
+    def get_fs(self):
+        return self._catalog.get_client(self.source).fs
 
 
-class ImageFile(File):
-    def get_value(self):
-        value = super().get_value()
-        return Image.open(BytesIO(value))
+BinaryFile = File
 
 
 class TextFile(File):
@@ -272,6 +270,8 @@ def get_file(type: Literal["binary", "text", "image"] = "binary"):
     if type == "text":
        file = TextFile
     elif type == "image":
+        from datachain.lib.image import ImageFile
+
        file = ImageFile  # type: ignore[assignment]
 
    def get_file_type(
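For code that consumed File.get_full_path(), the method is now File.get_uri(), and two helpers were added: get_fs() returns the fsspec filesystem of the file's source, and get_path() returns an unquoted path (a plain OS path for local file:// sources). A hedged sketch of a mapper-style function, assuming the File instance comes from a chain where the catalog is attached:

from datachain.lib.file import File

def describe(file: File) -> str:
    uri = file.get_uri()    # replaces get_full_path(), e.g. "s3://bucket/dir/img.jpg"
    fs = file.get_fs()      # fsspec filesystem backing file.source
    path = file.get_path()  # unquoted; local file:// URIs become plain OS paths
    return f"{uri} ({type(fs).__name__}): {path}"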
datachain/lib/image.py CHANGED
@@ -1,7 +1,8 @@
 import inspect
+from io import BytesIO
 from typing import Any, Callable, Optional
 
-from datachain.lib.file import ImageFile
+from datachain.lib.file import File
 
 try:
     import torch
@@ -16,6 +17,12 @@ except ImportError as exc:
 from datachain.lib.reader import FeatureReader
 
 
+class ImageFile(File):
+    def get_value(self):
+        value = super().get_value()
+        return Image.open(BytesIO(value))
+
+
 def convert_image(
     img: Image.Image,
     mode: str = "RGB",
@@ -48,7 +55,7 @@ def convert_image(
         and inspect.ismethod(getattr(open_clip_model, method_name))
     ):
         raise ValueError(
-            f"Unable to render Image: 'open_clip_model' doesn't support"
+            "Unable to render Image: 'open_clip_model' doesn't support"
             f" '{method_name}()'"
         )
     img = open_clip_model.encode_image(img)
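ImageFile moved from datachain.lib.file to datachain.lib.image, so PIL and the other vision dependencies are only imported when image support is requested (get_file("image") imports the class lazily). Existing imports need a one-line update; this assumes the optional 'cv' extra is installed:

# 0.1.x:  from datachain.lib.file import ImageFile
# 0.2.0:
from datachain.lib.image import ImageFile  # requires: pip install 'datachain[cv]'

# Behavior is unchanged: ImageFile.get_value() still returns a PIL.Image
# opened from the file's bytes.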
datachain/lib/meta_formats.py CHANGED
@@ -11,6 +11,7 @@ from collections.abc import Iterator
 from typing import Any, Callable
 
 import jmespath as jsp
+from pydantic import ValidationError
 
 from datachain.lib.feature_utils import pydantic_to_feature  # noqa: F401
 from datachain.lib.file import File
@@ -25,46 +26,48 @@ def generate_uuid():
 # JSON decoder
 def load_json_from_string(json_string):
     try:
-        data = json.loads(json_string)
-        print("Successfully parsed JSON", file=sys.stderr)
-        return data
+        return json.loads(json_string)
     except json.JSONDecodeError:
-        print("Failed to decode JSON: The string is not formatted correctly.")
-        return None
+        print(f"Failed to decode JSON: {json_string} is not formatted correctly.")
+        return None
 
 
-# Read valid JSON and return a data object sample
+# Validate and reduce JSON
 def process_json(data_string, jmespath):
     json_dict = load_json_from_string(data_string)
     if jmespath:
         json_dict = jsp.search(jmespath, json_dict)
-    # we allow non-list JSONs here to print the root schema
-    # but if jmespath expression is given, we assume a list
-    if not isinstance(json_dict, list):
-        raise ValueError("JMESPATH expression must resolve to a list")
-        return None
-    json_dict = json_dict[0]  # sample the first object
-    return json.dumps(json_dict)
+    return json_dict
 
 
 # Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None):
+def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
-    uid_str = str(generate_uuid()).replace("-", "")  # comply with Python class names
     # using uiid to get around issue #1617
-    model_name = f"Model{uid_str}"
+    if not model_name:
+        uid_str = str(generate_uuid()).replace(
+            "-", ""
+        )  # comply with Python class names
+        model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+            elif data_type == "jsonl":
+                data_string = fd.readline().decode("utf-8", "ignore").replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
         print(f"An unexpected file error occurred: {e}")
         return
-    if data_type == "json":
-        data_string = process_json(data_string, expr)
+    if data_type in ("json", "jsonl"):
+        json_object = process_json(data_string, expr)
+        if data_type == "json" and isinstance(json_object, list):
+            json_object = json_object[0]  # sample the 1st object from JSON array
+        if data_type == "jsonl":
+            data_type = "json"  # treat json line as plain JSON in auto-schema
+        data_string = json.dumps(json_object)
     command = [
         "datamodel-codegen",
         "--input-file-type",
@@ -73,8 +76,8 @@ def read_schema(source_file, data_type="csv", expr=None):
         model_name,
     ]
     try:
-        result = subprocess.run(
-            command,  # noqa: S603
+        result = subprocess.run(  # noqa: S603
+            command,
             input=data_string,
             text=True,
             capture_output=True,
@@ -87,13 +90,19 @@ def read_schema(source_file, data_type="csv", expr=None):
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
     print("\n" + f"spec=pydantic_to_feature({model_name})" + "\n")
+    return model_output
 
 
 #
 # UDF mapper which calls chain in the setup to infer the dynamic schema
 #
-def read_meta(
-    spec=None, schema_from=None, meta_type="json", jmespath=None, show_schema=False
+def read_meta(  # noqa: C901
+    spec=None,
+    schema_from=None,
+    meta_type="json",
+    jmespath=None,
+    show_schema=False,
+    model_name=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
@@ -108,7 +117,7 @@ def read_meta(
         .limit(1)
         .map(  # dummy column created (#1615)
             meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath
+                file, data_type=meta_type, expr=jmespath, model_name=model_name
             ),
             output=str,
         )
@@ -119,6 +128,7 @@ def read_meta(
     sys.stdout = current_stdout
     model_output = captured_output.getvalue()
     captured_output.close()
+
     if show_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted Feature from Pydantic datamodel
@@ -135,30 +145,52 @@ def read_meta(
     #
     # UDF mapper parsing a JSON or CSV file using schema spec
    #
+
    def parse_data(
-        file: File, data_model=spec, meta_type=meta_type, jmespath=jmespath
+        file: File,
+        DataModel=spec,  # noqa: N803
+        meta_type=meta_type,
+        jmespath=jmespath,
    ) -> Iterator[spec]:
+        def validator(json_object: dict) -> spec:
+            json_string = json.dumps(json_object)
+            try:
+                data_instance = DataModel.model_validate_json(json_string)
+                yield data_instance
+            except ValidationError as e:
+                print(f"Validation error occurred in file {file.name}:", e)
+
        if meta_type == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
                reader = csv.DictReader(fd)
                for row in reader:  # CSV can be larger than memory
-                    json_string = json.dumps(row)
-                    yield data_model.model_validate_json(json_string)
+                    yield from validator(row)
+
        if meta_type == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
            except OSError as e:
-                print(f"An unexpected file error occurred: {e}")
-            json_object = load_json_from_string(data_string)
-            if jmespath:
-                json_object = jsp.search(jmespath, json_object)
+                print(f"An unexpected file error occurred in file {file.name}: {e}")
+            json_object = process_json(data_string, jmespath)
            if not isinstance(json_object, list):
-                raise ValueError("JSON expression must resolve in a list of objects")
-            for json_dict in json_object:
-                json_string = json.dumps(json_dict)
-                yield data_model.model_validate_json(json_string)
+                yield from validator(json_object)
+
+            else:
+                for json_dict in json_object:
+                    yield from validator(json_dict)
+
+        if meta_type == "jsonl":
+            try:
+                with file.open() as fd:
+                    data_string = fd.readline().replace("\r", "")
+                    while data_string:
+                        json_object = process_json(data_string, jmespath)
+                        data_string = fd.readline()
+                        yield from validator(json_object)
+            except OSError as e:
+                print(f"An unexpected file error occurred in file {file.name}: {e}")
 
    return parse_data
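read_meta() now understands meta_type="jsonl" (the schema is inferred from the first line, records are validated line by line, and validation errors are reported per file instead of aborting the run) and accepts an explicit model_name for the generated pydantic model. A hedged sketch of building the mapper, with a hypothetical bucket path:

from datachain.lib.meta_formats import read_meta

# read_meta() returns a UDF mapper (parse_data) that takes a File and yields
# validated records of the inferred model.
parse_annotations = read_meta(
    schema_from="s3://my-bucket/annotations/sample.jsonl",  # hypothetical path
    meta_type="jsonl",
    model_name="Annotation",  # name used for the generated pydantic model
    show_schema=True,         # print the datamodel-codegen output
)

The returned callable can then be plugged into a DataChain mapper step that receives File objects.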
datachain/lib/settings.py CHANGED
@@ -18,19 +18,19 @@ class Settings:
 
         if not isinstance(cache, bool) and cache is not None:
             raise SettingsError(
-                f"'cache' argument must be bool"
+                "'cache' argument must be bool"
                 f" while {cache.__class__.__name__} was given"
             )
 
         if not isinstance(batch, int) and batch is not None:
             raise SettingsError(
-                f"'batch' argument must be int or None"
+                "'batch' argument must be int or None"
                 f" while {batch.__class__.__name__} was given"
             )
 
         if not isinstance(parallel, int) and parallel is not None:
             raise SettingsError(
-                f"'parallel' argument must be int or None"
+                "'parallel' argument must be int or None"
                 f" while {parallel.__class__.__name__} was given"
             )
 
@@ -40,13 +40,13 @@ class Settings:
             and workers is not None
         ):
             raise SettingsError(
-                f"'workers' argument must be int or bool"
+                "'workers' argument must be int or bool"
                 f" while {workers.__class__.__name__} was given"
             )
 
         if min_task_size is not None and not isinstance(min_task_size, int):
             raise SettingsError(
-                f"'min_task_size' argument must be int or None"
+                "'min_task_size' argument must be int or None"
                 f", {min_task_size.__class__.__name__} was given"
            )
 