datachain 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +0 -4
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +10 -2
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +12 -7
  13. datachain/data_storage/sqlite.py +3 -0
  14. datachain/data_storage/warehouse.py +31 -30
  15. datachain/dataset.py +1 -3
  16. datachain/lib/arrow.py +85 -0
  17. datachain/lib/cached_stream.py +3 -85
  18. datachain/lib/dc.py +382 -179
  19. datachain/lib/feature.py +46 -91
  20. datachain/lib/feature_registry.py +4 -1
  21. datachain/lib/feature_utils.py +2 -2
  22. datachain/lib/file.py +30 -44
  23. datachain/lib/image.py +9 -2
  24. datachain/lib/meta_formats.py +66 -34
  25. datachain/lib/settings.py +5 -5
  26. datachain/lib/signal_schema.py +103 -105
  27. datachain/lib/udf.py +10 -38
  28. datachain/lib/udf_signature.py +11 -6
  29. datachain/lib/webdataset_laion.py +5 -22
  30. datachain/listing.py +8 -8
  31. datachain/node.py +1 -1
  32. datachain/progress.py +1 -1
  33. datachain/query/builtins.py +1 -1
  34. datachain/query/dataset.py +42 -119
  35. datachain/query/dispatch.py +1 -1
  36. datachain/query/metrics.py +19 -0
  37. datachain/query/schema.py +13 -3
  38. datachain/sql/__init__.py +1 -1
  39. datachain/sql/sqlite/base.py +34 -2
  40. datachain/sql/sqlite/vector.py +13 -5
  41. datachain/utils.py +1 -122
  42. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/METADATA +11 -4
  43. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/RECORD +47 -47
  44. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/WHEEL +1 -1
  45. datachain/_version.py +0 -16
  46. datachain/lib/parquet.py +0 -32
  47. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/LICENSE +0 -0
  48. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/top_level.txt +0 -0
datachain/lib/feature.py CHANGED
@@ -4,8 +4,10 @@ import re
 import warnings
 from collections.abc import Iterable, Sequence
 from datetime import datetime
+from functools import lru_cache
 from types import GenericAlias
 from typing import (
+    TYPE_CHECKING,
     Any,
     ClassVar,
     Literal,
@@ -22,7 +24,7 @@ from typing_extensions import Literal as LiteralEx
 
 from datachain.lib.feature_registry import Registry
 from datachain.query import C
-from datachain.query.udf import UDFOutputSpec
+from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.types import (
     JSON,
     Array,
@@ -38,6 +40,9 @@ from datachain.sql.types import (
     String,
 )
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
 FeatureStandardType = Union[
     type[int],
     type[str],
@@ -62,6 +67,7 @@ TYPE_TO_DATACHAIN = {
     bool: Boolean,
     datetime: DateTime,  # Note, list of datetime is not supported yet
    bytes: Binary,  # Note, list of bytes is not supported yet
+    list: Array,
     dict: JSON,
 }
 
@@ -108,8 +114,6 @@ warnings.filterwarnings(
 # skipped within loops.
 feature_classes_lookup: dict[type, bool] = {}
 
-DEFAULT_DELIMITER = "__"
-
 
 class Feature(BaseModel):
     """A base class for defining data classes that serve as inputs and outputs for
@@ -117,9 +121,6 @@ class Feature(BaseModel):
     `pydantic`'s BaseModel.
     """
 
-    _is_shallow: ClassVar[bool] = False
-    _expand_class_name: ClassVar[bool] = False
-    _delimiter: ClassVar[str] = DEFAULT_DELIMITER
     _is_file: ClassVar[bool] = False
     _version: ClassVar[int] = 1
 
@@ -135,20 +136,6 @@ class Feature(BaseModel):
     def _name(cls) -> str:
         return f"{cls.__name__}@{cls._version}"
 
-    def _get_value_with_check(self, *args: Any, **kwargs: Any) -> Any:
-        signature = inspect.signature(self.get_value)
-        for i, (name, prm) in enumerate(signature.parameters.items()):
-            if prm.default == inspect.Parameter.empty:
-                if i < len(args):
-                    continue
-                if name not in kwargs:
-                    raise ValueError(
-                        f"unable to get value for class {self.__class__.__name__}"
-                        f" due to a missing parameter {name} in get_value()"
-                    )
-
-        return self.get_value(*args, **kwargs)
-
     @classmethod
     def __pydantic_init_subclass__(cls):
         Registry.add(cls)
@@ -162,9 +149,10 @@ class Feature(BaseModel):
 
     @classmethod
     def _normalize(cls, name: str) -> str:
-        if cls._delimiter and cls._delimiter.lower() in name.lower():
+        if DEFAULT_DELIMITER in name:
             raise RuntimeError(
-                f"variable '{name}' cannot be used because it contains {cls._delimiter}"
+                f"variable '{name}' cannot be used "
+                f"because it contains {DEFAULT_DELIMITER}"
             )
         return Feature._to_snake_case(name)
 
@@ -174,7 +162,7 @@ class Feature(BaseModel):
         s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
 
-    def _set_stream(self, catalog, stream=None, caching_enabled: bool = False) -> None:
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
        pass
 
     @classmethod
@@ -187,35 +175,6 @@ class Feature(BaseModel):
             if Feature.is_feature(anno):
                 yield from anno.get_file_signals([*path, name])  # type: ignore[union-attr]
 
-    @classmethod
-    def _flatten_full_schema(cls, fields, name_path):
-        for name, f_info in fields.items():
-            anno = f_info.annotation
-            name = cls._normalize(name)
-
-            orig = get_origin(anno)
-            if orig == list:
-                anno = get_args(anno)
-                if isinstance(anno, tuple):
-                    anno = anno[0]
-                is_list = True
-            else:
-                is_list = False
-
-            if Feature.is_feature(anno):
-                lst = copy.copy(name_path)
-                lst = [] if anno._is_shallow else [*lst, name]
-
-                if is_list:
-                    yield anno._delimiter.join(lst), Array(JSON)
-                else:
-                    yield from cls._flatten_full_schema(anno.model_fields, lst)
-            else:
-                typ = convert_type_to_datachain(anno)
-                if is_list:
-                    typ = Array(typ)
-                yield cls._delimiter.join([*name_path, name]), typ
-
     @classmethod
     def is_feature(cls, anno) -> bool:
         if anno in feature_classes_lookup:
@@ -242,22 +201,10 @@ class Feature(BaseModel):
     def is_feature_type(cls, t: type) -> bool:
         if cls.is_standard_type(t):
             return True
-        if get_origin(t) == list and len(get_args(t)) == 1:
+        if get_origin(t) is list and len(get_args(t)) == 1:
             return cls.is_feature_type(get_args(t)[0])
         return cls.is_feature(t)
 
-    @classmethod
-    def _to_udf_spec(cls):
-        return list(cls._flatten_full_schema(cls.model_fields, []))
-
-    @staticmethod
-    def _features_to_udf_spec(fr_classes: Sequence[type["Feature"]]) -> UDFOutputSpec:
-        return dict(
-            item
-            for b in fr_classes
-            for item in b._to_udf_spec()  # type: ignore[attr-defined]
-        )
-
     def _flatten_fields_values(self, fields, model):
         for name, f_info in fields.items():
             anno = f_info.annotation
@@ -280,16 +227,15 @@ class Feature(BaseModel):
             yield value
 
     def _flatten(self):
-        return tuple(self._flatten_generator())
-
-    def _flatten_generator(self):
-        # Optimization: Use a generator instead of a tuple if all values are going to
-        # be used immediately in another comprehension or function call.
-        return self._flatten_fields_values(self.model_fields, self)
+        return tuple(self._flatten_fields_values(self.model_fields, self))
 
     @staticmethod
     def _flatten_list(objs):
-        return tuple(val for obj in objs for val in obj._flatten_generator())
+        return tuple(
+            val
+            for obj in objs
+            for val in obj._flatten_fields_values(obj.model_fields, obj)
+        )
 
     @classmethod
     def _unflatten_with_path(cls, dump, name_path: list[str]):
@@ -300,14 +246,12 @@ class Feature(BaseModel):
             lst = copy.copy(name_path)
 
             if inspect.isclass(anno) and issubclass(anno, Feature):
-                if not cls._is_shallow:
-                    lst.append(name_norm)
-
+                lst.append(name_norm)
                 val = anno._unflatten_with_path(dump, lst)
                 res[name] = val
             else:
                 lst.append(name_norm)
-                curr_path = cls._delimiter.join(lst)
+                curr_path = DEFAULT_DELIMITER.join(lst)
                 res[name] = dump[curr_path]
         return cls(**res)
 
@@ -336,6 +280,18 @@ class Feature(BaseModel):
                 pos += 1
         return res, pos
 
+    @classmethod
+    @lru_cache(maxsize=1000)
+    def build_tree(cls):
+        res = {}
+
+        for name, f_info in cls.model_fields.items():
+            anno = f_info.annotation
+            subtree = anno.build_tree() if Feature.is_feature(anno) else None
+            res[name] = (anno, subtree)
+
+        return res
+
 
 class RestrictedAttribute:
     """Descriptor implementing an attribute that can only be accessed through
@@ -374,7 +330,7 @@ class FeatureAttributeWrapper:
 
     @property
     def name(self) -> str:
-        return self.cls._delimiter.join(self.prefix)
+        return DEFAULT_DELIMITER.join(self.prefix)
 
     def __getattr__(self, name):
         field_info = self.cls.model_fields.get(name)
@@ -401,22 +357,16 @@ def _resolve(cls, name, field_info, prefix: list[str]):
         except TypeError:
             anno_sql_class = NullType
         new_prefix = copy.copy(prefix)
-        if not cls._is_shallow:
-            new_prefix.append(norm_name)
-        return C(cls._delimiter.join(new_prefix), anno_sql_class)
+        new_prefix.append(norm_name)
+        return C(DEFAULT_DELIMITER.join(new_prefix), anno_sql_class)
 
-    if not cls._is_shallow:
-        return FeatureAttributeWrapper(anno, [*prefix, norm_name])
-
-    new_prefix_value = copy.copy(prefix)
-    if not cls._is_shallow:
-        new_prefix_value.append(norm_name)
-    return FeatureAttributeWrapper(anno, new_prefix_value)
+    return FeatureAttributeWrapper(anno, [*prefix, norm_name])
 
 
 def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(typ) and issubclass(typ, SQLType):
         return typ
+
     res = TYPE_TO_DATACHAIN.get(typ)
     if res:
         return res
@@ -430,7 +380,12 @@ def convert_type_to_datachain(typ):  # noqa: PLR0911
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
         if args is None or len(args) != 1:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
-        next_type = convert_type_to_datachain(args[0])
+
+        args0 = args[0]
+        if Feature.is_feature(args0):
+            return Array(JSON())
+
+        next_type = convert_type_to_datachain(args0)
         return Array(next_type)
 
     if inspect.isclass(orig) and issubclass(dict, orig):
@@ -443,10 +398,10 @@ def convert_type_to_datachain(typ):  # noqa: PLR0911
     if orig == Union and len(args) >= 2:
         args_no_nones = [arg for arg in args if arg != type(None)]
         if len(args_no_nones) == 2:
-            args_no_dicts = [arg for arg in args_no_nones if arg != dict]
-            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) == list:
+            args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
+            if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
                 arg = get_args(args_no_dicts[0])
-                if len(arg) == 1 and arg[0] == dict:
+                if len(arg) == 1 and arg[0] is dict:
                     return JSON
 
     raise TypeError(f"Cannot recognize type {typ}")
datachain/lib/feature_registry.py CHANGED
@@ -1,5 +1,8 @@
+import logging
 from typing import Any, ClassVar, Optional
 
+logger = logging.getLogger(__name__)
+
 
 class Registry:
     reg: ClassVar[dict[str, dict[int, Any]]] = {}
@@ -14,7 +17,7 @@ class Registry:
         version = fr._version  # type: ignore[attr-defined]
         if version in cls.reg[name]:
             full_name = f"{name}@{version}"
-            raise ValueError(f"Feature {full_name} is already registered")
+            logger.warning("Feature %s is already registered", full_name)
         cls.reg[name][version] = fr
 
     @classmethod
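The registry change only affects duplicate registrations: re-adding a Feature with the same name and `_version` now logs a warning instead of raising `ValueError`. A rough sketch of the new behavior, assuming datachain 0.2.1; the `Embedding` class is illustrative:

```python
import logging

from datachain.lib.feature import Feature
from datachain.lib.feature_registry import Registry

logging.basicConfig(level=logging.WARNING)


class Embedding(Feature):
    value: float


# Embedding was already registered by Feature.__pydantic_init_subclass__,
# so this second add() is a duplicate and now only emits a warning.
Registry.add(Embedding)
```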
datachain/lib/feature_utils.py CHANGED
@@ -40,7 +40,7 @@ def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
         anno = field_info.annotation
         if anno not in TYPE_TO_DATACHAIN:
             orig = get_origin(anno)
-            if orig == list:
+            if orig is list:
                 anno = get_args(anno)  # type: ignore[assignment]
                 if isinstance(anno, Sequence):
                     anno = anno[0]  # type: ignore[unreachable]
@@ -122,7 +122,7 @@ def features_to_tuples(
     if isinstance(output, dict):
         raise FeatureToTupleError(
             ds_name,
-            f"output type must be dict[str, FeatureType] while "
+            "output type must be dict[str, FeatureType] while "
             f"'{type(output).__name__}' is given",
         )
     else:
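The `== list` → `is list` changes here (and the matching ones in feature.py) are safe because `typing.get_origin()` returns the bare container class, so an identity check is sufficient. For example:

```python
from typing import Optional, get_origin

assert get_origin(list[int]) is list
assert get_origin(dict[str, int]) is dict
assert get_origin(Optional[list[int]]) is not list  # Union origin, handled separately
```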
datachain/lib/file.py CHANGED
@@ -1,30 +1,24 @@
 import json
 from abc import ABC, abstractmethod
 from datetime import datetime
-from io import BytesIO
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from urllib.parse import unquote, urlparse
+from urllib.request import url2pathname
 
-from fsspec import Callback
+from fsspec.implementations.local import LocalFileSystem
 from pydantic import Field, field_validator
 
-from datachain.lib.feature import Feature
-from datachain.utils import TIME_ZERO
-
-try:
-    from PIL import Image
-except ImportError as exc:
-    raise ImportError(
-        "Missing dependencies for computer vision:\n"
-        "To install run:\n\n"
-        "  pip install 'datachain[cv]'\n"
-    ) from exc
-
 from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.cached_stream import PreCachedStream, PreDownloadStream
+from datachain.lib.feature import Feature
 from datachain.lib.utils import DataChainError
 from datachain.sql.types import JSON, Int, String
+from datachain.utils import TIME_ZERO
+
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
 
 
 class FileFeature(Feature):
@@ -49,7 +43,7 @@ class VFileError(DataChainError):
 
 class FileError(DataChainError):
     def __init__(self, file: "File", message: str):
-        super().__init__(f"Error in file {file.get_full_path()}: {message}")
+        super().__init__(f"Error in file {file.get_uri()}: {message}")
 
 
 class VFile(ABC):
@@ -190,26 +184,17 @@ class File(FileFeature):
 
     def open(self):
         if self._stream is None:
-            if self._catalog is None:
-                raise FileError(self, "stream is not set")
-            self._stream = self._open_stream()
+            raise FileError(self, "stream is not set")
 
         if self.location:
             return VFileRegistry.resolve(self, self.location)
 
         return self._stream
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        if self._catalog is None and catalog is None:
-            raise DataChainError(f"Cannot set file '{stream}' without catalog")
-
-        if catalog:
-            self._catalog = catalog
-
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        self._catalog = catalog
         stream_class = PreCachedStream if caching_enabled else PreDownloadStream
-        self._stream = stream_class(stream, self.size, self._catalog, self.get_uid())
+        self._stream = stream_class(self._catalog, self.get_uid())
         self._caching_enabled = caching_enabled
 
     def get_uid(self) -> UniqueId:
@@ -237,22 +222,23 @@ class File(FileFeature):
     def get_full_name(self):
         return (Path(self.parent) / self.name).as_posix()
 
-    def get_full_path(self):
+    def get_uri(self):
        return f"{self.source}/{self.get_full_name()}"
 
-    def _open_stream(self, cache: bool = False, cb: Optional[Callback] = None):
-        client = self._catalog.get_client(self.source)
-        uid = self.get_uid()
-        return client.open_object(uid, use_cache=cache, cb=cb)
+    def get_path(self) -> str:
+        path = unquote(self.get_uri())
+        fs = self.get_fs()
+        if isinstance(fs, LocalFileSystem):
+            # Drop file:// protocol
+            path = urlparse(path).path
+            path = url2pathname(path)
+        return path
 
-
-BinaryFile = File
+    def get_fs(self):
+        return self._catalog.get_client(self.source).fs
 
 
-class ImageFile(File):
-    def get_value(self):
-        value = super().get_value()
-        return Image.open(BytesIO(value))
+BinaryFile = File
 
 
 class TextFile(File):
@@ -260,10 +246,8 @@ class TextFile(File):
         super().__init__(**kwargs)
         self._stream = None
 
-    def _set_stream(
-        self, catalog=None, stream=None, caching_enabled: bool = False
-    ) -> None:
-        super()._set_stream(catalog, stream, caching_enabled)
+    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
+        super()._set_stream(catalog, caching_enabled)
         self._stream.set_mode("r")
 
 
@@ -272,6 +256,8 @@ def get_file(type: Literal["binary", "text", "image"] = "binary"):
     if type == "text":
         file = TextFile
     elif type == "image":
+        from datachain.lib.image import ImageFile
+
        file = ImageFile  # type: ignore[assignment]
 
     def get_file_type(
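In file.py, `get_full_path()` is renamed to `get_uri()`, and the new `get_path()`/`get_fs()` helpers resolve a usable filesystem path. A rough standalone sketch of what `get_path()` does for a local file, based only on the diff above; the sample values are illustrative and the result shown is for a POSIX system:

```python
from urllib.parse import unquote, urlparse
from urllib.request import url2pathname

source = "file:///tmp"                # File.source
full_name = "data/report%201.json"    # File.get_full_name()

uri = f"{source}/{full_name}"         # File.get_uri()
path = unquote(uri)                   # File.get_path(): unquote first
# for a LocalFileSystem, drop the file:// scheme and convert to an OS path
path = url2pathname(urlparse(path).path)

print(uri)   # file:///tmp/data/report%201.json
print(path)  # /tmp/data/report 1.json
```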
datachain/lib/image.py CHANGED
@@ -1,7 +1,8 @@
 import inspect
+from io import BytesIO
 from typing import Any, Callable, Optional
 
-from datachain.lib.file import ImageFile
+from datachain.lib.file import File
 
 try:
     import torch
@@ -16,6 +17,12 @@ except ImportError as exc:
 from datachain.lib.reader import FeatureReader
 
 
+class ImageFile(File):
+    def get_value(self):
+        value = super().get_value()
+        return Image.open(BytesIO(value))
+
+
 def convert_image(
     img: Image.Image,
     mode: str = "RGB",
@@ -48,7 +55,7 @@ def convert_image(
         and inspect.ismethod(getattr(open_clip_model, method_name))
     ):
         raise ValueError(
-            f"Unable to render Image: 'open_clip_model' doesn't support"
+            "Unable to render Image: 'open_clip_model' doesn't support"
             f" '{method_name}()'"
         )
     img = open_clip_model.encode_image(img)
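`ImageFile` now lives in `datachain.lib.image`, and `get_file()` imports it lazily, so the binary/text paths no longer require the computer-vision extras. Its `get_value()` simply wraps the raw bytes in a PIL image; a minimal standalone sketch of that step, with illustrative sample bytes:

```python
from io import BytesIO

from PIL import Image

# Stand-in for the bytes File.get_value() would return for an image object.
buf = BytesIO()
Image.new("RGB", (2, 2), "red").save(buf, format="PNG")
raw = buf.getvalue()

# This is essentially what ImageFile.get_value() does with those bytes.
img = Image.open(BytesIO(raw))
print(img.size)  # (2, 2)
```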
datachain/lib/meta_formats.py CHANGED
@@ -11,6 +11,7 @@ from collections.abc import Iterator
 from typing import Any, Callable
 
 import jmespath as jsp
+from pydantic import ValidationError
 
 from datachain.lib.feature_utils import pydantic_to_feature  # noqa: F401
 from datachain.lib.file import File
@@ -25,46 +26,48 @@ def generate_uuid():
 # JSON decoder
 def load_json_from_string(json_string):
     try:
-        data = json.loads(json_string)
-        print("Successfully parsed JSON", file=sys.stderr)
-        return data
+        return json.loads(json_string)
     except json.JSONDecodeError:
-        print("Failed to decode JSON: The string is not formatted correctly.")
-        return None
+        print(f"Failed to decode JSON: {json_string} is not formatted correctly.")
+        return None
 
 
-# Read valid JSON and return a data object sample
+# Validate and reduce JSON
 def process_json(data_string, jmespath):
     json_dict = load_json_from_string(data_string)
     if jmespath:
         json_dict = jsp.search(jmespath, json_dict)
-    # we allow non-list JSONs here to print the root schema
-    # but if jmespath expression is given, we assume a list
-    if not isinstance(json_dict, list):
-        raise ValueError("JMESPATH expression must resolve to a list")
-        return None
-    json_dict = json_dict[0]  # sample the first object
-    return json.dumps(json_dict)
+    return json_dict
 
 
 # Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None):
+def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     data_string = ""
-    uid_str = str(generate_uuid()).replace("-", "")  # comply with Python class names
     # using uiid to get around issue #1617
-    model_name = f"Model{uid_str}"
+    if not model_name:
+        uid_str = str(generate_uuid()).replace(
+            "-", ""
+        )  # comply with Python class names
+        model_name = f"Model{data_type}{uid_str}"
     try:
         with source_file.open() as fd:  # CSV can be larger than memory
             if data_type == "csv":
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
                 data_string += fd.readline().decode("utf-8", "ignore").replace("\r", "")
+            elif data_type == "jsonl":
+                data_string = fd.readline().decode("utf-8", "ignore").replace("\r", "")
             else:
                 data_string = fd.read()  # other meta must fit into RAM
     except OSError as e:
         print(f"An unexpected file error occurred: {e}")
         return
-    if data_type == "json":
-        data_string = process_json(data_string, expr)
+    if data_type in ("json", "jsonl"):
+        json_object = process_json(data_string, expr)
+        if data_type == "json" and isinstance(json_object, list):
+            json_object = json_object[0]  # sample the 1st object from JSON array
+        if data_type == "jsonl":
+            data_type = "json"  # treat json line as plain JSON in auto-schema
+        data_string = json.dumps(json_object)
     command = [
         "datamodel-codegen",
         "--input-file-type",
@@ -73,8 +76,8 @@ def read_schema(source_file, data_type="csv", expr=None):
         model_name,
     ]
     try:
-        result = subprocess.run(
-            command,  # noqa: S603
+        result = subprocess.run(  # noqa: S603
+            command,
             input=data_string,
             text=True,
             capture_output=True,
@@ -87,13 +90,19 @@ def read_schema(source_file, data_type="csv", expr=None):
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
     print("\n" + f"spec=pydantic_to_feature({model_name})" + "\n")
+    return model_output
 
 
 #
 # UDF mapper which calls chain in the setup to infer the dynamic schema
 #
-def read_meta(
-    spec=None, schema_from=None, meta_type="json", jmespath=None, show_schema=False
+def read_meta(  # noqa: C901
+    spec=None,
+    schema_from=None,
+    meta_type="json",
+    jmespath=None,
+    show_schema=False,
+    model_name=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
@@ -108,7 +117,7 @@ def read_meta(
         .limit(1)
         .map(  # dummy column created (#1615)
             meta_schema=lambda file: read_schema(
-                file, data_type=meta_type, expr=jmespath
+                file, data_type=meta_type, expr=jmespath, model_name=model_name
             ),
             output=str,
         )
@@ -119,6 +128,7 @@ def read_meta(
     sys.stdout = current_stdout
     model_output = captured_output.getvalue()
     captured_output.close()
+
     if show_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted Feature from Pydantic datamodel
@@ -135,30 +145,52 @@ def read_meta(
     #
     # UDF mapper parsing a JSON or CSV file using schema spec
     #
+
     def parse_data(
-        file: File, data_model=spec, meta_type=meta_type, jmespath=jmespath
+        file: File,
+        DataModel=spec,  # noqa: N803
+        meta_type=meta_type,
+        jmespath=jmespath,
     ) -> Iterator[spec]:
+        def validator(json_object: dict) -> spec:
+            json_string = json.dumps(json_object)
+            try:
+                data_instance = DataModel.model_validate_json(json_string)
+                yield data_instance
+            except ValidationError as e:
+                print(f"Validation error occurred in file {file.name}:", e)
+
         if meta_type == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
                reader = csv.DictReader(fd)
                for row in reader:  # CSV can be larger than memory
-                    json_string = json.dumps(row)
-                    yield data_model.model_validate_json(json_string)
+                    yield from validator(row)
+
        if meta_type == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
            except OSError as e:
-                print(f"An unexpected file error occurred: {e}")
-            json_object = load_json_from_string(data_string)
-            if jmespath:
-                json_object = jsp.search(jmespath, json_object)
+                print(f"An unexpected file error occurred in file {file.name}: {e}")
+            json_object = process_json(data_string, jmespath)
            if not isinstance(json_object, list):
-                raise ValueError("JSON expression must resolve in a list of objects")
-            for json_dict in json_object:
-                json_string = json.dumps(json_dict)
-                yield data_model.model_validate_json(json_string)
+                yield from validator(json_object)
+
+            else:
+                for json_dict in json_object:
+                    yield from validator(json_dict)
+
+        if meta_type == "jsonl":
+            try:
+                with file.open() as fd:
+                    data_string = fd.readline().replace("\r", "")
+                    while data_string:
+                        json_object = process_json(data_string, jmespath)
+                        data_string = fd.readline()
+                        yield from validator(json_object)
+            except OSError as e:
+                print(f"An unexpected file error occurred in file {file.name}: {e}")
 
     return parse_data
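The meta_formats.py rewrite adds JSONL support and per-row validation: each object is reduced with an optional jmespath expression, validated against the pydantic model, and invalid rows are reported and skipped instead of aborting the whole file. A standalone sketch of that pattern outside of DataChain, with an illustrative `Record` model and sample lines:

```python
import json

import jmespath as jsp
from pydantic import BaseModel, ValidationError


class Record(BaseModel):
    id: int
    name: str


lines = [
    '{"payload": {"id": 1, "name": "a"}}',
    '{"payload": {"id": "oops", "name": "b"}}',  # invalid: reported and skipped
]


def parse_jsonl(lines, expr="payload"):
    for line in lines:
        obj = json.loads(line)
        if expr:
            obj = jsp.search(expr, obj)  # what process_json() does
        try:
            # what the inner validator() does
            yield Record.model_validate_json(json.dumps(obj))
        except ValidationError as e:
            print("Validation error occurred:", e)


print(list(parse_jsonl(lines)))  # [Record(id=1, name='a')]
```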