datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
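
Several of the entries above belong to one change: a new Arrow-based tabular ingestion path (datachain/lib/arrow.py is added with 85 lines, datachain/lib/parquet.py is removed, and datachain/lib/dc.py gains parse_tabular, parse_csv and parse_parquet). The sketch below shows how that API reads, assembled from the docstrings in the dc.py diff that follows; the bucket and file names are placeholders, and the import path is the module shown in the diff rather than a confirmed public entry point.

```py
from datachain.lib.dc import C, DataChain

# Column names and types are inferred from the CSV files themselves.
csv_chain = (
    DataChain.from_storage("s3://mybucket")
    .filter(C("file.name").glob("*.csv"))
    .parse_csv()
)

# parse_parquet() wraps parse_tabular(format="parquet", partitioning="hive").
pq_chain = (
    DataChain.from_storage("s3://mybucket/file.parquet")
    .parse_parquet()
)
```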
datachain/lib/dc.py CHANGED
@@ -1,12 +1,21 @@
+import re
 from collections.abc import Iterator, Sequence
-from typing import TYPE_CHECKING, Callable, ClassVar, Literal, Optional, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Literal,
+    Optional,
+    Union,
+)
 
 import sqlalchemy
 
 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
 from datachain.lib.file import File, get_file
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
@@ -27,6 +36,7 @@ from datachain.query.dataset import (
 from datachain.query.schema import Column, DatasetRow
 
 if TYPE_CHECKING:
+    import pandas as pd
     from typing_extensions import Self
 
 C = Column
@@ -68,44 +78,43 @@ class DataChain(DatasetQuery):
     The supported set of field types include: majority of the type supported by the
     underlyind library `Pydantic`.
 
-    See Also
-    --------
-    DataChain.from_storage("s3://my-bucket/my-dir/") - reading unstructured data files
-    from storages such as S3, gs or Azure ADLS.
-
-    DataChain.save("name") - saving to a dataset.
-
-    DataChain.from_dataset("name") - reading from a dataset.
-
-    DataChain.from_features(fib=[1, 2, 3, 5, 8]) - generating from a values.
-
-
-    Examples
-    --------
-
-    >>> from datachain import DataChain, Feature
-    >>> from datachain.lib.claude import claude_processor
-    >>>
-    >>> class Rating(Feature):
-    >>>   status: str = ""
-    >>>   explanation: str = ""
-    >>>
-    >>> PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
-    >>> MODEL = "claude-3-opus-20240229"
-    >>>
-    >>> chain = (
-    >>>     DataChain.from_storage("s3://my-bucket/my")
-    >>>     .filter(C.name.glob("*.txt"))
-    >>>     .limit(5)
-    >>>     .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
-    >>>     .map(
-    >>>         rating=lambda claude: Rating(
-    >>>             **(json.loads(claude.content[0].text) if claude.content else {})
-    >>>         ),
-    >>>         output=Rating,
-    >>>     )
-    >>> chain.save("ratings")
-    >>> print(chain)
+    See Also:
+        `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        data files from storages such as S3, gs or Azure ADLS.
+
+        `DataChain.save("name")` - saving to a dataset.
+
+        `DataChain.from_dataset("name")` - reading from a dataset.
+
+        `DataChain.from_features(fib=[1, 2, 3, 5, 8])` - generating from a values.
+
+
+    Example:
+        ```py
+        from datachain import DataChain, Feature
+        from datachain.lib.claude import claude_processor
+
+        class Rating(Feature):
+            status: str = ""
+            explanation: str = ""
+
+        PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
+        MODEL = "claude-3-opus-20240229"
+
+        chain = (
+            DataChain.from_storage("s3://my-bucket/my")
+            .filter(C.name.glob("*.txt"))
+            .limit(5)
+            .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
+            .map(
+                rating=lambda claude: Rating(
+                    **(json.loads(claude.content[0].text) if claude.content else {})
+                ),
+                output=Rating,
+            )
+        chain.save("ratings")
+        print(chain)
+        ```
     """
 
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
@@ -119,8 +128,7 @@ class DataChain(DatasetQuery):
 
     def __init__(self, *args, **kwargs):
         """This method needs to be redefined as a part of Dataset and DacaChin
-        decoupling
-        """
+        decoupling."""
         super().__init__(
             *args,
             **kwargs,
@@ -133,6 +141,16 @@ class DataChain(DatasetQuery):
         else:
             self.signals_schema = SignalSchema.from_column_types(self.column_types)
 
+    @property
+    def schema(self):
+        return self.signals_schema.values if self.signals_schema else None
+
+    def print_schema(self):
+        self.signals_schema.print_tree()
+
+    def create_model(self, name: str) -> type[Feature]:
+        return self.signals_schema.create_model(name)
+
     def settings(
         self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
     ) -> "Self":
@@ -141,29 +159,28 @@ class DataChain(DatasetQuery):
         This function changes specified settings without changing not specified ones.
         It returns chain, so, it can be chained later with next operation.
 
-        Parameters
-        ----------
-        cache : data caching (default=False)
-        batch : size of the batch (default=1000)
-        parallel : number of thread for processors. True is a special value to
-          enable all available CPUs (default=1)
-        workers : number of distributed workers. Only for Studio mode. (default=1)
-        min_task_size : minimum number of tasks (default=1)
-
-        Examples
-        --------
-
-        >>> chain = (
-        >>>     chain
-        >>>     .settings(cache=True, parallel=8)
-        >>>     .map(laion=process_webdataset(spec=WDSLaion), params="file")
-        >>> )
+        Parameters:
+            cache : data caching (default=False)
+            batch : size of the batch (default=1000)
+            parallel : number of thread for processors. True is a special value to
+                enable all available CPUs (default=1)
+            workers : number of distributed workers. Only for Studio mode. (default=1)
+            min_task_size : minimum number of tasks (default=1)
+
+        Example:
+            ```py
+            chain = (
+                chain
+                .settings(cache=True, parallel=8)
+                .map(laion=process_webdataset(spec=WDSLaion), params="file")
+            )
+            ```
         """
         self._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
         return self
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
-        """Reset all settings to default values"""
+        """Reset all settings to default values."""
        self._settings = settings if settings else Settings()
        return self
 
@@ -184,39 +201,37 @@ class DataChain(DatasetQuery):
         cls,
         path,
         type: Literal["binary", "text", "image"] = "binary",
+        recursive: Optional[bool] = True,
         anon: bool = False,
     ) -> "DataChain":
-        """Get data from a storage as a list of file with all file attributes.
-        It returns the chain itself as usual.
+        """Get data from a storage as a list of file with all file attributes. It
+        returns the chain itself as usual.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-
-        Examples
-        --------
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            recursive : search recursively for the given path.
+            anon : use anonymous mode to access the storage.
 
-        >>> chain = DataChain.from_storage("s3://my-bucket/my-dir")
+        Example:
+            ```py
+            chain = DataChain.from_storage("s3://my-bucket/my-dir")
+            ```
         """
         func = get_file(type)
-        return DataChain(path, anon=anon).map(file=func)
+        return DataChain(path, recursive=recursive, anon=anon).map(file=func)
 
     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
         """Get data from dataset. It returns the chain itself.
 
-        Parameters
-        ----------
-        name : dataset name
-        version : dataset version
-
-        Examples
-        --------
+        Parameters:
+            name : dataset name
+            version : dataset version
 
-        >>> chain = DataChain.from_dataset("my_cats")
+        Examples:
+            >>> chain = DataChain.from_dataset("my_cats")
         """
         return DataChain(name=name, version=version)
 
@@ -228,37 +243,44 @@ class DataChain(DatasetQuery):
         anon: bool = False,
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
+        object_name: Optional[str] = "csv",
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
     ) -> "DataChain":
         """Get data from CSV. It returns the chain itself.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-        spec : optional Data Model
-        schema_from : path to sample to infer spec from
-        show_schema : print auto-generated schema
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "text".
+            anon : use anonymous mode to access the storage.
+            spec : Data Model for CSV file
+            object_name : generated object column name
+            model_name : generated model name
+            schema_from : path to sample to infer spec from
+            show_schema : print auto-generated schema
 
-        Examples
-        --------
+        Examples:
+            infer model from the first two lines (header + data)
+            >>> chain = DataChain.from_csv("gs://csv")
 
-        >>> chain = DataChain.from_csv("gs://csv")
+            use a particular data model
+            >>> chain = DataChain.from_csv("gs://csv"i, spec=MyModel)
         """
         if schema_from == "auto":
             schema_from = path
 
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        return chain.gen(
-            csv=read_meta(
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
                 meta_type="csv",
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
             )
-        )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     @classmethod
     def from_json(
@@ -269,50 +291,104 @@ class DataChain(DatasetQuery):
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
+        object_name: Optional[str] = None,
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "json",
     ) -> "DataChain":
-        """Get data from CSV. It returns the chain itself.
+        """Get data from JSON. It returns the chain itself.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-        spec : optional Data Model
-        schema_from : path to sample to infer spec from
-        show_schema : print auto-generated schema
-        jmespath : JMESPATH expression to reduce JSON
-        name : return object name
-        Examples
-        --------
-
-        >>> chain = DataChain.from_json("gs://json")
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            anon : use anonymous mode to access the storage.
+            spec : optional Data Model
+            schema_from : path to sample to infer spec from
+            object_name : generated object column name
+            model_name : generated model name
+            show_schema : print auto-generated schema
+            jmespath : JMESPATH expression to reduce JSON
+
+        Examples:
+            infer JSON schema from data, reduce using JMESPATH, print schema
+            >>> chain = DataChain.from_json("gs://json", jmespath="key1.key2")
+
+            infer JSON schema from a particular path, print data model
+            >>> chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
         """
         if schema_from == "auto":
             schema_from = path
 
+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = "json"
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        return chain.gen(
-            json=read_meta(
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
-                meta_type="json",
+                meta_type=meta_type,
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
                 jmespath=jmespath,
             )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+
+    def show_json_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+
+        Examples:
+            print JSON schema and save to column "meta_from":
+            >>> uri = "gs://datachain-demo/coco2017/annotations_captions/"
+            >>> chain = DataChain.from_storage(uri)
+            >>> chain = chain.show_json_schema()
+            >>> chain.save()
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="json", expr=jmespath, model_name=model_name
+            ),
+            output=str,
+        )
+
+    def show_jsonl_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="jsonl", expr=jmespath, model_name=model_name
+            ),
+            output=str,
         )
 
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
     ) -> "DataChain":
-        """Save to a Dataset. It returns the chain itself
+        """Save to a Dataset. It returns the chain itself.
 
-        Parameters
-        ----------
-        name : dataset name. Empty name saves to a temporary dataset that will be
-          removed after process ends. Temp dataset are useful for optimization.
-        version : version of a dataset. Default - the last version that exist.
+        Parameters:
+            name : dataset name. Empty name saves to a temporary dataset that will be
+                removed after process ends. Temp dataset are useful for optimization.
+            version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.serialize()
         return super().save(name=name, version=version, feature_schema=schema)
@@ -333,29 +409,26 @@ class DataChain(DatasetQuery):
         Input-output relationship: 1:1
 
         Parameters:
+            func : Function applied to each row.
+            params : List of column names used as input for the function. Default
+                is taken from function signature.
+            output : Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature. Default can be also
+                taken from kwargs - **signal_map (see below).
+                If signal name is defined using signal_map (see below) only a single
+                type value can be used.
+            **signal_map : kwargs can be used to define `func` together with it's return
+                signal name in format of `map(my_sign=my_func)`. This helps define
+                signal names and function in a nicer way.
+
+        Examples:
+            Using signal_map and single type in output:
+            >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
+            >>> chain.save("new_dataset")
 
-        func : Function applied to each row.
-        params : List of column names used as input for the function. Default
-          is taken from function signature.
-        output : Dictionary defining new signals and their corresponding types. Default
-          type is taken from function signature. Default can be also taken from
-          kwargs - **signal_map (see below).
-          If signal name is defined using signal_map (see below) only a single
-          type value can be used.
-        **signal_map : kwargs can be used to define `func` together with it's return
-          signal name in format of `map(my_sign=my_func)`. This helps define
-          signal names and function in a nicer way.
-
-        Examples
-        --------
-
-        Using signal_map and single type in output:
-        >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
-        >>> chain.save("new_dataset")
-
-        Using func and output as a map:
-        >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
-        >>> chain.save("new_dataset")
+            Using func and output as a map:
+            >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+            >>> chain.save("new_dataset")
         """
 
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
@@ -375,9 +448,8 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """
-        Apply a function to each row to create new rows (with potentially new signals).
-        The function needs to return a new objects for each of the new rows.
+        """Apply a function to each row to create new rows (with potentially new
+        signals). The function needs to return a new objects for each of the new rows.
         It returns a chain itself with new signals.
 
         Input-output relationship: 1:N
@@ -435,7 +507,9 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """This is a batch version of map(). It accepts the same parameters plus an
+        """This is a batch version of map().
+
+        It accepts the same parameters plus an
         additional parameter:
         """
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
@@ -455,7 +529,7 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]],
         signal_map,
     ) -> UDFBase:
-        is_generator = issubclass(target_class, (Generator, Aggregator, BatchMapper))
+        is_generator = target_class.is_output_batched
         name = self.name or "Unknown"
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
 
@@ -476,7 +550,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select(self, *args: str) -> "Self":
-        """Select only a specified set of signals"""
+        """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -485,7 +559,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select_except(self, *args: str) -> "Self":
-        """Select all the signals expect the specified signals"""
+        """Select all the signals expect the specified signals."""
         new_schema = self.signals_schema.select_except_signals(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -494,6 +568,7 @@ class DataChain(DatasetQuery):
 
     def get_values(self, *cols: str) -> Iterator[list]:
         """Iterate over rows, getting feature values and applying reader calls.
+
         If columns are specified - limit them to specified columns.
         """
         for features in self.iterate(*cols):
@@ -504,7 +579,9 @@ class DataChain(DatasetQuery):
             yield item[0]
 
     def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
-        """Iterate over rows. If columns are specified - limit them to specified
+        """Iterate over rows.
+
+        If columns are specified - limit them to specified
         columns.
         """
         chain = self.select(*cols) if cols else self
@@ -563,20 +640,19 @@ class DataChain(DatasetQuery):
     ) -> "Self":
         """Merge two chains based on the specified criteria.
 
-        Parameters
-        ----------
-        right_ds : Chain to join with.
-        on : Predicate or list of Predicates to join on. If both chains have the same
-          predicates then this predicate is enough for the join. Otherwise,
-          `right_on` parameter has to specify the predicates for the other chain.
-        right_on: Optional predicate or list of Predicates for the `right_ds` to join.
-        inner: Whether to run inner join or outer join. Default is False.
-        rname: name prefix for conflicting signal names. Default: "{name}_right"
+        Parameters:
+            right_ds : Chain to join with.
+            on : Predicate or list of Predicates to join on. If both chains have the
+                same predicates then this predicate is enough for the join. Otherwise,
+                `right_on` parameter has to specify the predicates for the other chain.
+            right_on: Optional predicate or list of Predicates
+                for the `right_ds` to join.
+            inner (bool): Whether to run inner join or outer join.
+            rname (str): name prefix for conflicting signal names.
 
         Examples:
-        >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
-                                  right_on=(C.name, C.pq__index))
-
+            >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
+                                      right_on=(C.name, C.pq__index))
         """
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
@@ -599,7 +675,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                f"'right_on' must be 'str' or 'Sequence' object"
+                "'right_on' must be 'str' or 'Sequence' object"
                 f" but got type '{right_on}'",
             )
 
@@ -616,7 +692,7 @@ class DataChain(DatasetQuery):
                 raise DatasetMergeError(
                     on,
                     right_on,
-                    f"'on' and 'right_on' must have the same number of columns in db'."
+                    "'on' and 'right_on' must have the same number of columns in db'."
                     f" on -> {on_str}, right_on -> {right_on_str}",
                 )
             else:
@@ -654,7 +730,7 @@ class DataChain(DatasetQuery):
 
     @classmethod
     def from_pandas(  # type: ignore[override]
-        cls, df, name: str = "", session: Optional[Session] = None
+        cls, df: "pd.DataFrame", name: str = "", session: Optional[Session] = None
     ) -> "DataChain":
         """Generate chain from pandas data-frame."""
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
@@ -664,7 +740,7 @@ class DataChain(DatasetQuery):
                 raise DatasetPrepareError(
                     name,
                     f"import from pandas error - column '{column}' conflicts with"
-                    f" default schema",
+                    " default schema",
                 )
             if not column.isidentifier():
                 raise DatasetPrepareError(
@@ -674,6 +750,131 @@ class DataChain(DatasetQuery):
 
         return cls.from_features(name, session, **fr_map)
 
+    def parse_tabular(
+        self,
+        output: Optional[dict[str, FeatureType]] = None,
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from list of tabular files.
+
+        Parameters:
+            output : Dictionary defining column names and their corresponding types.
+            kwargs : Parameters to pass to pyarrow.dataset.dataset.
+
+        Examples:
+            Reading a json lines file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.jsonl")
+            >>> dc = dc.parse_tabular(format="json")
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.jsonl"))
+            >>> dc = dc.parse_tabular(format="json")
+        """
+        from pyarrow import unify_schemas
+        from pyarrow.dataset import dataset
+
+        from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+
+        schema = None
+        if output:
+            output = {"source": Source} | output
+        else:
+            schemas = []
+            for row in self.select("file").iterate():
+                file = row[0]
+                ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+                schemas.append(ds.schema)
+            if not schemas:
+                msg = "error parsing tabular data schema - found no files to parse"
+                raise DatasetPrepareError(self.name, msg)
+            schema = unify_schemas(schemas)
+            try:
+                output = schema_to_output(schema)
+                print(f"Inferred tabular data schema: {output}")
+            except ValueError as e:
+                raise DatasetPrepareError(self.name, e) from e
+
+        return self.gen(ArrowGenerator(schema, **kwargs), output=output)
+
+    def parse_csv(
+        self,
+        delimiter: str = ",",
+        header: bool = True,
+        column_names: Optional[list[str]] = None,
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of csv files.
+
+        Parameters:
+            delimiter : Character for delimiting columns.
+            header : Whether the files include a header row.
+            column_names : Column names if no header. Implies `header = False`.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a csv file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.csv")
+            >>> dc = dc.parse_tabular(format="csv")
+
+            Reading a filtered list of csv files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.csv"))
+            >>> dc = dc.parse_tabular()
+        """
+        from pyarrow.csv import ParseOptions, ReadOptions
+        from pyarrow.dataset import CsvFileFormat
+
+        if column_names and output:
+            msg = "error parsing csv - only one of column_names or output is allowed"
+            raise DatasetPrepareError(self.name, msg)
+
+        if not header and not column_names:
+            if output:
+                column_names = list(output.keys())
+            else:
+                msg = "error parsing csv - provide column_names or output if no header"
+                raise DatasetPrepareError(self.name, msg)
+
+        parse_options = ParseOptions(delimiter=delimiter)
+        read_options = ReadOptions(column_names=column_names)
+        format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+        return self.parse_tabular(output=output, format=format)
+
+    def parse_parquet(
+        self,
+        partitioning: Any = "hive",
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of parquet files.
+
+        Parameters:
+            partitioning : Any pyarrow partitioning schema.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a single file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.parquet")
+            >>> dc = dc.parse_tabular()
+
+            Reading a partitioned dataset from a directory:
+            >>> dc = DataChain.from_storage("path/to/dir")
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.parquet"))
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of partitions as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.parent").glob("*month=1*"))
+            >>> dc = dc.parse_tabular()
+        """
+        return self.parse_tabular(
+            output=output, format="parquet", partitioning=partitioning
+        )
+
     @classmethod
     def create_empty(
         cls,
@@ -683,17 +884,13 @@ class DataChain(DatasetQuery):
         """Create empty chain. Returns a chain. This method is used for programmatically
         generating a chains in contrast of reading data from storages or other sources.
 
-        Parameters
-        ----------
-
-        to_insert : records (or a single record) to insert. Each record is a dictionary
-          of signals and theirs values.
-
-        Examples
-        --------
+        Parameters:
+            to_insert : records (or a single record) to insert. Each record is
+                a dictionary of signals and theirs values.
 
-        >>> empty = DataChain.create_empty()
-        >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
+        Examples:
+            >>> empty = DataChain.create_empty()
+            >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
         """
         session = Session.get(session)
         dsr = cls.create_empty_record(session=session)
@@ -740,10 +937,12 @@ class DataChain(DatasetQuery):
     @detach
    def chunk(self, index: int, total: int) -> "DataChain":
         """Split a query into smaller chunks for e.g. parallelization.
-        Example:
+
+        Examples:
            >>> dc = DataChain(...)
            >>> chunk_1 = dc._chunk(0, 2)
            >>> chunk_2 = dc._chunk(1, 2)
+
        Note:
            Bear in mind that `index` is 0-indexed but `total` isn't.
            Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
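
One behavioral detail from the from_json hunk above is easy to miss: when `object_name` is not passed, the new code derives the output column name from the JMESPATH expression, falling back to "json". A small self-contained sketch of that helper's logic, reproduced from the diff (the sample expressions are illustrative):

```py
import re


def jmespath_to_name(s: str) -> str:
    # Keep the prefix of the expression up to the first non-word character,
    # mirroring the helper added inside DataChain.from_json in this release.
    match = re.search(r"\W", s)
    return s[: match.start()] if match else s


print(jmespath_to_name("key1.key2"))    # -> "key1", used as the object column name
print(jmespath_to_name("annotations"))  # -> "annotations"
# With neither jmespath nor object_name given, the column name falls back to "json".
```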