datachain 0.7.11__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

datachain/lib/listing.py CHANGED
@@ -15,6 +15,7 @@ from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
+    from datachain.query.session import Session
 
 LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name
@@ -108,3 +109,46 @@ def listing_uri_from_name(dataset_name: str) -> str:
     if not is_listing_dataset(dataset_name):
         raise ValueError(f"Dataset {dataset_name} is not a listing")
     return dataset_name.removeprefix(LISTING_PREFIX)
+
+
+def get_listing(
+    uri: str, session: "Session", update: bool = False
+) -> tuple[str, str, str, bool]:
+    """Returns the correct listing dataset name to use when saving a listing
+    operation, taking existing listings and their reusability into account.
+    It also returns a boolean indicating whether the returned dataset name is
+    reused / already exists (on update it always returns False, simply because
+    there was no reason to complicate it so far), and the correct listing path
+    that should be used to find rows based on the uri.
+    """
+    from datachain.client.local import FileClient
+
+    catalog = session.catalog
+    cache = catalog.cache
+    client_config = catalog.client_config
+
+    client = Client.get_client(uri, cache, **client_config)
+    ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
+    listing = None
+
+    listings = [
+        ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
+    ]
+
+    # if there is no need to update, choose the most recent listing;
+    # otherwise, use the exact original `ds_name`:
+    #   - if a "bigger" listing exists, we don't want to update it; it's better
+    #     to create a new "smaller" one on "update=True"
+    #   - if an exact listing exists, it will have the same name as `ds_name`
+    #     anyway below
+    if listings and not update:
+        listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
+
+    # for the local file system we need to fix the listing path / prefix
+    # if we are reusing an existing listing
+    if isinstance(client, FileClient) and listing and listing.name != ds_name:
+        list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+
+    ds_name = listing.name if listing else ds_name
+
+    return ds_name, list_uri, list_path, bool(listing)
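
For orientation, a rough usage sketch of the new helper, inferred from the signature above; the storage URI is a placeholder and the way the Session is obtained here is an assumption, not something this diff shows:

    from datachain.lib.listing import get_listing
    from datachain.query.session import Session

    session = Session.get()  # assumption: a default session factory; use your own session setup
    ds_name, list_uri, list_path, exists = get_listing(
        "s3://example-bucket/images/", session, update=False
    )
    if not exists:
        # the caller is expected to (re)create the listing dataset under ds_name
        print(f"would list {list_uri} into dataset {ds_name}")
    print(f"rows for this uri are found under path prefix {list_path!r}")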
@@ -38,38 +38,41 @@ def process_json(data_string, jmespath):
     return json_dict
 
 
-# Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None, model_name=None):
+def gen_datamodel_code(
+    source_file, format="json", jmespath=None, model_name=None
+) -> str:
+    """Generates Python code with Pydantic models that corresponds
+    to the provided JSON, CSV, or JSONL file.
+    It supports root JSON arrays (samples the first entry).
+    """
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
         # comply with Python class names
         uid_str = str(generate_uuid()).replace("-", "")
-        model_name = f"Model{data_type}{uid_str}"
-    try:
-        with source_file.open() as fd:  # CSV can be larger than memory
-            if data_type == "csv":
-                data_string += fd.readline().replace("\r", "")
-                data_string += fd.readline().replace("\r", "")
-            elif data_type == "jsonl":
-                data_string = fd.readline().replace("\r", "")
-            else:
-                data_string = fd.read()  # other meta must fit into RAM
-    except OSError as e:
-        print(f"An unexpected file error occurred: {e}")
-        return
-    if data_type in ("json", "jsonl"):
-        json_object = process_json(data_string, expr)
-        if data_type == "json" and isinstance(json_object, list):
+        model_name = f"Model{format}{uid_str}"
+
+    with source_file.open() as fd:  # CSV can be larger than memory
+        if format == "csv":
+            data_string += fd.readline().replace("\r", "")
+            data_string += fd.readline().replace("\r", "")
+        elif format == "jsonl":
+            data_string = fd.readline().replace("\r", "")
+        else:
+            data_string = fd.read()  # other meta must fit into RAM
+
+    if format in ("json", "jsonl"):
+        json_object = process_json(data_string, jmespath)
+        if format == "json" and isinstance(json_object, list):
             json_object = json_object[0]  # sample the 1st object from JSON array
-        if data_type == "jsonl":
-            data_type = "json"  # treat json line as plain JSON in auto-schema
+        if format == "jsonl":
+            format = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)
 
     import datamodel_code_generator
 
     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
-    input_file_type = input_file_types[data_type]
+    input_file_type = input_file_types[format]
     with tempfile.TemporaryDirectory() as tmpdir:
         output = Path(tmpdir) / "model.py"
         datamodel_code_generator.generate(
@@ -95,36 +98,29 @@ spec = {model_name}
 def read_meta(  # noqa: C901
     spec=None,
     schema_from=None,
-    meta_type="json",
+    format="json",
     jmespath=None,
-    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain
 
     if schema_from:
-        chain = (
-            DataChain.from_storage(schema_from, type="text")
-            .limit(1)
-            .map(  # dummy column created (#1615)
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        file = next(
+            DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
         )
-        (model_output,) = chain.collect("meta_schema")
-        assert isinstance(model_output, str)
-        if print_schema:
-            print(f"{model_output}")
+        model_code = gen_datamodel_code(
+            file, format=format, jmespath=jmespath, model_name=model_name
+        )
+        assert isinstance(model_code, str)
+
         # Below 'spec' should be a dynamically converted DataModel from Pydantic
         if not spec:
            gl = globals()
-            exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+            exec(model_code, gl)  # type: ignore[arg-type] # noqa: S102
            spec = gl["spec"]
 
-    if not (spec) and not (schema_from):
+    if not spec and not schema_from:
         raise ValueError(
             "Must provide a static schema in spec: or metadata sample in schema_from:"
         )
@@ -136,7 +132,7 @@ def read_meta(  # noqa: C901
     def parse_data(
         file: File,
         data_model=spec,
-        meta_type=meta_type,
+        format=format,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
@@ -148,7 +144,7 @@ def read_meta(  # noqa: C901
             except ValidationError as e:
                 print(f"Validation error occurred in row {nrow} file {file.name}:", e)
 
-        if meta_type == "csv":
+        if format == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
@@ -156,7 +152,7 @@
                for row in reader:  # CSV can be larger than memory
                    yield from validator(row)
 
-        if meta_type == "json":
+        if format == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
@@ -174,7 +170,7 @@
                    return
            yield from validator(json_dict, nrow)
 
-        if meta_type == "jsonl":
+        if format == "jsonl":
            try:
                nrow = 0
                with file.open() as fd:
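
As a quick illustration of the renamed helper, the sketch below samples one file the same way the new read_meta body does and passes it to gen_datamodel_code; the storage path and model name are placeholders, and gen_datamodel_code is assumed to be importable from the module patched above:

    from datachain.lib.dc import DataChain
    # assumption: gen_datamodel_code imported from the module this hunk modifies

    file = next(
        DataChain.from_storage("s3://example-bucket/sample.jsonl", type="text")
        .limit(1)
        .collect("file")
    )
    model_code = gen_datamodel_code(file, format="jsonl", model_name="SampleModel")
    # model_code is Python source defining Pydantic models; per the
    # `spec = {model_name}` template visible in the hunk context above,
    # it also binds the generated model to a `spec` variable.
    print(model_code)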
datachain/lib/udf.py CHANGED
@@ -85,7 +85,6 @@ class UDFAdapter:
         udf_fields: "Sequence[str]",
         udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
-        is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
datachain/query/batch.py CHANGED
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Optional, Union
 
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
+from datachain.query.utils import get_query_column, get_query_id_column
 
 if TYPE_CHECKING:
     from sqlalchemy import Select
@@ -23,11 +24,14 @@ RowsOutput = Union[Sequence, RowsOutputBatch]
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""
 
+    is_batching: bool
+
     @abstractmethod
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
@@ -38,11 +42,16 @@ class NoBatching(BatchingStrategy):
     batch UDF calls.
     """
 
+    is_batching = False
+
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[Sequence, None, None]:
+        if ids_only:
+            query = query.with_only_columns(get_query_id_column(query))
         return execute(query)
 
 
@@ -52,14 +61,20 @@ class Batch(BatchingStrategy):
     is passed a sequence of multiple parameter sets.
     """
 
+    is_batching = True
+
     def __init__(self, count: int):
         self.count = count
 
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutputBatch, None, None]:
+        if ids_only:
+            query = query.with_only_columns(get_query_id_column(query))
+
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
@@ -84,19 +99,30 @@ class Partition(BatchingStrategy):
     Dataset rows need to be sorted by the grouping column.
     """
 
+    is_batching = True
+
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutputBatch, None, None]:
+        id_col = get_query_id_column(query)
+        if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
+            raise RuntimeError("partition column not found in query")
+
+        if ids_only:
+            query = query.with_only_columns(id_col, partition_col)
+
         current_partition: Optional[int] = None
         batch: list[Sequence] = []
 
         query_fields = [str(c.name) for c in query.selected_columns]
+        id_column_idx = query_fields.index("sys__id")
         partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
 
         ordered_query = query.order_by(None).order_by(
-            PARTITION_COLUMN_ID,
+            partition_col,
            *query._order_by_clauses,
        )
 
@@ -108,7 +134,7 @@
                    if len(batch) > 0:
                        yield RowsOutputBatch(batch)
                        batch = []
-                batch.append(row)
+                batch.append([row[id_column_idx]] if ids_only else row)
 
        if len(batch) > 0:
            yield RowsOutputBatch(batch)
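
To make the new ids_only flag concrete, here is a minimal sketch of the query narrowing the strategies above perform, written with plain SQLAlchemy; the table and columns are invented for illustration, and get_query_id_column is assumed to return the query's sys__id column, as the surrounding code suggests:

    import sqlalchemy as sa
    from datachain.query.utils import get_query_id_column

    metadata = sa.MetaData()
    rows = sa.Table(
        "rows",
        metadata,
        sa.Column("sys__id", sa.Integer),
        sa.Column("val", sa.Text),
    )
    query = sa.select(rows)

    # the same narrowing the batching strategies apply when ids_only=True
    ids_query = query.with_only_columns(get_query_id_column(query))
    print(ids_query)  # roughly: SELECT rows.sys__id FROM rows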
@@ -43,8 +43,9 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
-from datachain.lib.udf import UDFAdapter
 from datachain.progress import CombinedDownloadCallback
+from datachain.query.schema import C, UDFParamSpec, normalize_param
+from datachain.query.session import Session
 from datachain.sql.functions.random import rand
 from datachain.utils import (
     batched,
@@ -53,9 +54,6 @@ from datachain.utils import (
     get_datachain_executable,
 )
 
-from .schema import C, UDFParamSpec, normalize_param
-from .session import Session
-
 if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ClauseElement
     from sqlalchemy.sql.schema import Table
@@ -65,7 +63,8 @@ if TYPE_CHECKING:
     from datachain.catalog import Catalog
     from datachain.data_storage import AbstractWarehouse
     from datachain.dataset import DatasetRecord
-    from datachain.lib.udf import UDFResult
+    from datachain.lib.udf import UDFAdapter, UDFResult
+    from datachain.query.udf import UdfInfo
 
 P = ParamSpec("P")
 
@@ -301,7 +300,7 @@ def adjust_outputs(
     return row
 
 
-def get_udf_col_types(warehouse: "AbstractWarehouse", udf: UDFAdapter) -> list[tuple]:
+def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list[tuple]:
     """Optimization: Precompute UDF column types so these don't have to be computed
     in the convert_type function for each row in a loop."""
     dialect = warehouse.db.dialect
@@ -322,7 +321,7 @@ def process_udf_outputs(
     warehouse: "AbstractWarehouse",
     udf_table: "Table",
     udf_results: Iterator[Iterable["UDFResult"]],
-    udf: UDFAdapter,
+    udf: "UDFAdapter",
     batch_size: int = INSERT_BATCH_SIZE,
     cb: Callback = DEFAULT_CALLBACK,
 ) -> None:
@@ -347,6 +346,8 @@ def process_udf_outputs(
     for row_chunk in batched(rows, batch_size):
         warehouse.insert_rows(udf_table, row_chunk)
 
+    warehouse.insert_rows_done(udf_table)
+
 
 def get_download_callback() -> Callback:
     return CombinedDownloadCallback(
@@ -366,7 +367,7 @@ def get_generated_callback(is_generator: bool = False) -> Callback:
 
 @frozen
 class UDFStep(Step, ABC):
-    udf: UDFAdapter
+    udf: "UDFAdapter"
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
     parallel: Optional[int] = None
@@ -440,7 +441,7 @@ class UDFStep(Step, ABC):
                 raise RuntimeError(
                     "In-memory databases cannot be used with parallel processing."
                 )
-            udf_info = {
+            udf_info: UdfInfo = {
                "udf_data": filtered_cloudpickle_dumps(self.udf),
                "catalog_init": self.catalog.get_init_params(),
                "metastore_clone_params": self.catalog.metastore.clone_params(),
@@ -464,8 +465,8 @@ class UDFStep(Step, ABC):
 
            with subprocess.Popen(cmd, env=envs, stdin=subprocess.PIPE) as process:  # noqa: S603
                process.communicate(process_data)
-                if process.poll():
-                    raise RuntimeError("UDF Execution Failed!")
+                if retval := process.poll():
+                    raise RuntimeError(f"UDF Execution Failed! Exit code: {retval}")
        else:
            # Otherwise process single-threaded (faster for smaller UDFs)
            warehouse = self.catalog.warehouse
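
The reworked error path uses the walrus operator to capture the child's exit status; a minimal standalone illustration of the same pattern (a throwaway command, not datachain's actual worker invocation):

    import subprocess

    # `false` exits with status 1, so the captured retval is truthy and raises
    with subprocess.Popen(["false"], stdin=subprocess.PIPE) as process:
        process.communicate(b"")
        if retval := process.poll():
            raise RuntimeError(f"UDF Execution Failed! Exit code: {retval}")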
@@ -479,7 +480,6 @@ class UDFStep(Step, ABC):
                    udf_fields,
                    udf_inputs,
                    self.catalog,
-                    self.is_generator,
                    self.cache,
                    download_cb,
                    processed_cb,
@@ -496,8 +496,6 @@ class UDFStep(Step, ABC):
                    processed_cb.close()
                    generated_cb.close()
 
-                warehouse.insert_rows_done(udf_table)
-
        except QueryScriptCancelError:
            self.catalog.warehouse.close()
            sys.exit(QUERY_SCRIPT_CANCELED_EXIT_CODE)
@@ -1069,6 +1067,7 @@ class DatasetQuery:
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
         self.starting_step = QueryStep(self.catalog, name, self.version)
+        self.dialect = self.catalog.warehouse.db.dialect
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1490,7 +1489,7 @@
     @detach
     def add_signals(
         self,
-        udf: UDFAdapter,
+        udf: "UDFAdapter",
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
@@ -1534,7 +1533,7 @@
     @detach
     def generate(
         self,
-        udf: UDFAdapter,
+        udf: "UDFAdapter",
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
@@ -1616,7 +1615,9 @@
         )
         version = version or dataset.latest_version
 
-        self.session.add_dataset_version(dataset=dataset, version=version)
+        self.session.add_dataset_version(
+            dataset=dataset, version=version, listing=kwargs.get("listing", False)
+        )
 
         dr = self.catalog.warehouse.dataset_rows(dataset)