datachain 0.17.1__tar.gz → 0.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.17.1 → datachain-0.18.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.17.1/src/datachain.egg-info → datachain-0.18.0}/PKG-INFO +2 -2
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/run.md +6 -0
- {datachain-0.17.1 → datachain-0.18.0}/pyproject.toml +1 -1
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/catalog.py +6 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/job.py +7 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/warehouse.py +1 -1
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/dataset.py +7 -10
- datachain-0.18.0/src/datachain/delta.py +119 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/diff/__init__.py +10 -4
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/datachain.py +89 -2
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/datasets.py +41 -1
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/storage.py +45 -11
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/signal_schema.py +12 -6
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/dataset.py +27 -10
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/remote/studio.py +2 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/studio.py +3 -0
- {datachain-0.17.1 → datachain-0.18.0/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/SOURCES.txt +2 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datachain.py +2 -4
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_dataset_query.py +18 -4
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datasets.py +2 -1
- datachain-0.18.0/tests/func/test_delta.py +383 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_warehouse.py +2 -2
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_cli_studio.py +1 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain.py +47 -9
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_signal_schema.py +11 -11
- {datachain-0.17.1 → datachain-0.18.0}/.cruft.json +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.gitattributes +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/codecov.yaml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/dependabot.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/release.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/.gitignore +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/LICENSE +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/README.rst +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/index.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/contributing.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/examples.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/index.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/overrides/main.html +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/quick-start.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/datachain.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/func.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/index.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/remotes.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/toolkit.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/torch.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/references/udf.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/docs/tutorials.md +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/mkdocs.yml +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/noxfile.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/setup.cfg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/__main__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/asyn.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cache.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/local.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/config.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/error.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/array.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/base.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/func.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/path.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/random.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/string.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/func/window.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/job.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/listing.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/node.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/progress.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/py.typed +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/params.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/session.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/semver.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain/utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/conftest.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/data.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/examples/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/examples/test_examples.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_batching.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_client.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_file.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_func.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_hf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_image.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_listing.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_ls.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_pull.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_query.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_session.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/func/test_video.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_atomicity.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_import_time.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/test_telemetry.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_config.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_func.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_session.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.17.1 → datachain-0.18.0}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -44,7 +44,7 @@ Requires-Dist: datamodel-code-generator>=0.25
|
|
|
44
44
|
Requires-Dist: Pillow<12,>=10.0.0
|
|
45
45
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
46
46
|
Requires-Dist: psutil
|
|
47
|
-
Requires-Dist: huggingface_hub
|
|
47
|
+
Requires-Dist: huggingface_hub
|
|
48
48
|
Requires-Dist: iterative-telemetry>=0.0.10
|
|
49
49
|
Requires-Dist: platformdirs
|
|
50
50
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
@@ -29,6 +29,7 @@ This command runs a job in Studio using the specified query file. You can config
|
|
|
29
29
|
* `--python-version PYTHON_VERSION` - Python version for the job (e.g., 3.9, 3.10, 3.11)
|
|
30
30
|
* `--req-file REQ_FILE` - Python requirements file
|
|
31
31
|
* `--req REQ` - Python package requirements
|
|
32
|
+
* `--priority PRIORITY` - Priority for the job in range 0-5. Lower value is higher priority (default: 5)
|
|
32
33
|
* `-h`, `--help` - Show the help message and exit.
|
|
33
34
|
* `-v`, `--verbose` - Be verbose.
|
|
34
35
|
* `-q`, `--quiet` - Be quiet.
|
|
@@ -65,6 +66,11 @@ datachain job run --env API_KEY=123 --req pandas numpy query.py
|
|
|
65
66
|
datachain job run --repository https://github.com/iterative/datachain query.py
|
|
66
67
|
```
|
|
67
68
|
|
|
69
|
+
7. Run a job with higher priority
|
|
70
|
+
```bash
|
|
71
|
+
datachain job run --priority 2 query.py
|
|
72
|
+
```
|
|
73
|
+
|
|
68
74
|
## Notes
|
|
69
75
|
|
|
70
76
|
* Closing the logs command (e.g., with Ctrl+C) will only stop displaying the logs but will not cancel the job execution
|
|
@@ -48,7 +48,7 @@ dependencies = [
|
|
|
48
48
|
"Pillow>=10.0.0,<12",
|
|
49
49
|
"msgpack>=1.0.4,<2",
|
|
50
50
|
"psutil",
|
|
51
|
-
"huggingface_hub
|
|
51
|
+
"huggingface_hub",
|
|
52
52
|
"iterative-telemetry>=0.0.10",
|
|
53
53
|
"platformdirs",
|
|
54
54
|
"dvc-studio-client>=0.21,<1",
|
|
@@ -779,6 +779,7 @@ class Catalog:
|
|
|
779
779
|
uuid: Optional[str] = None,
|
|
780
780
|
description: Optional[str] = None,
|
|
781
781
|
attrs: Optional[list[str]] = None,
|
|
782
|
+
update_version: Optional[str] = "patch",
|
|
782
783
|
) -> "DatasetRecord":
|
|
783
784
|
"""
|
|
784
785
|
Creates new dataset of a specific version.
|
|
@@ -795,6 +796,11 @@ class Catalog:
|
|
|
795
796
|
try:
|
|
796
797
|
dataset = self.get_dataset(name)
|
|
797
798
|
default_version = dataset.next_version_patch
|
|
799
|
+
if update_version == "major":
|
|
800
|
+
default_version = dataset.next_version_major
|
|
801
|
+
if update_version == "minor":
|
|
802
|
+
default_version = dataset.next_version_minor
|
|
803
|
+
|
|
798
804
|
if (description or attrs) and (
|
|
799
805
|
dataset.description != description or dataset.attrs != attrs
|
|
800
806
|
):
|
|
@@ -82,6 +82,13 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
|
|
|
82
82
|
nargs="+",
|
|
83
83
|
help="Python package requirements",
|
|
84
84
|
)
|
|
85
|
+
studio_run_parser.add_argument(
|
|
86
|
+
"--priority",
|
|
87
|
+
type=int,
|
|
88
|
+
default=5,
|
|
89
|
+
help="Priority for the job in range 0-5. "
|
|
90
|
+
"Lower value is higher priority (default: 5)",
|
|
91
|
+
)
|
|
85
92
|
|
|
86
93
|
studio_ls_help = "List jobs in Studio"
|
|
87
94
|
studio_ls_description = "List jobs in Studio."
|
|
@@ -258,7 +258,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
258
258
|
if Client.is_data_source_uri(dataset_name):
|
|
259
259
|
# for datasets that are created for bucket listing we use different prefix
|
|
260
260
|
prefix = self.DATASET_SOURCE_TABLE_PREFIX
|
|
261
|
-
return f"{prefix}{dataset_name}_{version}"
|
|
261
|
+
return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
|
|
262
262
|
|
|
263
263
|
def temp_table_name(self) -> str:
|
|
264
264
|
return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
|
|
@@ -107,24 +107,21 @@ class DatasetDependency:
|
|
|
107
107
|
dataset_version: Optional[str],
|
|
108
108
|
dataset_version_created_at: Optional[datetime],
|
|
109
109
|
) -> Optional["DatasetDependency"]:
|
|
110
|
-
from datachain.
|
|
111
|
-
from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
|
|
110
|
+
from datachain.lib.listing import is_listing_dataset
|
|
112
111
|
|
|
113
112
|
if not dataset_id:
|
|
114
113
|
return None
|
|
115
114
|
|
|
116
115
|
assert dataset_name is not None
|
|
117
|
-
dependency_type = DatasetDependencyType.DATASET
|
|
118
|
-
dependency_name = dataset_name
|
|
119
|
-
|
|
120
|
-
if is_listing_dataset(dataset_name):
|
|
121
|
-
dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type]
|
|
122
|
-
dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
|
|
123
116
|
|
|
124
117
|
return cls(
|
|
125
118
|
id,
|
|
126
|
-
|
|
127
|
-
|
|
119
|
+
(
|
|
120
|
+
DatasetDependencyType.STORAGE
|
|
121
|
+
if is_listing_dataset(dataset_name)
|
|
122
|
+
else DatasetDependencyType.DATASET
|
|
123
|
+
),
|
|
124
|
+
dataset_name,
|
|
128
125
|
(
|
|
129
126
|
dataset_version # type: ignore[arg-type]
|
|
130
127
|
if dataset_version
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from copy import copy
|
|
3
|
+
from functools import wraps
|
|
4
|
+
from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
|
5
|
+
|
|
6
|
+
import datachain
|
|
7
|
+
from datachain.dataset import DatasetDependency
|
|
8
|
+
from datachain.error import DatasetNotFoundError
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from typing_extensions import Concatenate, ParamSpec
|
|
12
|
+
|
|
13
|
+
from datachain.lib.dc import DataChain
|
|
14
|
+
|
|
15
|
+
P = ParamSpec("P")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
T = TypeVar("T", bound="DataChain")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def delta_disabled(
    method: "Callable[Concatenate[T, P], T]",
) -> "Callable[Concatenate[T, P], T]":
    """Guard a DataChain method against use in delta-update mode.

    Wraps ``method`` (e.g. ``.agg()`` or ``.union()``) so that invoking it on
    a chain marked as delta raises ``NotImplementedError``; on a non-delta
    chain the call is forwarded untouched.
    """

    @wraps(method)
    def _guard(self: "T", *args: "P.args", **kwargs: "P.kwargs") -> "T":
        if not self.delta:
            return method(self, *args, **kwargs)
        raise NotImplementedError(
            f"Delta update cannot be used with {method.__name__}"
        )

    return _guard
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _append_steps(dc: "DataChain", other: "DataChain"):
    """Return a clone of ``dc`` with ``other``'s query steps appended.

    Steps are the chain-modification operations recorded on the underlying
    query (filters, mappers, etc.). The clone also adopts ``other``'s signal
    schema, so it produces the same output shape as ``other``.
    """
    merged = dc.clone()
    merged._query.steps += list(other._query.steps)
    merged.signals_schema = other.signals_schema
    return merged
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def delta_update(
    dc: "DataChain",
    name: str,
    on: Union[str, Sequence[str]],
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
    """
    Creates new chain that consists of the last version of current delta dataset
    plus diff from the source with all needed modifications.
    This way we don't need to re-calculate the whole chain from the source again(
    apply all the DataChain methods like filters, mappers, generators etc.)
    but just the diff part which is very important for performance.

    Note that currently delta update works only if there is only one direct dependency.

    Args:
        dc: chain marked as delta whose recorded steps are re-applied to the diff.
        name: name of the resulting (delta) dataset.
        on: fields that uniquely identify rows in the source.
        right_on: corresponding fields in the resulting dataset, if they were renamed.
        compare: fields used to detect modified rows; if None, `.compare()` decides.

    Returns:
        Tuple of (delta chain or None, dependency list or None, has_changes flag).
        A (None, None, True) result means "fall back to normal dataset creation";
        (None, None, False) means the source is unchanged, so no new version is needed.
    """
    catalog = dc.session.catalog
    # Materialize any pending listing step before looking up dependencies.
    dc._query.apply_listing_pre_step()

    try:
        latest_version = catalog.get_dataset(name).latest_version
    except DatasetNotFoundError:
        # first creation of delta update dataset
        return None, None, True

    dependencies = catalog.get_dataset_dependencies(
        name, latest_version, indirect=False
    )

    dep = dependencies[0]
    if not dep:
        # starting dataset (e.g listing) was removed so we are backing off to normal
        # dataset creation, as it was created first time
        return None, None, True

    source_ds_name = dep.name
    source_ds_version = dep.version
    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
    dependencies = copy(dependencies)
    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
    # The new version of the resulting dataset depends on the latest source version.
    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]

    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)

    # Rows added or modified in the source since the version this dataset was
    # built from; deleted rows are intentionally excluded (deleted=False).
    diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
    # We append all the steps from the original chain to diff, e.g filters, mappers.
    diff = _append_steps(diff, dc)

    # to avoid re-calculating diff multiple times
    diff = diff.persist()

    if diff.empty:
        return None, None, False

    # merging diff and the latest version of dataset
    delta_chain = (
        datachain.read_dataset(name, latest_version)
        .compare(
            diff,
            on=right_on or on,
            added=True,
            modified=False,
            deleted=False,
        )
        .union(diff)
    )

    return delta_chain, dependencies, True  # type: ignore[return-value]
|
|
@@ -30,7 +30,7 @@ class CompareStatus(str, Enum):
|
|
|
30
30
|
SAME = "S"
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
def _compare( # noqa: C901
|
|
33
|
+
def _compare( # noqa: C901, PLR0912
|
|
34
34
|
left: "DataChain",
|
|
35
35
|
right: "DataChain",
|
|
36
36
|
on: Union[str, Sequence[str]],
|
|
@@ -77,14 +77,16 @@ def _compare( # noqa: C901
|
|
|
77
77
|
cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
|
|
78
78
|
|
|
79
79
|
# getting correct on and right_on column names
|
|
80
|
+
on_ = on
|
|
80
81
|
on = left.signals_schema.resolve(*on).db_signals() # type: ignore[assignment]
|
|
81
|
-
right_on = right.signals_schema.resolve(*(right_on or
|
|
82
|
+
right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals() # type: ignore[assignment]
|
|
82
83
|
|
|
83
84
|
# getting correct compare and right_compare column names if they are defined
|
|
84
85
|
if compare:
|
|
86
|
+
compare_ = compare
|
|
85
87
|
compare = left.signals_schema.resolve(*compare).db_signals() # type: ignore[assignment]
|
|
86
88
|
right_compare = right.signals_schema.resolve(
|
|
87
|
-
*(right_compare or
|
|
89
|
+
*(right_compare or compare_)
|
|
88
90
|
).db_signals() # type: ignore[assignment]
|
|
89
91
|
elif not compare and len(cols) != len(right_cols):
|
|
90
92
|
# here we will mark all rows that are not added or deleted as modified since
|
|
@@ -155,7 +157,11 @@ def _compare( # noqa: C901
|
|
|
155
157
|
if status_col:
|
|
156
158
|
cols_select.append(diff_col)
|
|
157
159
|
|
|
158
|
-
|
|
160
|
+
if not dc_diff._sys:
|
|
161
|
+
# TODO workaround when sys signal is not available in diff
|
|
162
|
+
dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
|
|
163
|
+
else:
|
|
164
|
+
dc_diff = dc_diff.select(*cols_select)
|
|
159
165
|
|
|
160
166
|
# final schema is schema from the left chain with status column added if needed
|
|
161
167
|
dc_diff.signals_schema = (
|
|
@@ -25,6 +25,7 @@ from tqdm import tqdm
|
|
|
25
25
|
|
|
26
26
|
from datachain import semver
|
|
27
27
|
from datachain.dataset import DatasetRecord
|
|
28
|
+
from datachain.delta import delta_disabled, delta_update
|
|
28
29
|
from datachain.func import literal
|
|
29
30
|
from datachain.func.base import Function
|
|
30
31
|
from datachain.func.func import Func
|
|
@@ -72,6 +73,9 @@ if TYPE_CHECKING:
|
|
|
72
73
|
P = ParamSpec("P")
|
|
73
74
|
|
|
74
75
|
|
|
76
|
+
T = TypeVar("T", bound="DataChain")
|
|
77
|
+
|
|
78
|
+
|
|
75
79
|
class DataChain:
|
|
76
80
|
"""DataChain - a data structure for batch data processing and evaluation.
|
|
77
81
|
|
|
@@ -164,6 +168,7 @@ class DataChain:
|
|
|
164
168
|
self.signals_schema = signal_schema
|
|
165
169
|
self._setup: dict = setup or {}
|
|
166
170
|
self._sys = _sys
|
|
171
|
+
self._delta = False
|
|
167
172
|
|
|
168
173
|
def __repr__(self) -> str:
|
|
169
174
|
"""Return a string representation of the chain."""
|
|
@@ -177,6 +182,32 @@ class DataChain:
|
|
|
177
182
|
self.print_schema(file=file)
|
|
178
183
|
return file.getvalue()
|
|
179
184
|
|
|
185
|
+
def _as_delta(
    self,
    on: Optional[Union[str, Sequence[str]]] = None,
    right_on: Optional[Union[str, Sequence[str]]] = None,
    compare: Optional[Union[str, Sequence[str]]] = None,
) -> "Self":
    """Flag this chain for delta processing.

    A chain marked as delta triggers the optimized delta-update flow when
    the resulting dataset is saved, instead of recomputing the whole chain
    from its source.

    Raises:
        ValueError: if `on` is not provided — the delta diff cannot be
            computed without the fields that identify source rows.
    """
    if on is None:
        raise ValueError("'delta on' fields must be defined")
    self._delta = True
    self._delta_on, self._delta_result_on, self._delta_compare = (
        on,
        right_on,
        compare,
    )
    return self
|
|
200
|
+
|
|
201
|
+
@property
def empty(self) -> bool:
    """True when the chain contains no rows."""
    return self.count() == 0
|
|
205
|
+
|
|
206
|
+
@property
def delta(self) -> bool:
    """Whether this chain is run in "delta" update mode."""
    return self._delta
|
|
210
|
+
|
|
180
211
|
@property
|
|
181
212
|
def schema(self) -> dict[str, DataType]:
|
|
182
213
|
"""Get schema of the chain."""
|
|
@@ -254,9 +285,17 @@ class DataChain:
|
|
|
254
285
|
signal_schema = copy.deepcopy(self.signals_schema)
|
|
255
286
|
if _sys is None:
|
|
256
287
|
_sys = self._sys
|
|
257
|
-
|
|
288
|
+
chain = type(self)(
|
|
258
289
|
query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
|
|
259
290
|
)
|
|
291
|
+
if self.delta:
|
|
292
|
+
chain = chain._as_delta(
|
|
293
|
+
on=self._delta_on,
|
|
294
|
+
right_on=self._delta_result_on,
|
|
295
|
+
compare=self._delta_compare,
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
return chain
|
|
260
299
|
|
|
261
300
|
def settings(
|
|
262
301
|
self,
|
|
@@ -461,8 +500,9 @@ class DataChain:
|
|
|
461
500
|
version: Optional[str] = None,
|
|
462
501
|
description: Optional[str] = None,
|
|
463
502
|
attrs: Optional[list[str]] = None,
|
|
503
|
+
update_version: Optional[str] = "patch",
|
|
464
504
|
**kwargs,
|
|
465
|
-
) -> "
|
|
505
|
+
) -> "DataChain":
|
|
466
506
|
"""Save to a Dataset. It returns the chain itself.
|
|
467
507
|
|
|
468
508
|
Parameters:
|
|
@@ -472,11 +512,52 @@ class DataChain:
|
|
|
472
512
|
description : description of a dataset.
|
|
473
513
|
attrs : attributes of a dataset. They can be without value, e.g "NLP",
|
|
474
514
|
or with a value, e.g "location=US".
|
|
515
|
+
update_version: which part of the dataset version to automatically increase.
|
|
516
|
+
Available values: `major`, `minor` or `patch`. Default is `patch`.
|
|
475
517
|
"""
|
|
476
518
|
if version is not None:
|
|
477
519
|
semver.validate(version)
|
|
478
520
|
|
|
521
|
+
if update_version is not None and update_version not in [
|
|
522
|
+
"patch",
|
|
523
|
+
"major",
|
|
524
|
+
"minor",
|
|
525
|
+
]:
|
|
526
|
+
raise ValueError(
|
|
527
|
+
"update_version can have one of the following values: major, minor or"
|
|
528
|
+
" patch"
|
|
529
|
+
)
|
|
530
|
+
|
|
479
531
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
532
|
+
if self.delta and name:
|
|
533
|
+
delta_ds, dependencies, has_changes = delta_update(
|
|
534
|
+
self,
|
|
535
|
+
name,
|
|
536
|
+
on=self._delta_on,
|
|
537
|
+
right_on=self._delta_result_on,
|
|
538
|
+
compare=self._delta_compare,
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
if delta_ds:
|
|
542
|
+
return self._evolve(
|
|
543
|
+
query=delta_ds._query.save(
|
|
544
|
+
name=name,
|
|
545
|
+
version=version,
|
|
546
|
+
feature_schema=schema,
|
|
547
|
+
dependencies=dependencies,
|
|
548
|
+
**kwargs,
|
|
549
|
+
)
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
if not has_changes:
|
|
553
|
+
# sources have not been changed so new version of resulting dataset
|
|
554
|
+
# would be the same as previous one. To avoid duplicating exact
|
|
555
|
+
# datasets, we won't create new version of it and we will return
|
|
556
|
+
# current latest version instead.
|
|
557
|
+
from .datasets import read_dataset
|
|
558
|
+
|
|
559
|
+
return read_dataset(name, **kwargs)
|
|
560
|
+
|
|
480
561
|
return self._evolve(
|
|
481
562
|
query=self._query.save(
|
|
482
563
|
name=name,
|
|
@@ -484,6 +565,7 @@ class DataChain:
|
|
|
484
565
|
description=description,
|
|
485
566
|
attrs=attrs,
|
|
486
567
|
feature_schema=schema,
|
|
568
|
+
update_version=update_version,
|
|
487
569
|
**kwargs,
|
|
488
570
|
)
|
|
489
571
|
)
|
|
@@ -601,6 +683,7 @@ class DataChain:
|
|
|
601
683
|
signal_schema=udf_obj.output,
|
|
602
684
|
)
|
|
603
685
|
|
|
686
|
+
@delta_disabled
|
|
604
687
|
def agg(
|
|
605
688
|
self,
|
|
606
689
|
func: Optional[Callable] = None,
|
|
@@ -754,6 +837,7 @@ class DataChain:
|
|
|
754
837
|
|
|
755
838
|
return self._evolve(query=self._query.order_by(*args))
|
|
756
839
|
|
|
840
|
+
@delta_disabled
|
|
757
841
|
def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override]
|
|
758
842
|
"""Removes duplicate rows based on uniqueness of some input column(s)
|
|
759
843
|
i.e if rows are found with the same value of input column(s), only one
|
|
@@ -788,6 +872,7 @@ class DataChain:
|
|
|
788
872
|
query=self._query.select(*columns), signal_schema=new_schema
|
|
789
873
|
)
|
|
790
874
|
|
|
875
|
+
@delta_disabled # type: ignore[arg-type]
|
|
791
876
|
def group_by(
|
|
792
877
|
self,
|
|
793
878
|
*,
|
|
@@ -1146,6 +1231,7 @@ class DataChain:
|
|
|
1146
1231
|
schema = self.signals_schema.clone_without_file_signals()
|
|
1147
1232
|
return self.select(*schema.values.keys())
|
|
1148
1233
|
|
|
1234
|
+
@delta_disabled
|
|
1149
1235
|
def merge(
|
|
1150
1236
|
self,
|
|
1151
1237
|
right_ds: "DataChain",
|
|
@@ -1254,6 +1340,7 @@ class DataChain:
|
|
|
1254
1340
|
|
|
1255
1341
|
return ds
|
|
1256
1342
|
|
|
1343
|
+
@delta_disabled
|
|
1257
1344
|
def union(self, other: "Self") -> "Self":
|
|
1258
1345
|
"""Return the set union of the two datasets.
|
|
1259
1346
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
1
2
|
from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
|
|
2
3
|
|
|
3
4
|
from datachain.error import DatasetVersionNotFoundError
|
|
@@ -27,6 +28,10 @@ def read_dataset(
|
|
|
27
28
|
session: Optional[Session] = None,
|
|
28
29
|
settings: Optional[dict] = None,
|
|
29
30
|
fallback_to_studio: bool = True,
|
|
31
|
+
delta: Optional[bool] = False,
|
|
32
|
+
delta_on: Optional[Union[str, Sequence[str]]] = None,
|
|
33
|
+
delta_result_on: Optional[Union[str, Sequence[str]]] = None,
|
|
34
|
+
delta_compare: Optional[Union[str, Sequence[str]]] = None,
|
|
30
35
|
) -> "DataChain":
|
|
31
36
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
32
37
|
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
@@ -38,6 +43,36 @@ def read_dataset(
|
|
|
38
43
|
settings : Settings to use for the chain.
|
|
39
44
|
fallback_to_studio : Try to pull dataset from Studio if not found locally.
|
|
40
45
|
Default is True.
|
|
46
|
+
delta: If set to True, we optimize the creation of new dataset versions by
|
|
47
|
+
calculating the diff between the latest version of this storage and the
|
|
48
|
+
version used to create the most recent version of the resulting chain
|
|
49
|
+
dataset (the one specified in `.save()`). We then run the "diff" chain
|
|
50
|
+
using only the diff data, rather than the entire storage data, and merge
|
|
51
|
+
that diff chain with the latest version of the resulting dataset to create
|
|
52
|
+
a new version. This approach avoids applying modifications to all records
|
|
53
|
+
from storage every time, which can be an expensive operation.
|
|
54
|
+
The diff is calculated using the `DataChain.compare()` method, which
|
|
55
|
+
compares the `delta_on` fields to find matches and checks the compare
|
|
56
|
+
fields to determine if a record has changed. Note that this process only
|
|
57
|
+
considers added and modified records in storage; deleted records are not
|
|
58
|
+
removed from the new dataset version.
|
|
59
|
+
This calculation is based on the difference between the current version
|
|
60
|
+
of the source and the version used to create the dataset.
|
|
61
|
+
delta_on: A list of fields that uniquely identify rows in the source.
|
|
62
|
+
If two rows have the same values, they are considered the same (e.g., they
|
|
63
|
+
could be different versions of the same row in a versioned source).
|
|
64
|
+
This is used in the delta update to calculate the diff.
|
|
65
|
+
delta_result_on: A list of fields in the resulting dataset that correspond
|
|
66
|
+
to the `delta_on` fields from the source.
|
|
67
|
+
This is needed to identify rows that have changed in the source but are
|
|
68
|
+
already present in the current version of the resulting dataset, in order
|
|
69
|
+
to avoid including outdated versions of those rows in the new dataset.
|
|
70
|
+
We retain only the latest versions of rows to prevent duplication.
|
|
71
|
+
There is no need to define this if the `delta_on` fields are present in
|
|
72
|
+
the final dataset and have not been renamed.
|
|
73
|
+
delta_compare: A list of fields used to check if the same row has been modified
|
|
74
|
+
in the new version of the source.
|
|
75
|
+
If not defined, all fields except those defined in delta_on will be used.
|
|
41
76
|
|
|
42
77
|
Example:
|
|
43
78
|
```py
|
|
@@ -113,7 +148,12 @@ def read_dataset(
|
|
|
113
148
|
signals_schema |= SignalSchema.deserialize(query.feature_schema)
|
|
114
149
|
else:
|
|
115
150
|
signals_schema |= SignalSchema.from_column_types(query.column_types or {})
|
|
116
|
-
|
|
151
|
+
chain = DataChain(query, _settings, signals_schema)
|
|
152
|
+
if delta:
|
|
153
|
+
chain = chain._as_delta(
|
|
154
|
+
on=delta_on, right_on=delta_result_on, compare=delta_compare
|
|
155
|
+
)
|
|
156
|
+
return chain
|
|
117
157
|
|
|
118
158
|
|
|
119
159
|
def datasets(
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import os.path
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from functools import reduce
|
|
2
4
|
from typing import (
|
|
3
5
|
TYPE_CHECKING,
|
|
4
6
|
Optional,
|
|
5
7
|
Union,
|
|
6
8
|
)
|
|
7
9
|
|
|
8
|
-
from datachain.error import DatasetNotFoundError
|
|
9
10
|
from datachain.lib.file import (
|
|
10
11
|
FileType,
|
|
11
12
|
get_file_type,
|
|
@@ -33,6 +34,10 @@ def read_storage(
|
|
|
33
34
|
column: str = "file",
|
|
34
35
|
update: bool = False,
|
|
35
36
|
anon: bool = False,
|
|
37
|
+
delta: Optional[bool] = False,
|
|
38
|
+
delta_on: Optional[Union[str, Sequence[str]]] = None,
|
|
39
|
+
delta_result_on: Optional[Union[str, Sequence[str]]] = None,
|
|
40
|
+
delta_compare: Optional[Union[str, Sequence[str]]] = None,
|
|
36
41
|
client_config: Optional[dict] = None,
|
|
37
42
|
) -> "DataChain":
|
|
38
43
|
"""Get data from storage(s) as a list of file with all file attributes.
|
|
@@ -48,6 +53,36 @@ def read_storage(
|
|
|
48
53
|
update : force storage reindexing. Default is False.
|
|
49
54
|
anon : If True, we will treat cloud bucket as public one
|
|
50
55
|
client_config : Optional client configuration for the storage client.
|
|
56
|
+
delta: If set to True, we optimize the creation of new dataset versions by
|
|
57
|
+
calculating the diff between the latest version of this storage and the
|
|
58
|
+
version used to create the most recent version of the resulting chain
|
|
59
|
+
dataset (the one specified in `.save()`). We then run the "diff" chain
|
|
60
|
+
using only the diff data, rather than the entire storage data, and merge
|
|
61
|
+
that diff chain with the latest version of the resulting dataset to create
|
|
62
|
+
a new version. This approach avoids applying modifications to all records
|
|
63
|
+
from storage every time, which can be an expensive operation.
|
|
64
|
+
The diff is calculated using the `DataChain.compare()` method, which
|
|
65
|
+
compares the `delta_on` fields to find matches and checks the compare
|
|
66
|
+
fields to determine if a record has changed. Note that this process only
|
|
67
|
+
considers added and modified records in storage; deleted records are not
|
|
68
|
+
removed from the new dataset version.
|
|
69
|
+
This calculation is based on the difference between the current version
|
|
70
|
+
of the source and the version used to create the dataset.
|
|
71
|
+
delta_on: A list of fields that uniquely identify rows in the source.
|
|
72
|
+
If two rows have the same values, they are considered the same (e.g., they
|
|
73
|
+
could be different versions of the same row in a versioned source).
|
|
74
|
+
This is used in the delta update to calculate the diff.
|
|
75
|
+
delta_result_on: A list of fields in the resulting dataset that correspond
|
|
76
|
+
to the `delta_on` fields from the source.
|
|
77
|
+
This is needed to identify rows that have changed in the source but are
|
|
78
|
+
already present in the current version of the resulting dataset, in order
|
|
79
|
+
to avoid including outdated versions of those rows in the new dataset.
|
|
80
|
+
We retain only the latest versions of rows to prevent duplication.
|
|
81
|
+
There is no need to define this if the `delta_on` fields are present in
|
|
82
|
+
the final dataset and have not been renamed.
|
|
83
|
+
delta_compare: A list of fields used to check if the same row has been modified
|
|
84
|
+
in the new version of the source.
|
|
85
|
+
If not defined, all fields except those defined in `delta_on` will be used.
|
|
51
86
|
|
|
52
87
|
Returns:
|
|
53
88
|
DataChain: A DataChain object containing the file information.
|
|
@@ -107,7 +142,7 @@ def read_storage(
|
|
|
107
142
|
if not uris:
|
|
108
143
|
raise ValueError("No URIs provided")
|
|
109
144
|
|
|
110
|
-
|
|
145
|
+
chains = []
|
|
111
146
|
listed_ds_name = set()
|
|
112
147
|
file_values = []
|
|
113
148
|
|
|
@@ -132,11 +167,6 @@ def read_storage(
|
|
|
132
167
|
|
|
133
168
|
def lst_fn(ds_name, lst_uri):
|
|
134
169
|
# disable prefetch for listing, as it pre-downloads all files
|
|
135
|
-
try:
|
|
136
|
-
version = catalog.get_dataset(ds_name).next_version_major
|
|
137
|
-
except DatasetNotFoundError:
|
|
138
|
-
version = None
|
|
139
|
-
|
|
140
170
|
(
|
|
141
171
|
read_records(
|
|
142
172
|
DataChain.DEFAULT_FILE_RECORD,
|
|
@@ -150,18 +180,18 @@ def read_storage(
|
|
|
150
180
|
output={f"{column}": file_type},
|
|
151
181
|
)
|
|
152
182
|
# for internal listing datasets, we always bump major version
|
|
153
|
-
.save(ds_name, listing=True,
|
|
183
|
+
.save(ds_name, listing=True, update_version="major")
|
|
154
184
|
)
|
|
155
185
|
|
|
156
186
|
dc._query.set_listing_fn(
|
|
157
187
|
lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
|
|
158
188
|
)
|
|
159
189
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
storage_chain = storage_chain.union(chain) if storage_chain else chain
|
|
190
|
+
chains.append(ls(dc, list_path, recursive=recursive, column=column))
|
|
163
191
|
listed_ds_name.add(list_ds_name)
|
|
164
192
|
|
|
193
|
+
storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
|
|
194
|
+
|
|
165
195
|
if file_values:
|
|
166
196
|
file_chain = read_values(
|
|
167
197
|
session=session,
|
|
@@ -176,4 +206,8 @@ def read_storage(
|
|
|
176
206
|
|
|
177
207
|
assert storage_chain is not None
|
|
178
208
|
|
|
209
|
+
if delta:
|
|
210
|
+
storage_chain = storage_chain._as_delta(
|
|
211
|
+
on=delta_on, right_on=delta_result_on, compare=delta_compare
|
|
212
|
+
)
|
|
179
213
|
return storage_chain
|