datachain 0.30.5.tar.gz → 0.30.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- {datachain-0.30.5 → datachain-0.30.6}/PKG-INFO +1 -1
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/delta.md +20 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/json-csv-reader.py +8 -6
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/datasets.py +32 -17
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/delta.py +36 -20
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/datachain.py +8 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/datasets.py +4 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/storage.py +5 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/conftest.py +3 -5
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_delta.py +88 -33
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_retry.py +40 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain.py +7 -0
- datachain-0.30.6/tests/unit/test_cli_datasets.py +64 -0
- {datachain-0.30.5 → datachain-0.30.6}/.cruft.json +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.gitattributes +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/codecov.yaml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/dependabot.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/release.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/tests.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.gitignore +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/.pre-commit-config.yaml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/LICENSE +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/README.rst +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/assets/datachain.svg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/login.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/logout.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/team.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/auth/token.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/index.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/cancel.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/clusters.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/logs.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/ls.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/commands/job/run.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/contributing.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/examples.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/db_migrations.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/env.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/index.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/namespaces.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/processing.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/remotes.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/guide/retry.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/index.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/overrides/main.html +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/quick-start.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/file.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/index.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/pose.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/segment.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/datachain.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/func.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/array.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/conditional.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/numeric.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/path.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/random.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/string.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/functions/window.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/index.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/toolkit.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/torch.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/references/udf.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/docs/tutorials.md +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/nested_datamodel.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/wds.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/mkdocs.yml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/noxfile.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/pyproject.toml +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/setup.cfg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/__main__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/asyn.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cache.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/cli/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/azure.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/gcs.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/hf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/local.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/client/s3.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/config.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/dataset.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/error.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/reference.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/fs/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/array.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/base.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/conditional.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/func.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/numeric.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/path.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/random.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/string.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/func/window.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/job.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/audio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/clip.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/file.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/hf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/image.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/listing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/projects.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/settings.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/tar.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/text.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/udf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/video.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/listing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/bbox.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/pose.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/segment.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/model/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/namespace.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/node.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/progress.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/project.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/py.typed +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/batch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/dataset.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/metrics.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/params.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/queue.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/schema.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/session.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/udf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/query/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/remote/studio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/script_meta.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/semver.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/types.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/sql/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/studio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/telemetry.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain/utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/data.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/examples/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/examples/test_examples.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/examples/wds_data.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/data/lena.jpg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_array.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_path.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_random.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/functions/test_string.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_audio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_batching.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_catalog.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_client.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_data_storage.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datachain.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_datasets.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_file.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_hf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_image.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_listing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_ls.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_metastore.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_metrics.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_mutate.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_pull.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_pytorch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_query.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_database.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_session.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_to_database.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_toolkit.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_video.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/func/test_warehouse.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_atomicity.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_cli_e2e.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_cli_studio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_import_time.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_query_e2e.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/test_telemetry.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_asyn.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_cache.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_catalog.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_config.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_dataset.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_func.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_listing.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_metastore.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_query_params.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_semver.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_serializer.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_session.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_utils.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.30.5 → datachain-0.30.6}/tests/utils.py +0 -0
--- datachain-0.30.5/docs/guide/delta.md
+++ datachain-0.30.6/docs/guide/delta.md
@@ -80,3 +80,23 @@ Delta processing can be combined with [retry processing](./retry.md) to create a

 1. Processes only new or changed records (delta)
 2. Reprocesses records with errors or that are missing (retry)
+
+## Using Delta with Restricted Methods
+
+By default, delta updates cannot be combined with the following methods:
+
+1. `merge`
+2. `union`
+3. `distinct`
+4. `agg`
+5. `group_by`
+
+These methods are restricted because they may produce **unexpected results** when used with delta processing. Delta runs the chain only on a subset of rows (new and changed records), while methods like `distinct`, `agg`, or `group_by` are designed to operate on the entire dataset.
+
+Similarly, combining delta with methods like `merge` or `union` may result in duplicated rows when merging with a static dataset.
+
+If you still need to use these methods together with delta, you can override this restriction by setting the additional flag:
+
+```python
+delta_unsafe=True
+```
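For context, a minimal sketch of how the flag documented above might be combined with a normally restricted method. The dataset names (`starting_ds`, `static_ds`, `delta_ds`) are placeholders; the parameters mirror the ones added in this release:

```python
import datachain as dc

# Placeholder dataset used only for illustration.
static_ds = dc.read_dataset("static_ds")

(
    dc.read_dataset(
        "starting_ds",
        delta=True,          # process only new/changed records
        delta_on="id",       # key used to detect new/changed records
        delta_unsafe=True,   # allow restricted methods such as merge
    )
    .merge(static_ds, on="id", inner=True)  # normally disallowed in delta mode
    .save("delta_ds")
)
```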
--- datachain-0.30.5/examples/get_started/json-csv-reader.py
+++ datachain-0.30.6/examples/get_started/json-csv-reader.py
@@ -1,3 +1,4 @@
+import os
 from typing import Optional

 import datachain as dc
@@ -39,7 +40,7 @@ def main():
     uri = "gs://datachain-demo/coco2017/annotations_captions/"

     # Print JSON schema in Pydantic format from main COCO annotation
-    chain = dc.read_storage(uri, anon=
+    chain = dc.read_storage(uri, anon=True).filter(dc.C("file.path").glob("*.json"))
     file = chain.limit(1).to_values("file")[0]
     print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))

@@ -65,11 +66,12 @@ def main():
     dynamic_csv_ds.print_schema()
     dynamic_csv_ds.show()

-    print(
-        "Note: script might hang at the end due to https://github.com/apache/arrow/issues/43497"
-    )
-    print("Just press Ctrl+C to exit.")
-

 if __name__ == "__main__":
     main()
+
+    # Force exit without cleanup to avoid hanging due to arrow issue
+    print(
+        "Note: script might warn about leaked semaphore at the end due to https://github.com/apache/arrow/issues/43497"
+    )
+    os._exit(0)
--- datachain-0.30.5/src/datachain/cli/commands/datasets.py
+++ datachain-0.30.6/src/datachain/cli/commands/datasets.py
@@ -1,30 +1,41 @@
 import sys
-from
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Optional, Union

 from tabulate import tabulate

-
-from datachain.catalog import Catalog
-
+from datachain import semver
 from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio

+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+def group_dataset_versions(
+    datasets: Iterable[tuple[str, str]], latest_only=True
+) -> dict[str, Union[str, list[str]]]:
+    grouped: dict[str, list[tuple[int, int, int]]] = {}

-def group_dataset_versions(datasets, latest_only=True):
-    grouped = {}
     # Sort to ensure groupby works as expected
     # (groupby expects consecutive items with the same key)
     for name, version in sorted(datasets):
-        grouped.setdefault(name, []).append(version)
+        grouped.setdefault(name, []).append(semver.parse(version))

     if latest_only:
         # For each dataset name, pick the highest version.
-        return {
+        return {
+            name: semver.create(*(max(versions))) for name, versions in grouped.items()
+        }
+
     # For each dataset name, return a sorted list of unique versions.
-    return {
+    return {
+        name: [semver.create(*v) for v in sorted(set(versions))]
+        for name, versions in grouped.items()
+    }


 def list_datasets(
@@ -35,7 +46,7 @@ def list_datasets(
     team: Optional[str] = None,
     latest_only: bool = True,
     name: Optional[str] = None,
-):
+) -> None:
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
     if name:
@@ -95,27 +106,31 @@ def list_datasets(
     print(tabulate(rows, headers="keys"))


-def list_datasets_local(
+def list_datasets_local(
+    catalog: "Catalog", name: Optional[str] = None
+) -> Iterator[tuple[str, str]]:
     if name:
         yield from list_datasets_local_versions(catalog, name)
         return

     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield
+            yield d.full_name, v.version


-def list_datasets_local_versions(
+def list_datasets_local_versions(
+    catalog: "Catalog", name: str
+) -> Iterator[tuple[str, str]]:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     ds = catalog.get_dataset(
         name, namespace_name=namespace_name, project_name=project_name
     )
     for v in ds.versions:
-        yield
+        yield name, v.version


-def _datasets_tabulate_row(name, both, local_version, studio_version):
+def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
     row = {
         "Name": name,
     }
@@ -136,7 +151,7 @@ def rm_dataset(
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     team: Optional[str] = None,
-):
+) -> None:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if studio:
@@ -166,7 +181,7 @@ def edit_dataset(
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
-):
+) -> None:
     from datachain.lib.dc.utils import is_studio

     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
--- datachain-0.30.5/src/datachain/delta.py
+++ datachain-0.30.6/src/datachain/delta.py
@@ -4,7 +4,7 @@ from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union

 import datachain
-from datachain.dataset import DatasetDependency
+from datachain.dataset import DatasetDependency, DatasetRecord
 from datachain.error import DatasetNotFoundError
 from datachain.project import Project

@@ -30,9 +30,10 @@ def delta_disabled(

     @wraps(method)
     def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
-        if self.delta:
+        if self.delta and not self._delta_unsafe:
             raise NotImplementedError(
-                f"
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
             )
         return method(self, *args, **kwargs)

@@ -124,10 +125,19 @@ def _get_retry_chain(
     # Subtract also diff chain since some items might be picked
     # up by `delta=True` itself (e.g. records got modified AND are missing in the
     # result dataset atm)
-
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )


 def _get_source_info(
+    source_ds: DatasetRecord,
     name: str,
     namespace_name: str,
     project_name: str,
@@ -154,25 +164,23 @@ def _get_source_info(
         indirect=False,
     )

-
-    if not
+    source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
+    if not source_ds_dep:
         # Starting dataset was removed, back off to normal dataset creation
         return None, None, None, None, None

-
-
-
-
-
-
-        project_name=source_ds_project.name,
-    ).latest_version
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )

     return (
-
-
-
-
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
         dependencies,
     )

@@ -244,7 +252,14 @@ def delta_retry_update(
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )

     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -267,8 +282,9 @@ def delta_retry_update(
     if dependencies:
         dependencies = copy(dependencies)
         dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
         # Update to latest version
-
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]

     # Handle retry functionality if enabled
     if delta_retry:
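A sketch of the guard's behavior from the caller's side, based on the new error message above (the dataset name is a placeholder and the chain is assumed to be in delta mode without `delta_unsafe`):

```python
import datachain as dc

chain = dc.read_dataset("starting_ds", delta=True, delta_on="id")

try:
    chain.distinct("id")  # restricted while delta is enabled
except NotImplementedError as exc:
    # Expected: "Cannot use distinct with delta datasets - may cause
    # inconsistency. Use delta_unsafe flag to allow this operation."
    print(exc)
```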
--- datachain-0.30.5/src/datachain/lib/dc/datachain.py
+++ datachain-0.30.6/src/datachain/lib/dc/datachain.py
@@ -193,6 +193,7 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_unsafe = False
         self._delta_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_compare: Optional[Union[str, Sequence[str]]] = None
@@ -216,6 +217,7 @@ class DataChain:
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
         delta_retry: Optional[Union[bool, str]] = None,
+        delta_unsafe: bool = False,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -226,6 +228,7 @@ class DataChain:
         self._delta_result_on = right_on
         self._delta_compare = compare
         self._delta_retry = delta_retry
+        self._delta_unsafe = delta_unsafe
         return self

     @property
@@ -238,6 +241,10 @@ class DataChain:
         """Returns True if this chain is ran in "delta" update mode"""
         return self._delta

+    @property
+    def delta_unsafe(self) -> bool:
+        return self._delta_unsafe
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""
@@ -328,6 +335,7 @@ class DataChain:
             right_on=self._delta_result_on,
             compare=self._delta_compare,
             delta_retry=self._delta_retry,
+            delta_unsafe=self._delta_unsafe,
         )

         return chain
--- datachain-0.30.5/src/datachain/lib/dc/datasets.py
+++ datachain-0.30.6/src/datachain/lib/dc/datasets.py
@@ -40,6 +40,7 @@ def read_dataset(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
@@ -80,6 +81,8 @@ def read_dataset(
         update: If True always checks for newer versions available on Studio, even if
             some version of the dataset exists locally already. If False (default), it
             will only fetch the dataset from Studio if it is not found locally.
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct.


     Example:
@@ -205,6 +208,7 @@ def read_dataset(
         right_on=delta_result_on,
         compare=delta_compare,
         delta_retry=delta_retry,
+        delta_unsafe=delta_unsafe,
     )

     return chain
--- datachain-0.30.5/src/datachain/lib/dc/storage.py
+++ datachain-0.30.6/src/datachain/lib/dc/storage.py
@@ -43,6 +43,7 @@ def read_storage(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -77,6 +78,9 @@ def read_storage(
               (error mode)
             - True: Reprocess records missing from the result dataset (missing mode)
             - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not partially
+            updated.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -218,6 +222,7 @@ def read_storage(
         right_on=delta_result_on,
         compare=delta_compare,
         delta_retry=delta_retry,
+        delta_unsafe=delta_unsafe,
     )

     return storage_chain
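To show how the new `read_storage` parameter might fit alongside the existing delta options, a minimal sketch (the bucket URI and output dataset name are placeholders):

```python
import datachain as dc
from datachain import func

(
    dc.read_storage(
        "s3://my-bucket/images/",   # placeholder URI
        delta=True,                 # incremental listing/processing
        delta_on="file.path",       # key for detecting new or changed files
        delta_unsafe=True,          # group_by below is otherwise restricted
    )
    .group_by(cnt=func.count(), partition_by="file.path")
    .save("images_per_path")
)
```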
--- datachain-0.30.5/src/datachain.egg-info/SOURCES.txt
+++ datachain-0.30.6/src/datachain.egg-info/SOURCES.txt
@@ -348,6 +348,7 @@ tests/unit/test_asyn.py
 tests/unit/test_cache.py
 tests/unit/test_catalog.py
 tests/unit/test_catalog_loader.py
+tests/unit/test_cli_datasets.py
 tests/unit/test_cli_parsing.py
 tests/unit/test_client.py
 tests/unit/test_client_gcs.py
--- datachain-0.30.5/tests/conftest.py
+++ datachain-0.30.6/tests/conftest.py
@@ -547,11 +547,9 @@ def is_studio():

 @pytest.fixture(autouse=True)
 def mock_is_studio(monkeypatch, is_studio):
-    if
-
-
-    monkeypatch.setenv("DATACHAIN_IS_STUDIO", True)
-    yield
+    if is_studio:
+        monkeypatch.setenv("DATACHAIN_IS_STUDIO", "True")
+    yield


 @pytest.fixture
--- datachain-0.30.5/tests/func/test_delta.py
+++ datachain-0.30.6/tests/func/test_delta.py
@@ -14,26 +14,16 @@ from datachain.lib.file import File, ImageFile
 def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]:
     return sorted(
         [
-            (
+            (d.name, d.version)
             for d in catalog.get_dataset_dependencies(name, version, indirect=False)
         ]
     )


-
-def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
+def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
     catalog = test_session.catalog
-
-
-
-    if project:
-        starting_ds_name = f"{project}.starting_ds"
-        dependency_ds_name = starting_ds_name
-    else:
-        starting_ds_name = "starting_ds"
-        dependency_ds_name = (
-            f"{default_namespace_name}.{default_project_name}.{starting_ds_name}"
-        )
+
+    starting_ds_name = "starting_ds"
     ds_name = "delta_ds"

     images = [
@@ -66,16 +56,12 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
     create_image_dataset(starting_ds_name, images[:2])
     # first version of delta dataset
     create_delta_dataset(ds_name)
-    assert _get_dependencies(catalog, ds_name, "1.0.0") == [
-        (dependency_ds_name, "1.0.0")
-    ]
+    assert _get_dependencies(catalog, ds_name, "1.0.0") == [(starting_ds_name, "1.0.0")]
     # second version of starting dataset
     create_image_dataset(starting_ds_name, images[2:])
     # second version of delta dataset
     create_delta_dataset(ds_name)
-    assert _get_dependencies(catalog, ds_name, "1.0.1") == [
-        (dependency_ds_name, "1.0.1")
-    ]
+    assert _get_dependencies(catalog, ds_name, "1.0.1") == [(starting_ds_name, "1.0.1")]

     assert (dc.read_dataset(ds_name, version="1.0.0").order_by("file.path")).to_values(
         "file.path"
@@ -96,6 +82,66 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
     create_delta_dataset(ds_name)


+def test_delta_update_unsafe(test_session):
+    catalog = test_session.catalog
+
+    starting_ds_name = "starting_ds"
+    merge_ds_name = "merge_ds"
+    ds_name = "delta_ds"
+
+    # create dataset which will be merged to delta one
+    merge_ds = dc.read_values(
+        id=[1, 2, 3, 4, 5, 6], value=[1, 2, 3, 4, 5, 6], session=test_session
+    ).save(merge_ds_name)
+
+    # first version of starting dataset
+    dc.read_values(id=[1, 2, 3], session=test_session).save(starting_ds_name)
+    # first version of delta dataset
+    dc.read_dataset(
+        starting_ds_name,
+        session=test_session,
+        delta_on="id",
+        delta=True,
+        delta_unsafe=True,
+    ).merge(merge_ds, on="id", inner=True).save(ds_name)
+
+    assert set(_get_dependencies(catalog, ds_name, "1.0.0")) == {
+        (starting_ds_name, "1.0.0"),
+        (merge_ds_name, "1.0.0"),
+    }
+
+    # second version of starting dataset
+    dc.read_values(id=[1, 2, 3, 4, 5, 6], session=test_session).save(starting_ds_name)
+    # second version of delta dataset
+    dc.read_dataset(
+        starting_ds_name,
+        session=test_session,
+        delta_on="id",
+        delta=True,
+        delta_unsafe=True,
+    ).merge(merge_ds, on="id", inner=True).save(ds_name)
+
+    assert set(_get_dependencies(catalog, ds_name, "1.0.1")) == {
+        (starting_ds_name, "1.0.1"),
+        (merge_ds_name, "1.0.0"),
+    }
+
+    assert set((dc.read_dataset(ds_name, version="1.0.0")).to_list("id", "value")) == {
+        (1, 1),
+        (2, 2),
+        (3, 3),
+    }
+
+    assert set((dc.read_dataset(ds_name, version="1.0.1")).to_list("id", "value")) == {
+        (1, 1),
+        (2, 2),
+        (3, 3),
+        (4, 4),
+        (5, 5),
+        (6, 6),
+    }
+
+
 def test_delta_update_from_storage(test_session, tmp_dir, tmp_path):
     ds_name = "delta_ds"
     path = tmp_dir.as_uri()
@@ -249,8 +295,6 @@ def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys):

 def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
     catalog = test_session.catalog
-    default_namespace_name = catalog.metastore.default_namespace_name
-    default_project_name = catalog.metastore.default_project_name
     ds_name = "delta_ds"
     path = tmp_dir.as_uri()
     tmp_dir = tmp_dir / "images"
@@ -301,7 +345,8 @@ def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):

     assert str(exc_info.value) == (
         f"Dataset {ds_name} version 1.0.1 not found in namespace "
-        f"{default_namespace_name}
+        f"{catalog.metastore.default_namespace_name}"
+        f" and project {catalog.metastore.default_project_name}"
     )


@@ -325,11 +370,13 @@ def test_delta_update_union(test_session, file_dataset):
             file_dataset.name,
             session=test_session,
             delta=True,
-            delta_on=["file.source", "file.path"],
         ).union(dc.read_dataset("numbers"), session=test_session)
     )

-    assert str(excinfo.value) ==
+    assert str(excinfo.value) == (
+        "Cannot use union with delta datasets - may cause inconsistency."
+        " Use delta_unsafe flag to allow this operation."
+    )


 def test_delta_update_merge(test_session, file_dataset):
@@ -341,11 +388,13 @@ def test_delta_update_merge(test_session, file_dataset):
             file_dataset.name,
             session=test_session,
             delta=True,
-            delta_on=["file.source", "file.path"],
         ).merge(dc.read_dataset("numbers"), on="id", session=test_session)
     )

-    assert str(excinfo.value) ==
+    assert str(excinfo.value) == (
+        "Cannot use merge with delta datasets - may cause inconsistency."
+        " Use delta_unsafe flag to allow this operation."
+    )


 def test_delta_update_distinct(test_session, file_dataset):
@@ -355,11 +404,13 @@ def test_delta_update_distinct(test_session, file_dataset):
             file_dataset.name,
             session=test_session,
             delta=True,
-            delta_on=["file.source", "file.path"],
         ).distinct("file.path")
     )

-    assert str(excinfo.value) ==
+    assert str(excinfo.value) == (
+        "Cannot use distinct with delta datasets - may cause inconsistency."
+        " Use delta_unsafe flag to allow this operation."
+    )


 def test_delta_update_group_by(test_session, file_dataset):
@@ -369,11 +420,13 @@ def test_delta_update_group_by(test_session, file_dataset):
             file_dataset.name,
             session=test_session,
             delta=True,
-            delta_on=["file.source", "file.path"],
         ).group_by(cnt=func.count(), partition_by="file.path")
     )

-    assert str(excinfo.value) ==
+    assert str(excinfo.value) == (
+        "Cannot use group_by with delta datasets - may cause inconsistency."
+        " Use delta_unsafe flag to allow this operation."
+    )


 def test_delta_update_agg(test_session, file_dataset):
@@ -383,8 +436,10 @@ def test_delta_update_agg(test_session, file_dataset):
             file_dataset.name,
             session=test_session,
             delta=True,
-            delta_on=["file.source", "file.path"],
         ).agg(cnt=func.count(), partition_by="file.path")
     )

-    assert str(excinfo.value) ==
+    assert str(excinfo.value) == (
+        "Cannot use agg with delta datasets - may cause inconsistency."
+        " Use delta_unsafe flag to allow this operation."
+    )
--- datachain-0.30.5/tests/func/test_retry.py
+++ datachain-0.30.6/tests/func/test_retry.py
@@ -1,3 +1,4 @@
+from collections.abc import Iterator
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING

@@ -425,3 +426,42 @@ def test_delta_and_delta_retry_no_duplicates(test_session):
     assert len(ids_in_result) == 4
     assert len(set(ids_in_result)) == 4  # No duplicate IDs
     assert set(ids_in_result) == {1, 2, 3, 4}
+
+
+def test_repeating_errors(test_session):
+    def run_delta():
+        def func(id) -> Iterator[tuple[int, str, str]]:
+            yield id, "name1", "error"
+            yield id, "name2", "error"
+
+        return (
+            dc.read_dataset(
+                "sample_data",
+                delta=True,
+                delta_on="id",
+                delta_result_on="id",
+                delta_retry="error",
+                session=test_session,
+            )
+            .gen(func, output={"id": int, "name": str, "error": str})
+            .save("processed_data")
+        )
+        return dc.read_dataset("processed_data")
+
+    _create_sample_data(
+        test_session, ids=list(range(1)), contents=[str(i) for i in range(1)]
+    )
+    ch1 = run_delta()
+    assert sorted(ch1.collect("id")) == [0, 0]
+
+    _create_sample_data(
+        test_session, ids=list(range(2)), contents=[str(i) for i in range(2)]
+    )
+    ch2 = run_delta()
+    assert sorted(ch2.collect("id")) == [0, 0, 1, 1]
+
+    _create_sample_data(
+        test_session, ids=list(range(3)), contents=[str(i) for i in range(3)]
+    )
+    ch3 = run_delta()
+    assert sorted(ch3.collect("id")) == [0, 0, 1, 1, 2, 2]