datachain 0.24.0__tar.gz → 0.24.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.24.0 → datachain-0.24.1}/PKG-INFO +1 -1
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/delta.py +82 -25
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/datachain.py +2 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_delta.py +20 -5
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_retry.py +164 -50
- {datachain-0.24.0 → datachain-0.24.1}/.cruft.json +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.gitattributes +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/codecov.yaml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/dependabot.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/release.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/.pre-commit-config.yaml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/LICENSE +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/README.rst +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/login.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/logout.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/team.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/auth/token.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/cancel.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/clusters.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/logs.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/ls.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/commands/job/run.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/contributing.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/examples.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/db_migrations.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/delta.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/env.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/namespaces.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/processing.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/remotes.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/guide/retry.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/overrides/main.html +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/quick-start.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/datachain.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/func.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/toolkit.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/torch.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/references/udf.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/docs/tutorials.md +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/mkdocs.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/noxfile.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/pyproject.toml +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/setup.cfg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/__main__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/asyn.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cache.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/local.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/config.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/dataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/error.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/func.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/func/window.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/projects.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/namespace.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/node.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/progress.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/project.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/py.typed +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/dataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/params.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/query/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/semver.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/data.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/examples/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/functions/test_string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_batching.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_client.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datachain.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_pull.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_database.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_video.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_atomicity.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_import_time.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/test_telemetry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_config.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_func.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_semver.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.24.0 → datachain-0.24.1}/tests/utils.py +0 -0
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
|
|
6
6
|
import datachain
|
|
7
7
|
from datachain.dataset import DatasetDependency
|
|
8
8
|
from datachain.error import DatasetNotFoundError
|
|
9
|
+
from datachain.project import Project
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from typing_extensions import Concatenate, ParamSpec
|
|
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
|
|
|
50
51
|
|
|
51
52
|
def _get_delta_chain(
|
|
52
53
|
source_ds_name: str,
|
|
54
|
+
source_ds_project: Project,
|
|
53
55
|
source_ds_version: str,
|
|
54
56
|
source_ds_latest_version: str,
|
|
55
57
|
on: Union[str, Sequence[str]],
|
|
56
58
|
compare: Optional[Union[str, Sequence[str]]] = None,
|
|
57
59
|
) -> "DataChain":
|
|
58
60
|
"""Get delta chain for processing changes between versions."""
|
|
59
|
-
source_dc = datachain.read_dataset(
|
|
61
|
+
source_dc = datachain.read_dataset(
|
|
62
|
+
source_ds_name,
|
|
63
|
+
namespace=source_ds_project.namespace.name,
|
|
64
|
+
project=source_ds_project.name,
|
|
65
|
+
version=source_ds_version,
|
|
66
|
+
)
|
|
60
67
|
source_dc_latest = datachain.read_dataset(
|
|
61
|
-
source_ds_name,
|
|
68
|
+
source_ds_name,
|
|
69
|
+
namespace=source_ds_project.namespace.name,
|
|
70
|
+
project=source_ds_project.name,
|
|
71
|
+
version=source_ds_latest_version,
|
|
62
72
|
)
|
|
63
73
|
|
|
64
74
|
# Calculate diff between source versions
|
|
@@ -67,12 +77,15 @@ def _get_delta_chain(
|
|
|
67
77
|
|
|
68
78
|
def _get_retry_chain(
|
|
69
79
|
name: str,
|
|
80
|
+
project: Project,
|
|
70
81
|
latest_version: str,
|
|
71
82
|
source_ds_name: str,
|
|
72
|
-
|
|
83
|
+
source_ds_project: Project,
|
|
84
|
+
source_ds_version: str,
|
|
73
85
|
on: Union[str, Sequence[str]],
|
|
74
86
|
right_on: Optional[Union[str, Sequence[str]]],
|
|
75
87
|
delta_retry: Optional[Union[bool, str]],
|
|
88
|
+
diff_chain: "DataChain",
|
|
76
89
|
) -> Optional["DataChain"]:
|
|
77
90
|
"""Get retry chain for processing error records and missing records."""
|
|
78
91
|
# Import here to avoid circular import
|
|
@@ -81,35 +94,49 @@ def _get_retry_chain(
|
|
|
81
94
|
retry_chain = None
|
|
82
95
|
|
|
83
96
|
# Read the latest version of the result dataset for retry logic
|
|
84
|
-
result_dataset = datachain.read_dataset(
|
|
85
|
-
|
|
86
|
-
|
|
97
|
+
result_dataset = datachain.read_dataset(
|
|
98
|
+
name,
|
|
99
|
+
namespace=project.namespace.name,
|
|
100
|
+
project=project.name,
|
|
101
|
+
version=latest_version,
|
|
102
|
+
)
|
|
103
|
+
source_dc = datachain.read_dataset(
|
|
104
|
+
source_ds_name,
|
|
105
|
+
namespace=source_ds_project.namespace.name,
|
|
106
|
+
project=source_ds_project.name,
|
|
107
|
+
version=source_ds_version,
|
|
87
108
|
)
|
|
88
109
|
|
|
89
110
|
# Handle error records if delta_retry is a string (column name)
|
|
90
111
|
if isinstance(delta_retry, str):
|
|
91
112
|
error_records = result_dataset.filter(C(delta_retry) != "")
|
|
92
|
-
error_source_records =
|
|
113
|
+
error_source_records = source_dc.merge(
|
|
93
114
|
error_records, on=on, right_on=right_on, inner=True
|
|
94
|
-
).select(*list(
|
|
115
|
+
).select(*list(source_dc.signals_schema.values))
|
|
95
116
|
retry_chain = error_source_records
|
|
96
117
|
|
|
97
118
|
# Handle missing records if delta_retry is True
|
|
98
119
|
elif delta_retry is True:
|
|
99
|
-
missing_records =
|
|
100
|
-
result_dataset, on=on, right_on=right_on
|
|
101
|
-
)
|
|
120
|
+
missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
|
|
102
121
|
retry_chain = missing_records
|
|
103
122
|
|
|
104
|
-
|
|
123
|
+
# Subtract also diff chain since some items might be picked
|
|
124
|
+
# up by `delta=True` itself (e.g. records got modified AND are missing in the
|
|
125
|
+
# result dataset atm)
|
|
126
|
+
return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
|
|
105
127
|
|
|
106
128
|
|
|
107
129
|
def _get_source_info(
|
|
108
130
|
name: str,
|
|
131
|
+
project: Project,
|
|
109
132
|
latest_version: str,
|
|
110
133
|
catalog,
|
|
111
134
|
) -> tuple[
|
|
112
|
-
Optional[str],
|
|
135
|
+
Optional[str],
|
|
136
|
+
Optional[Project],
|
|
137
|
+
Optional[str],
|
|
138
|
+
Optional[str],
|
|
139
|
+
Optional[list[DatasetDependency]],
|
|
113
140
|
]:
|
|
114
141
|
"""Get source dataset information and dependencies.
|
|
115
142
|
|
|
@@ -118,23 +145,34 @@ def _get_source_info(
|
|
|
118
145
|
Returns (None, None, None, None) if source dataset was removed.
|
|
119
146
|
"""
|
|
120
147
|
dependencies = catalog.get_dataset_dependencies(
|
|
121
|
-
name, latest_version, indirect=False
|
|
148
|
+
name, latest_version, project=project, indirect=False
|
|
122
149
|
)
|
|
123
150
|
|
|
124
151
|
dep = dependencies[0]
|
|
125
152
|
if not dep:
|
|
126
153
|
# Starting dataset was removed, back off to normal dataset creation
|
|
127
|
-
return None, None, None, None
|
|
154
|
+
return None, None, None, None, None
|
|
128
155
|
|
|
156
|
+
source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
|
|
129
157
|
source_ds_name = dep.name
|
|
130
158
|
source_ds_version = dep.version
|
|
131
|
-
source_ds_latest_version = catalog.get_dataset(
|
|
132
|
-
|
|
133
|
-
|
|
159
|
+
source_ds_latest_version = catalog.get_dataset(
|
|
160
|
+
source_ds_name, project=source_ds_project
|
|
161
|
+
).latest_version
|
|
162
|
+
|
|
163
|
+
return (
|
|
164
|
+
source_ds_name,
|
|
165
|
+
source_ds_project,
|
|
166
|
+
source_ds_version,
|
|
167
|
+
source_ds_latest_version,
|
|
168
|
+
dependencies,
|
|
169
|
+
)
|
|
134
170
|
|
|
135
171
|
|
|
136
172
|
def delta_retry_update(
|
|
137
173
|
dc: "DataChain",
|
|
174
|
+
namespace_name: str,
|
|
175
|
+
project_name: str,
|
|
138
176
|
name: str,
|
|
139
177
|
on: Union[str, Sequence[str]],
|
|
140
178
|
right_on: Optional[Union[str, Sequence[str]]] = None,
|
|
@@ -173,11 +211,12 @@ def delta_retry_update(
|
|
|
173
211
|
"""
|
|
174
212
|
|
|
175
213
|
catalog = dc.session.catalog
|
|
214
|
+
project = catalog.metastore.get_project(project_name, namespace_name)
|
|
176
215
|
dc._query.apply_listing_pre_step()
|
|
177
216
|
|
|
178
217
|
# Check if dataset exists
|
|
179
218
|
try:
|
|
180
|
-
dataset = catalog.get_dataset(name)
|
|
219
|
+
dataset = catalog.get_dataset(name, project=project)
|
|
181
220
|
latest_version = dataset.latest_version
|
|
182
221
|
except DatasetNotFoundError:
|
|
183
222
|
# First creation of result dataset
|
|
@@ -189,19 +228,29 @@ def delta_retry_update(
|
|
|
189
228
|
retry_chain = None
|
|
190
229
|
processing_chain = None
|
|
191
230
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
231
|
+
(
|
|
232
|
+
source_ds_name,
|
|
233
|
+
source_ds_project,
|
|
234
|
+
source_ds_version,
|
|
235
|
+
source_ds_latest_version,
|
|
236
|
+
dependencies,
|
|
237
|
+
) = _get_source_info(name, project, latest_version, catalog)
|
|
195
238
|
|
|
196
239
|
# If source_ds_name is None, starting dataset was removed
|
|
197
240
|
if source_ds_name is None:
|
|
198
241
|
return None, None, True
|
|
199
242
|
|
|
243
|
+
assert source_ds_project
|
|
200
244
|
assert source_ds_version
|
|
201
245
|
assert source_ds_latest_version
|
|
202
246
|
|
|
203
247
|
diff_chain = _get_delta_chain(
|
|
204
|
-
source_ds_name,
|
|
248
|
+
source_ds_name,
|
|
249
|
+
source_ds_project,
|
|
250
|
+
source_ds_version,
|
|
251
|
+
source_ds_latest_version,
|
|
252
|
+
on,
|
|
253
|
+
compare,
|
|
205
254
|
)
|
|
206
255
|
|
|
207
256
|
# Filter out removed dep
|
|
@@ -215,12 +264,15 @@ def delta_retry_update(
|
|
|
215
264
|
if delta_retry:
|
|
216
265
|
retry_chain = _get_retry_chain(
|
|
217
266
|
name,
|
|
267
|
+
project,
|
|
218
268
|
latest_version,
|
|
219
269
|
source_ds_name,
|
|
220
|
-
|
|
270
|
+
source_ds_project,
|
|
271
|
+
source_ds_version,
|
|
221
272
|
on,
|
|
222
273
|
right_on,
|
|
223
274
|
delta_retry,
|
|
275
|
+
diff_chain,
|
|
224
276
|
)
|
|
225
277
|
|
|
226
278
|
# Combine delta and retry chains
|
|
@@ -236,7 +288,12 @@ def delta_retry_update(
|
|
|
236
288
|
if processing_chain is None or (processing_chain and processing_chain.empty):
|
|
237
289
|
return None, None, False
|
|
238
290
|
|
|
239
|
-
latest_dataset = datachain.read_dataset(
|
|
291
|
+
latest_dataset = datachain.read_dataset(
|
|
292
|
+
name,
|
|
293
|
+
namespace=project.namespace.name,
|
|
294
|
+
project=project.name,
|
|
295
|
+
version=latest_version,
|
|
296
|
+
)
|
|
240
297
|
compared_chain = latest_dataset.diff(
|
|
241
298
|
processing_chain,
|
|
242
299
|
on=right_on or on,
|
|
@@ -14,15 +14,26 @@ from datachain.lib.file import File, ImageFile
|
|
|
14
14
|
def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]:
|
|
15
15
|
return sorted(
|
|
16
16
|
[
|
|
17
|
-
(d.name, d.version)
|
|
17
|
+
(f"{d.namespace}.{d.project}.{d.name}", d.version)
|
|
18
18
|
for d in catalog.get_dataset_dependencies(name, version, indirect=False)
|
|
19
19
|
]
|
|
20
20
|
)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
@pytest.mark.parametrize("project", ("global.dev", ""))
|
|
24
|
+
def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
|
|
24
25
|
catalog = test_session.catalog
|
|
25
|
-
|
|
26
|
+
default_namespace_name = catalog.metastore.default_namespace_name
|
|
27
|
+
default_project_name = catalog.metastore.default_project_name
|
|
28
|
+
|
|
29
|
+
if project:
|
|
30
|
+
starting_ds_name = f"{project}.starting_ds"
|
|
31
|
+
dependency_ds_name = starting_ds_name
|
|
32
|
+
else:
|
|
33
|
+
starting_ds_name = "starting_ds"
|
|
34
|
+
dependency_ds_name = (
|
|
35
|
+
f"{default_namespace_name}.{default_project_name}.{starting_ds_name}"
|
|
36
|
+
)
|
|
26
37
|
ds_name = "delta_ds"
|
|
27
38
|
|
|
28
39
|
images = [
|
|
@@ -55,12 +66,16 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
|
|
|
55
66
|
create_image_dataset(starting_ds_name, images[:2])
|
|
56
67
|
# first version of delta dataset
|
|
57
68
|
create_delta_dataset(ds_name)
|
|
58
|
-
assert _get_dependencies(catalog, ds_name, "1.0.0") == [
|
|
69
|
+
assert _get_dependencies(catalog, ds_name, "1.0.0") == [
|
|
70
|
+
(dependency_ds_name, "1.0.0")
|
|
71
|
+
]
|
|
59
72
|
# second version of starting dataset
|
|
60
73
|
create_image_dataset(starting_ds_name, images[2:])
|
|
61
74
|
# second version of delta dataset
|
|
62
75
|
create_delta_dataset(ds_name)
|
|
63
|
-
assert _get_dependencies(catalog, ds_name, "1.0.1") == [
|
|
76
|
+
assert _get_dependencies(catalog, ds_name, "1.0.1") == [
|
|
77
|
+
(dependency_ds_name, "1.0.1")
|
|
78
|
+
]
|
|
64
79
|
|
|
65
80
|
assert (dc.read_dataset(ds_name, version="1.0.0").order_by("file.path")).to_values(
|
|
66
81
|
"file.path"
|
|
@@ -30,6 +30,23 @@ def _process_with_errors(id: int, content: str, attempt: int) -> ProcessingResul
|
|
|
30
30
|
)
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
def _create_sample_data(test_session, ids=None, contents=None):
|
|
34
|
+
"""Helper function to create sample data for retry tests."""
|
|
35
|
+
ids = ids or [1, 2, 3, 4]
|
|
36
|
+
contents = contents or ["first item", "second item", "third item", "fourth item"]
|
|
37
|
+
dc.read_values(id=ids, content=contents, session=test_session).save("sample_data")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _simple_process(id: int, content: str, attempt: int = 1) -> ProcessingResult:
|
|
41
|
+
"""Helper function for simple processing in retry tests."""
|
|
42
|
+
return ProcessingResult(
|
|
43
|
+
processed_content=content.upper(),
|
|
44
|
+
processed_at=datetime.now(tz=timezone.utc).isoformat(),
|
|
45
|
+
error="",
|
|
46
|
+
attempt=attempt,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
33
50
|
def test_retry_with_error_records(test_session):
|
|
34
51
|
"""Test retry functionality with records that have errors."""
|
|
35
52
|
|
|
@@ -48,13 +65,7 @@ def test_retry_with_error_records(test_session):
|
|
|
48
65
|
)
|
|
49
66
|
|
|
50
67
|
# First processing pass - some records will fail
|
|
51
|
-
|
|
52
|
-
sample_contents = ["first item", "second item", "third item", "fourth item"]
|
|
53
|
-
|
|
54
|
-
dc.read_values(id=sample_ids, content=sample_contents, session=test_session).save(
|
|
55
|
-
"sample_data"
|
|
56
|
-
)
|
|
57
|
-
|
|
68
|
+
_create_sample_data(test_session)
|
|
58
69
|
first_pass = _run_processing(1)
|
|
59
70
|
|
|
60
71
|
# Check that some records failed
|
|
@@ -74,72 +85,103 @@ def test_retry_with_error_records(test_session):
|
|
|
74
85
|
|
|
75
86
|
def test_retry_with_missing_records(test_session):
|
|
76
87
|
"""Test retry functionality with missing records."""
|
|
77
|
-
|
|
78
|
-
source_ids = [1, 2, 3]
|
|
79
|
-
source_contents = ["first", "second", "third"]
|
|
88
|
+
_create_sample_data(test_session)
|
|
80
89
|
|
|
81
|
-
|
|
82
|
-
|
|
90
|
+
# Process only first 2 records
|
|
91
|
+
# Create partial result dataset (missing id=3)
|
|
92
|
+
partial_result = (
|
|
93
|
+
dc.read_dataset("sample_data", session=test_session)
|
|
94
|
+
.setup(attempt=lambda: 1)
|
|
95
|
+
.filter(C("id") < 3)
|
|
96
|
+
.map(result=_simple_process)
|
|
97
|
+
.save("partial_result")
|
|
83
98
|
)
|
|
84
99
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
100
|
+
assert partial_result.count() == 2
|
|
101
|
+
|
|
102
|
+
# Use retry with delta_retry=True to process missing records
|
|
103
|
+
retry_chain = (
|
|
104
|
+
dc.read_dataset(
|
|
105
|
+
"sample_data",
|
|
106
|
+
session=test_session,
|
|
107
|
+
delta=True,
|
|
108
|
+
delta_on="id",
|
|
109
|
+
delta_retry=True,
|
|
91
110
|
)
|
|
111
|
+
.setup(attempt=lambda: 2)
|
|
112
|
+
.map(result=_simple_process)
|
|
113
|
+
.save("partial_result")
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# Should now have all 4 records
|
|
117
|
+
assert retry_chain.count() == 4
|
|
118
|
+
|
|
119
|
+
# Verify all records are present
|
|
120
|
+
ids = set(retry_chain.to_values("id"))
|
|
121
|
+
assert ids == {1, 2, 3, 4}
|
|
122
|
+
|
|
123
|
+
final_first_attempts_count = retry_chain.filter(C("result.attempt") == 1).count()
|
|
124
|
+
final_missing_attempts_count = retry_chain.filter(C("result.attempt") == 2).count()
|
|
125
|
+
|
|
126
|
+
# Only missing records should have attempt 2
|
|
127
|
+
assert final_missing_attempts_count == 2
|
|
128
|
+
assert final_first_attempts_count == 2
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def test_retry_with_missing_and_new_records(test_session):
|
|
132
|
+
"""Test retry functionality with missing records (e.g. ignored
|
|
133
|
+
in first pass since they failed). Also we add new records to the source
|
|
134
|
+
to test that retry and delta don't pick records twice."""
|
|
135
|
+
_create_sample_data(test_session)
|
|
92
136
|
|
|
93
137
|
# Process only first 2 records
|
|
94
138
|
# Create partial result dataset (missing id=3)
|
|
95
139
|
partial_result = (
|
|
96
|
-
dc.read_dataset("
|
|
140
|
+
dc.read_dataset("sample_data", session=test_session)
|
|
97
141
|
.setup(attempt=lambda: 1)
|
|
98
142
|
.filter(C("id") < 3)
|
|
99
|
-
.map(result=
|
|
143
|
+
.map(result=_simple_process)
|
|
100
144
|
.save("partial_result")
|
|
101
145
|
)
|
|
102
146
|
|
|
103
147
|
assert partial_result.count() == 2
|
|
104
148
|
|
|
149
|
+
ids = [1, 2, 3, 4, 5]
|
|
150
|
+
contents = ["first item", "second item", "third item", "fourth item", "fifth item"]
|
|
151
|
+
_create_sample_data(test_session, ids, contents)
|
|
152
|
+
|
|
105
153
|
# Use retry with delta_retry=True to process missing records
|
|
106
154
|
retry_chain = (
|
|
107
155
|
dc.read_dataset(
|
|
108
|
-
"
|
|
156
|
+
"sample_data",
|
|
109
157
|
session=test_session,
|
|
110
158
|
delta=True,
|
|
111
159
|
delta_on="id",
|
|
112
160
|
delta_retry=True,
|
|
113
161
|
)
|
|
114
162
|
.setup(attempt=lambda: 2)
|
|
115
|
-
.map(result=
|
|
163
|
+
.map(result=_simple_process)
|
|
116
164
|
.save("partial_result")
|
|
117
165
|
)
|
|
118
166
|
|
|
119
167
|
# Should now have all 3 records
|
|
120
|
-
assert retry_chain.count() ==
|
|
168
|
+
assert retry_chain.count() == 5
|
|
121
169
|
|
|
122
170
|
# Verify all records are present
|
|
123
171
|
ids = set(retry_chain.to_values("id"))
|
|
124
|
-
assert ids == {1, 2, 3}
|
|
172
|
+
assert ids == {1, 2, 3, 4, 5}
|
|
125
173
|
|
|
126
174
|
final_first_attempts_count = retry_chain.filter(C("result.attempt") == 1).count()
|
|
127
175
|
final_missing_attempts_count = retry_chain.filter(C("result.attempt") == 2).count()
|
|
128
176
|
|
|
129
177
|
# Only missing records should have attempt 2
|
|
130
|
-
assert final_missing_attempts_count ==
|
|
178
|
+
assert final_missing_attempts_count == 3
|
|
131
179
|
assert final_first_attempts_count == 2
|
|
132
180
|
|
|
133
181
|
|
|
134
182
|
def test_retry_no_records_to_retry(test_session):
|
|
135
183
|
"""Test retry when no records need to be retried."""
|
|
136
|
-
|
|
137
|
-
source_ids = [1, 2]
|
|
138
|
-
source_contents = ["first", "second"]
|
|
139
|
-
|
|
140
|
-
dc.read_values(id=source_ids, content=source_contents, session=test_session).save(
|
|
141
|
-
"source_data"
|
|
142
|
-
)
|
|
184
|
+
_create_sample_data(test_session, ids=[1, 2], contents=["first", "second"])
|
|
143
185
|
|
|
144
186
|
def successful_process(id: int, content: str) -> ProcessingResult:
|
|
145
187
|
return ProcessingResult(
|
|
@@ -151,7 +193,7 @@ def test_retry_no_records_to_retry(test_session):
|
|
|
151
193
|
|
|
152
194
|
# First pass - all succeed
|
|
153
195
|
first_pass = (
|
|
154
|
-
dc.read_dataset("
|
|
196
|
+
dc.read_dataset("sample_data", session=test_session)
|
|
155
197
|
.map(result=successful_process)
|
|
156
198
|
.save("successful_data")
|
|
157
199
|
)
|
|
@@ -162,7 +204,7 @@ def test_retry_no_records_to_retry(test_session):
|
|
|
162
204
|
# Retry - should not create a new version since no records need retry
|
|
163
205
|
(
|
|
164
206
|
dc.read_dataset(
|
|
165
|
-
"
|
|
207
|
+
"sample_data",
|
|
166
208
|
session=test_session,
|
|
167
209
|
delta=True,
|
|
168
210
|
delta_on="id",
|
|
@@ -179,32 +221,20 @@ def test_retry_no_records_to_retry(test_session):
|
|
|
179
221
|
|
|
180
222
|
def test_retry_first_dataset_creation(test_session):
|
|
181
223
|
"""Test retry when dataset doesn't exist yet (first creation)."""
|
|
182
|
-
|
|
183
|
-
source_contents = ["first", "second"]
|
|
184
|
-
|
|
185
|
-
dc.read_values(id=source_ids, content=source_contents, session=test_session).save(
|
|
186
|
-
"source_data"
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
def simple_process(id: int, content: str) -> ProcessingResult:
|
|
190
|
-
return ProcessingResult(
|
|
191
|
-
processed_content=content.upper(),
|
|
192
|
-
processed_at=datetime.now(tz=timezone.utc).isoformat(),
|
|
193
|
-
error="",
|
|
194
|
-
attempt=1,
|
|
195
|
-
)
|
|
224
|
+
_create_sample_data(test_session, ids=[1, 2], contents=["first", "second"])
|
|
196
225
|
|
|
197
226
|
# First run with retry enabled on non-existent dataset
|
|
198
227
|
# Should process all records
|
|
199
228
|
retry_chain = (
|
|
200
229
|
dc.read_dataset(
|
|
201
|
-
"
|
|
230
|
+
"sample_data",
|
|
202
231
|
session=test_session,
|
|
203
232
|
delta=True,
|
|
204
233
|
delta_on="id",
|
|
205
234
|
delta_retry="result.error",
|
|
206
235
|
)
|
|
207
|
-
.
|
|
236
|
+
.setup(attempt=lambda: 1)
|
|
237
|
+
.map(result=_simple_process)
|
|
208
238
|
.save("new_dataset")
|
|
209
239
|
)
|
|
210
240
|
|
|
@@ -311,3 +341,87 @@ def test_retry_with_delta_functionality(test_session):
|
|
|
311
341
|
(2, "", 1),
|
|
312
342
|
(3, "", 2),
|
|
313
343
|
}
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def test_delta_and_delta_retry_no_duplicates(test_session):
|
|
347
|
+
"""Test that delta and delta_retry work together without creating duplicates
|
|
348
|
+
when the same records are picked up for different reasons:
|
|
349
|
+
- delta_retry=True picks up unprocessed records missing from result dataset
|
|
350
|
+
- delta=True picks up modified records from source dataset
|
|
351
|
+
"""
|
|
352
|
+
_create_sample_data(test_session)
|
|
353
|
+
|
|
354
|
+
# First pass - process only records 1 and 2
|
|
355
|
+
partial_result = (
|
|
356
|
+
dc.read_dataset("sample_data", session=test_session)
|
|
357
|
+
.setup(attempt=lambda: 1)
|
|
358
|
+
.filter(C("id") < 3) # Only process id=1,2, leaving id=3,4 unprocessed
|
|
359
|
+
.map(result=_simple_process)
|
|
360
|
+
.save("delta_retry_combined_result")
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
assert partial_result.count() == 2
|
|
364
|
+
initial_results = set(partial_result.to_iter("id", "result.attempt"))
|
|
365
|
+
assert initial_results == {(1, 1), (2, 1)}
|
|
366
|
+
|
|
367
|
+
# Modify the source data - update content for records 3 and 4
|
|
368
|
+
# This will make delta=True pick them up as "changed"
|
|
369
|
+
# But delta_retry=True will also pick them up as "missing from result"
|
|
370
|
+
modified_ids = [1, 2, 3, 4]
|
|
371
|
+
modified_contents = [
|
|
372
|
+
"first item", # unchanged
|
|
373
|
+
"second item", # unchanged
|
|
374
|
+
"MODIFIED third item", # modified - delta will pick this up
|
|
375
|
+
"MODIFIED fourth item", # modified - delta will pick this up
|
|
376
|
+
]
|
|
377
|
+
_create_sample_data(test_session, modified_ids, modified_contents)
|
|
378
|
+
|
|
379
|
+
# Second pass with both delta=True and delta_retry=True
|
|
380
|
+
# Records 3,4 should be picked up by BOTH:
|
|
381
|
+
# - delta_retry=True (because they're missing from result dataset)
|
|
382
|
+
# - delta=True (because their content was modified in source)
|
|
383
|
+
# But they should only be processed ONCE (no duplicates)
|
|
384
|
+
combined_result = (
|
|
385
|
+
dc.read_dataset(
|
|
386
|
+
"sample_data",
|
|
387
|
+
session=test_session,
|
|
388
|
+
delta=True,
|
|
389
|
+
delta_on="id",
|
|
390
|
+
delta_retry=True,
|
|
391
|
+
)
|
|
392
|
+
.setup(attempt=lambda: 2)
|
|
393
|
+
.map(result=_simple_process)
|
|
394
|
+
.save("delta_retry_combined_result")
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
# Should have 4 total records: 2 from first pass + 2 newly processed
|
|
398
|
+
assert combined_result.count() == 4
|
|
399
|
+
|
|
400
|
+
# Get all results and verify no duplicates
|
|
401
|
+
all_results = set(
|
|
402
|
+
combined_result.to_iter("id", "result.attempt", "result.processed_content")
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
# Records 1,2 should have attempt=1 (from first pass)
|
|
406
|
+
# Records 3,4 should have attempt=2 (from second pass) and MODIFIED content
|
|
407
|
+
expected_results = {
|
|
408
|
+
(1, 1, "FIRST ITEM"),
|
|
409
|
+
(2, 1, "SECOND ITEM"),
|
|
410
|
+
(3, 2, "MODIFIED THIRD ITEM"),
|
|
411
|
+
(4, 2, "MODIFIED FOURTH ITEM"),
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
assert all_results == expected_results
|
|
415
|
+
|
|
416
|
+
# Verify counts by attempt
|
|
417
|
+
first_attempt_count = combined_result.filter(C("result.attempt") == 1).count()
|
|
418
|
+
second_attempt_count = combined_result.filter(C("result.attempt") == 2).count()
|
|
419
|
+
|
|
420
|
+
assert first_attempt_count == 2 # Records 1,2 from first pass
|
|
421
|
+
assert second_attempt_count == 2 # Records 3,4 from second pass (no duplicates)
|
|
422
|
+
|
|
423
|
+
# Verify that each id appears exactly once
|
|
424
|
+
ids_in_result = list(combined_result.to_values("id"))
|
|
425
|
+
assert len(ids_in_result) == 4
|
|
426
|
+
assert len(set(ids_in_result)) == 4 # No duplicate IDs
|
|
427
|
+
assert set(ids_in_result) == {1, 2, 3, 4}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|