datachain 0.36.6__tar.gz → 0.37.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.36.6 → datachain-0.37.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.36.6 → datachain-0.37.1}/PKG-INFO +2 -2
- datachain-0.37.1/docs/guide/checkpoints.md +207 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/index.md +1 -0
- {datachain-0.36.6 → datachain-0.37.1}/mkdocs.yml +1 -0
- {datachain-0.36.6 → datachain-0.37.1}/pyproject.toml +1 -1
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/catalog/catalog.py +2 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/metastore.py +16 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/delta.py +3 -1
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/diff/__init__.py +3 -1
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/job.py +1 -1
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/datachain.py +10 -17
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/records.py +0 -2
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/dataset.py +0 -4
- datachain-0.37.1/src/datachain/query/session.py +347 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.36.6 → datachain-0.37.1}/tests/conftest.py +21 -4
- datachain-0.37.1/tests/func/test_checkpoints.py +52 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_datachain.py +0 -20
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_datasets.py +6 -4
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_delta.py +64 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_session.py +6 -3
- {datachain-0.36.6 → datachain-0.37.1}/tests/scripts/feature_class_exception.py +13 -8
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_atomicity.py +7 -4
- datachain-0.37.1/tests/test_job_management_e2e.py +158 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_query_e2e.py +5 -4
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_checkpoints.py +53 -41
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_diff.py +19 -1
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_datachain_hash.py +1 -1
- datachain-0.37.1/tests/unit/test_job_management.py +174 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/utils.py +25 -0
- datachain-0.36.6/src/datachain/query/session.py +0 -205
- {datachain-0.36.6 → datachain-0.37.1}/.cruft.json +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.gitattributes +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/codecov.yaml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/dependabot.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/workflows/release.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/workflows/tests.yml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/.gitignore +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/LICENSE +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/README.rst +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/api_hooks.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/assets/webhook_dialog.png +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/assets/webhook_list.png +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/auth/login.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/auth/logout.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/auth/team.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/auth/token.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/index.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/job/cancel.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/job/clusters.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/job/logs.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/job/ls.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/commands/job/run.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/contributing.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/examples.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/db_migrations.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/delta.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/env.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/namespaces.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/processing.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/remotes.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/guide/retry.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/index.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/overrides/main.html +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/quick-start.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/datachain.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/func.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/array.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/conditional.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/numeric.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/path.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/random.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/string.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/functions/window.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/index.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/toolkit.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/torch.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/references/udf.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/studio/api/.gitkeep +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/studio/webhooks.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/templates/main.dot +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/templates/operation.dot +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/templates/responses.def +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/docs/tutorials.md +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/nested_datamodel.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/noxfile.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/setup.cfg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/__main__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/asyn.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cache.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/catalog/dependency.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/checkpoint.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/http.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/local.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/config.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/dataset.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/error.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/array.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/base.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/func.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/path.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/random.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/string.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/func/window.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/hash_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/audio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/storage_pattern.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/projects.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/listing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/namespace.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/node.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/plugins.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/progress.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/project.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/py.typed +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/params.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/semver.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/studio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain/utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/data.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/examples/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_array.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_path.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_random.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/functions/test_string.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_audio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_client.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_file.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_hf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_image.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_listing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_ls.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_metastore.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_mutate.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_pull.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_query.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_read_database.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_retry.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_storage_pattern.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_temp_table_tracking.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_to_database.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_udf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_union.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_video.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_import_time.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/test_telemetry.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_audio.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_storage_pattern.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_batching.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_cli_datasets.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_client.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_client_http.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_config.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_func.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_hash_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_query.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_query_steps_hash.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_semver.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_session.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.36.6 → datachain-0.37.1}/tests/unit/test_warehouse.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.36.6
|
|
3
|
+
Version: 0.37.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -41,7 +41,7 @@ Requires-Dist: cloudpickle
|
|
|
41
41
|
Requires-Dist: pydantic
|
|
42
42
|
Requires-Dist: jmespath>=1.0
|
|
43
43
|
Requires-Dist: datamodel-code-generator>=0.25
|
|
44
|
-
Requires-Dist: Pillow<
|
|
44
|
+
Requires-Dist: Pillow<13,>=10.0.0
|
|
45
45
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
46
46
|
Requires-Dist: psutil
|
|
47
47
|
Requires-Dist: huggingface_hub
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Checkpoints
|
|
2
|
+
|
|
3
|
+
Checkpoints allow DataChain to automatically skip re-creating datasets that were successfully saved in previous script runs. When a script fails or is interrupted, you can re-run it and DataChain will resume from where it left off, reusing datasets that were already created.
|
|
4
|
+
|
|
5
|
+
**Note:** Checkpoints are currently available only for local script runs. Support for Studio is planned for future releases.
|
|
6
|
+
|
|
7
|
+
## How Checkpoints Work
|
|
8
|
+
|
|
9
|
+
When you run a Python script locally (e.g., `python my_script.py`), DataChain automatically:
|
|
10
|
+
|
|
11
|
+
1. **Creates a job** for the script execution, using the script's absolute path as the job name
|
|
12
|
+
2. **Tracks parent jobs** by finding the last job with the same script name
|
|
13
|
+
3. **Calculates hashes** for each dataset save operation based on the DataChain operations chain
|
|
14
|
+
4. **Creates checkpoints** after each successful `.save()` call, storing the hash
|
|
15
|
+
5. **Checks for existing checkpoints** on subsequent runs - if a matching checkpoint exists in the parent job, DataChain skips the save and reuses the existing dataset
|
|
16
|
+
|
|
17
|
+
This means that if your script creates multiple datasets and fails partway through, the next run will skip recreating the datasets that were already successfully saved.
|
|
18
|
+
|
|
19
|
+
## Example
|
|
20
|
+
|
|
21
|
+
Consider this script that processes data in multiple stages:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import datachain as dc
|
|
25
|
+
|
|
26
|
+
# Stage 1: Load and filter data
|
|
27
|
+
filtered = (
|
|
28
|
+
dc.read_csv("s3://mybucket/data.csv")
|
|
29
|
+
.filter(dc.C("score") > 0.5)
|
|
30
|
+
.save("filtered_data")
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Stage 2: Transform data
|
|
34
|
+
transformed = (
|
|
35
|
+
filtered
|
|
36
|
+
.map(value=lambda x: x * 2, output=float)
|
|
37
|
+
.save("transformed_data")
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Stage 3: Aggregate results
|
|
41
|
+
result = (
|
|
42
|
+
transformed
|
|
43
|
+
.agg(
|
|
44
|
+
total=lambda values: sum(values),
|
|
45
|
+
partition_by="category",
|
|
46
|
+
)
|
|
47
|
+
.save("final_results")
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**First run:** The script executes all three stages and creates three datasets: `filtered_data`, `transformed_data`, and `final_results`. If the script fails during Stage 3, only `filtered_data` and `transformed_data` are saved.
|
|
52
|
+
|
|
53
|
+
**Second run:** DataChain detects that `filtered_data` and `transformed_data` were already created in the parent job with matching hashes. It skips recreating them and proceeds directly to Stage 3, creating only `final_results`.
|
|
54
|
+
|
|
55
|
+
## When Checkpoints Are Used
|
|
56
|
+
|
|
57
|
+
Checkpoints are automatically used when:
|
|
58
|
+
|
|
59
|
+
- Running a Python script locally (e.g., `python my_script.py`)
|
|
60
|
+
- The script has been run before
|
|
61
|
+
- A dataset with the same name is being saved
|
|
62
|
+
- The chain hash matches a checkpoint from the parent job
|
|
63
|
+
|
|
64
|
+
Checkpoints are **not** used when:
|
|
65
|
+
|
|
66
|
+
- Running code interactively (Python REPL, Jupyter notebooks)
|
|
67
|
+
- Running code as a module (e.g., `python -m mymodule`)
|
|
68
|
+
- The `DATACHAIN_CHECKPOINTS_RESET` environment variable is set (see below)
|
|
69
|
+
- Running on Studio (checkpoints support planned for future releases)
|
|
70
|
+
|
|
71
|
+
## Resetting Checkpoints
|
|
72
|
+
|
|
73
|
+
To ignore existing checkpoints and run your script from scratch, set the `DATACHAIN_CHECKPOINTS_RESET` environment variable:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
export DATACHAIN_CHECKPOINTS_RESET=1
|
|
77
|
+
python my_script.py
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Or set it inline:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
DATACHAIN_CHECKPOINTS_RESET=1 python my_script.py
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
This forces DataChain to recreate all datasets, regardless of existing checkpoints.
|
|
87
|
+
|
|
88
|
+
## How Job Names Are Determined
|
|
89
|
+
|
|
90
|
+
DataChain uses different strategies for naming jobs depending on how the code is executed:
|
|
91
|
+
|
|
92
|
+
### Script Execution (Checkpoints Enabled)
|
|
93
|
+
|
|
94
|
+
When running `python my_script.py`, DataChain uses the **absolute path** to the script as the job name:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
/home/user/projects/my_script.py
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
This allows DataChain to link runs of the same script together as parent-child jobs, enabling checkpoint lookup.
|
|
101
|
+
|
|
102
|
+
### Interactive or Module Execution (Checkpoints Disabled)
|
|
103
|
+
|
|
104
|
+
When running code interactively or as a module, DataChain uses a **unique UUID** as the job name:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
a1b2c3d4-e5f6-7890-abcd-ef1234567890
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
This prevents unrelated executions from being linked together, but also means checkpoints cannot be used.
|
|
111
|
+
|
|
112
|
+
## How Checkpoint Hashes Are Calculated
|
|
113
|
+
|
|
114
|
+
For each `.save()` operation, DataChain calculates a hash based on:
|
|
115
|
+
|
|
116
|
+
1. The hash of the previous checkpoint in the current job (if any)
|
|
117
|
+
2. The hash of the current DataChain operations chain
|
|
118
|
+
|
|
119
|
+
This creates a chain of hashes that uniquely identifies each stage of data processing. On subsequent runs, DataChain matches these hashes against the parent job's checkpoints and skips recreating datasets where the hashes match.
|
|
120
|
+
|
|
121
|
+
### Hash Invalidation
|
|
122
|
+
|
|
123
|
+
**Checkpoints are automatically invalidated when you modify the chain.** Any change to the DataChain operations will result in a different hash, causing DataChain to ignore the existing checkpoint and recompute the dataset.
|
|
124
|
+
|
|
125
|
+
Changes that invalidate checkpoints include:
|
|
126
|
+
|
|
127
|
+
- **Modifying filter conditions:** `.filter(dc.C("score") > 0.5)` → `.filter(dc.C("score") > 0.8)`
|
|
128
|
+
- **Changing map/gen/agg functions:** Any modification to UDF logic
|
|
129
|
+
- **Altering function parameters:** Changes to column names, output types, or other parameters
|
|
130
|
+
- **Adding or removing operations:** Inserting new `.filter()`, `.map()`, or other steps
|
|
131
|
+
- **Reordering operations:** Changing the sequence of transformations
|
|
132
|
+
|
|
133
|
+
### Example
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# First run - creates three checkpoints
|
|
137
|
+
dc.read_csv("data.csv").save("stage1") # Hash = H1
|
|
138
|
+
|
|
139
|
+
dc.read_dataset("stage1").filter(dc.C("x") > 5).save("stage2") # Hash = H2 = hash(H1 + pipeline_hash)
|
|
140
|
+
|
|
141
|
+
dc.read_dataset("stage2").select("name", "value").save("stage3") # Hash = H3 = hash(H2 + pipeline_hash)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Second run (no changes):**
|
|
145
|
+
- All three hashes match → all three datasets are reused → no computation
|
|
146
|
+
|
|
147
|
+
**Second run (modified filter):**
|
|
148
|
+
```python
|
|
149
|
+
dc.read_csv("data.csv").save("stage1") # Hash = H1 matches ✓ → reused
|
|
150
|
+
|
|
151
|
+
dc.read_dataset("stage1").filter(dc.C("x") > 10).save("stage2") # Hash ≠ H2 ✗ → recomputed
|
|
152
|
+
|
|
153
|
+
dc.read_dataset("stage2").select("name", "value").save("stage3") # Hash ≠ H3 ✗ → recomputed
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Because the filter changed, `stage2` has a different hash and must be recomputed. Since `stage3` depends on `stage2`, its hash also changes (because it includes H2 in the calculation), so it must be recomputed as well.
|
|
157
|
+
|
|
158
|
+
**Key insight:** Modifying any step in the chain invalidates that checkpoint and all subsequent checkpoints, because the hash chain is broken.
|
|
159
|
+
|
|
160
|
+
## Dataset Persistence
|
|
161
|
+
|
|
162
|
+
Starting with the checkpoints feature, datasets created during script execution persist even if the script fails or is interrupted. This is essential for checkpoint functionality, as it allows subsequent runs to reuse successfully created datasets.
|
|
163
|
+
|
|
164
|
+
If you need to clean up datasets from failed runs, you can use:
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
import datachain as dc
|
|
168
|
+
|
|
169
|
+
# Remove a specific dataset
|
|
170
|
+
dc.delete_dataset("dataset_name")
|
|
171
|
+
|
|
172
|
+
# List all datasets to see what's available
|
|
173
|
+
for ds in dc.datasets():
|
|
174
|
+
print(ds.name)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Limitations
|
|
178
|
+
|
|
179
|
+
- **Local only:** Checkpoints currently work only for local script runs. Studio support is planned.
|
|
180
|
+
- **Script-based:** Code must be run as a script (not interactively or as a module).
|
|
181
|
+
- **Hash-based matching:** Any change to the chain will create a different hash, preventing checkpoint reuse.
|
|
182
|
+
- **Same script path:** The script must be run from the same absolute path for parent job linking to work.
|
|
183
|
+
|
|
184
|
+
## Future Plans
|
|
185
|
+
|
|
186
|
+
### Studio Support
|
|
187
|
+
|
|
188
|
+
Support for checkpoints on Studio is planned for future releases, which will enable checkpoint functionality for collaborative workflows and cloud-based data processing.
|
|
189
|
+
|
|
190
|
+
### UDF-Level Checkpoints
|
|
191
|
+
|
|
192
|
+
Currently, checkpoints are created only when datasets are saved using `.save()`. This means that if a script fails during a long-running UDF operation (like `.map()`, `.gen()`, or `.agg()`), the entire UDF computation must be rerun on the next execution.
|
|
193
|
+
|
|
194
|
+
Future versions will support **UDF-level checkpoints**, creating checkpoints after each UDF step in the chain. This will provide much more granular recovery:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
# Future behavior with UDF-level checkpoints
|
|
198
|
+
result = (
|
|
199
|
+
dc.read_csv("data.csv")
|
|
200
|
+
.map(heavy_computation_1) # Checkpoint created after this UDF
|
|
201
|
+
.map(heavy_computation_2) # Checkpoint created after this UDF
|
|
202
|
+
.map(heavy_computation_3) # Checkpoint created after this UDF
|
|
203
|
+
.save("result")
|
|
204
|
+
)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
If the script fails during `heavy_computation_3`, the next run will skip re-executing `heavy_computation_1` and `heavy_computation_2`, resuming only the work that wasn't completed.
|
|
@@ -10,6 +10,7 @@ Welcome to the DataChain User Guide! This section provides comprehensive documen
|
|
|
10
10
|
- [Data Processing Overview](./processing.md) - Discover DataChain's specialized data processing features.
|
|
11
11
|
- [Delta Processing](./delta.md) - Incremental data processing to efficiently handle large datasets that change over time.
|
|
12
12
|
- [Error Handling and Retries](./retry.md) - Learn how to handle processing errors and selectively reprocess problematic records.
|
|
13
|
+
- [Checkpoints](./checkpoints.md) - Automatically resume script execution from where it left off after failures.
|
|
13
14
|
- [Environment Variables](./env.md) - Configure DataChain's behavior using environment variables.
|
|
14
15
|
- [Namespaces](./namespaces.md) - Learn more about namespaces and projects.
|
|
15
16
|
- [Local DB Migrations](./db_migrations.md) - Learn how to handle local DB migrations after upgrading datachain.
|
|
@@ -114,6 +114,7 @@ nav:
|
|
|
114
114
|
- Overview: guide/processing.md
|
|
115
115
|
- Delta Processing: guide/delta.md
|
|
116
116
|
- Error Handling and Retries: guide/retry.md
|
|
117
|
+
- Checkpoints: guide/checkpoints.md
|
|
117
118
|
- Environment Variables: guide/env.md
|
|
118
119
|
- Namespaces: guide/namespaces.md
|
|
119
120
|
- Local DB Migrations: guide/db_migrations.md
|
|
@@ -793,6 +793,7 @@ class Catalog:
|
|
|
793
793
|
description: str | None = None,
|
|
794
794
|
attrs: list[str] | None = None,
|
|
795
795
|
update_version: str | None = "patch",
|
|
796
|
+
job_id: str | None = None,
|
|
796
797
|
) -> "DatasetRecord":
|
|
797
798
|
"""
|
|
798
799
|
Creates new dataset of a specific version.
|
|
@@ -866,6 +867,7 @@ class Catalog:
|
|
|
866
867
|
create_rows_table=create_rows,
|
|
867
868
|
columns=columns,
|
|
868
869
|
uuid=uuid,
|
|
870
|
+
job_id=job_id,
|
|
869
871
|
)
|
|
870
872
|
|
|
871
873
|
def create_new_dataset_version(
|
|
@@ -448,6 +448,10 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
448
448
|
def get_job_status(self, job_id: str) -> JobStatus | None:
|
|
449
449
|
"""Returns the status of the given job."""
|
|
450
450
|
|
|
451
|
+
@abstractmethod
|
|
452
|
+
def get_last_job_by_name(self, name: str, conn=None) -> "Job | None":
|
|
453
|
+
"""Returns the last job with the given name, ordered by created_at."""
|
|
454
|
+
|
|
451
455
|
#
|
|
452
456
|
# Checkpoints
|
|
453
457
|
#
|
|
@@ -1685,6 +1689,18 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1685
1689
|
query = self._jobs_query().where(self._jobs.c.id.in_(ids))
|
|
1686
1690
|
yield from self._parse_jobs(self.db.execute(query, conn=conn))
|
|
1687
1691
|
|
|
1692
|
+
def get_last_job_by_name(self, name: str, conn=None) -> "Job | None":
|
|
1693
|
+
query = (
|
|
1694
|
+
self._jobs_query()
|
|
1695
|
+
.where(self._jobs.c.name == name)
|
|
1696
|
+
.order_by(self._jobs.c.created_at.desc())
|
|
1697
|
+
.limit(1)
|
|
1698
|
+
)
|
|
1699
|
+
results = list(self.db.execute(query, conn=conn))
|
|
1700
|
+
if not results:
|
|
1701
|
+
return None
|
|
1702
|
+
return self._parse_job(results[0])
|
|
1703
|
+
|
|
1688
1704
|
def create_job(
|
|
1689
1705
|
self,
|
|
1690
1706
|
name: str,
|
|
@@ -200,7 +200,9 @@ def _get_source_info(
|
|
|
200
200
|
indirect=False,
|
|
201
201
|
)
|
|
202
202
|
|
|
203
|
-
source_ds_dep = next(
|
|
203
|
+
source_ds_dep = next(
|
|
204
|
+
(d for d in dependencies if d and d.name == source_ds.name), None
|
|
205
|
+
)
|
|
204
206
|
if not source_ds_dep:
|
|
205
207
|
# Starting dataset was removed, back off to normal dataset creation
|
|
206
208
|
return None, None, None, None, None
|
|
@@ -103,8 +103,10 @@ def _compare( # noqa: C901
|
|
|
103
103
|
left = left.mutate(**{ldiff_col: 1})
|
|
104
104
|
right = right.mutate(**{rdiff_col: 1})
|
|
105
105
|
|
|
106
|
-
if
|
|
106
|
+
if compare is None:
|
|
107
107
|
modified_cond = True
|
|
108
|
+
elif len(compare) == 0:
|
|
109
|
+
modified_cond = False
|
|
108
110
|
else:
|
|
109
111
|
modified_cond = or_( # type: ignore[assignment]
|
|
110
112
|
*[
|
|
@@ -27,7 +27,6 @@ from datachain import semver
|
|
|
27
27
|
from datachain.dataset import DatasetRecord
|
|
28
28
|
from datachain.delta import delta_disabled
|
|
29
29
|
from datachain.error import (
|
|
30
|
-
JobNotFoundError,
|
|
31
30
|
ProjectCreateNotAllowedError,
|
|
32
31
|
ProjectNotFoundError,
|
|
33
32
|
)
|
|
@@ -627,6 +626,9 @@ class DataChain:
|
|
|
627
626
|
self._validate_version(version)
|
|
628
627
|
self._validate_update_version(update_version)
|
|
629
628
|
|
|
629
|
+
# get the existing job if running in SaaS, or create a new one if running locally
|
|
630
|
+
job = self.session.get_or_create_job()
|
|
631
|
+
|
|
630
632
|
namespace_name, project_name, name = catalog.get_full_dataset_name(
|
|
631
633
|
name,
|
|
632
634
|
namespace_name=self._settings.namespace,
|
|
@@ -635,7 +637,7 @@ class DataChain:
|
|
|
635
637
|
project = self._get_or_create_project(namespace_name, project_name)
|
|
636
638
|
|
|
637
639
|
# Checkpoint handling
|
|
638
|
-
|
|
640
|
+
_hash, result = self._resolve_checkpoint(name, project, job, kwargs)
|
|
639
641
|
|
|
640
642
|
# Schema preparation
|
|
641
643
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
@@ -655,13 +657,12 @@ class DataChain:
|
|
|
655
657
|
attrs=attrs,
|
|
656
658
|
feature_schema=schema,
|
|
657
659
|
update_version=update_version,
|
|
660
|
+
job_id=job.id,
|
|
658
661
|
**kwargs,
|
|
659
662
|
)
|
|
660
663
|
)
|
|
661
664
|
|
|
662
|
-
|
|
663
|
-
catalog.metastore.create_checkpoint(job.id, _hash) # type: ignore[arg-type]
|
|
664
|
-
|
|
665
|
+
catalog.metastore.create_checkpoint(job.id, _hash) # type: ignore[arg-type]
|
|
665
666
|
return result
|
|
666
667
|
|
|
667
668
|
def _validate_version(self, version: str | None) -> None:
|
|
@@ -690,23 +691,15 @@ class DataChain:
|
|
|
690
691
|
self,
|
|
691
692
|
name: str,
|
|
692
693
|
project: Project,
|
|
694
|
+
job: Job,
|
|
693
695
|
kwargs: dict,
|
|
694
|
-
) -> tuple[
|
|
696
|
+
) -> tuple[str, "DataChain | None"]:
|
|
695
697
|
"""Check if checkpoint exists and return cached dataset if possible."""
|
|
696
698
|
from .datasets import read_dataset
|
|
697
699
|
|
|
698
700
|
metastore = self.session.catalog.metastore
|
|
699
|
-
|
|
700
|
-
job_id = os.getenv("DATACHAIN_JOB_ID")
|
|
701
701
|
checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)
|
|
702
702
|
|
|
703
|
-
if not job_id:
|
|
704
|
-
return None, None, None
|
|
705
|
-
|
|
706
|
-
job = metastore.get_job(job_id)
|
|
707
|
-
if not job:
|
|
708
|
-
raise JobNotFoundError(f"Job with id {job_id} not found")
|
|
709
|
-
|
|
710
703
|
_hash = self._calculate_job_hash(job.id)
|
|
711
704
|
|
|
712
705
|
if (
|
|
@@ -718,9 +711,9 @@ class DataChain:
|
|
|
718
711
|
chain = read_dataset(
|
|
719
712
|
name, namespace=project.namespace.name, project=project.name, **kwargs
|
|
720
713
|
)
|
|
721
|
-
return
|
|
714
|
+
return _hash, chain
|
|
722
715
|
|
|
723
|
-
return
|
|
716
|
+
return _hash, None
|
|
724
717
|
|
|
725
718
|
def _handle_delta(
|
|
726
719
|
self,
|
|
@@ -1927,10 +1927,6 @@ class DatasetQuery:
|
|
|
1927
1927
|
)
|
|
1928
1928
|
version = version or dataset.latest_version
|
|
1929
1929
|
|
|
1930
|
-
self.session.add_dataset_version(
|
|
1931
|
-
dataset=dataset, version=version, listing=kwargs.get("listing", False)
|
|
1932
|
-
)
|
|
1933
|
-
|
|
1934
1930
|
dr = self.catalog.warehouse.dataset_rows(dataset)
|
|
1935
1931
|
|
|
1936
1932
|
self.catalog.warehouse.copy_table(dr.get_table(), query.select())
|