datachain-0.24.0.tar.gz → datachain-0.24.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.24.0 → datachain-0.24.2}/.pre-commit-config.yaml +2 -2
- {datachain-0.24.0 → datachain-0.24.2}/PKG-INFO +1 -1
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/warehouse.py +6 -4
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/delta.py +82 -25
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/datachain.py +2 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/dataset.py +11 -10
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_datachain.py +69 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_delta.py +20 -5
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_retry.py +164 -50
- {datachain-0.24.0 → datachain-0.24.2}/.cruft.json +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.gitattributes +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/codecov.yaml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/dependabot.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/workflows/release.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/workflows/tests.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/LICENSE +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/README.rst +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/auth/login.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/auth/logout.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/auth/team.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/auth/token.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/job/cancel.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/job/clusters.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/job/logs.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/job/ls.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/commands/job/run.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/contributing.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/examples.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/db_migrations.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/delta.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/env.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/namespaces.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/processing.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/remotes.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/guide/retry.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/overrides/main.html +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/quick-start.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/file.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/pose.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/segment.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/datachain.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/func.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/index.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/toolkit.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/torch.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/references/udf.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/docs/tutorials.md +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/mkdocs.yml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/noxfile.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/pyproject.toml +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/setup.cfg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/__main__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/asyn.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cache.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/cli/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/gcs.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/local.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/config.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/dataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/error.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/fs/reference.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/fs/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/func.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/func/window.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/job.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/projects.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/video.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/model/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/namespace.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/node.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/progress.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/project.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/py.typed +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/batch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/params.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/query/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/script_meta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/semver.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain/utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/data.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/examples/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/data/lena.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/functions/test_string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_batching.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_client.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_data_storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_ls.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_pull.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_read_database.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_video.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/func/test_warehouse.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_atomicity.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_import_time.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/test_telemetry.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/model/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_client.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_config.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_func.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_listing.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_query.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_semver.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_session.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.24.0 → datachain-0.24.2}/tests/utils.py +0 -0
|
@@ -24,7 +24,7 @@ repos:
|
|
|
24
24
|
- id: trailing-whitespace
|
|
25
25
|
exclude: '^LICENSES/'
|
|
26
26
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
27
|
-
rev: 'v0.12.
|
|
27
|
+
rev: 'v0.12.1'
|
|
28
28
|
hooks:
|
|
29
29
|
- id: ruff
|
|
30
30
|
args: [--fix, --exit-non-zero-on-fix]
|
|
@@ -35,7 +35,7 @@ repos:
|
|
|
35
35
|
- id: codespell
|
|
36
36
|
additional_dependencies: ["tomli"]
|
|
37
37
|
- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
|
|
38
|
-
rev: v2.
|
|
38
|
+
rev: v2.15.0
|
|
39
39
|
hooks:
|
|
40
40
|
- id: pretty-format-toml
|
|
41
41
|
args: [--autofix, --no-sort]
|
|
@@ -218,7 +218,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
218
218
|
limit = query._limit
|
|
219
219
|
paginated_query = query.limit(page_size)
|
|
220
220
|
|
|
221
|
-
offset = 0
|
|
221
|
+
offset = query._offset or 0
|
|
222
222
|
num_yielded = 0
|
|
223
223
|
|
|
224
224
|
# Ensure we're using a thread-local connection
|
|
@@ -234,13 +234,13 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
234
234
|
# Cursor results are not thread-safe, so we convert them to a list
|
|
235
235
|
results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
|
|
236
236
|
|
|
237
|
-
processed =
|
|
237
|
+
processed = 0
|
|
238
238
|
for row in results:
|
|
239
|
-
processed
|
|
239
|
+
processed += 1
|
|
240
240
|
yield row
|
|
241
241
|
num_yielded += 1
|
|
242
242
|
|
|
243
|
-
if
|
|
243
|
+
if processed < page_size:
|
|
244
244
|
break # no more results
|
|
245
245
|
offset += page_size
|
|
246
246
|
|
|
@@ -343,6 +343,8 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
343
343
|
if (id_col := get_query_id_column(query)) is None:
|
|
344
344
|
raise RuntimeError("sys__id column not found in query")
|
|
345
345
|
|
|
346
|
+
query = query._clone().offset(None).limit(None).order_by(None)
|
|
347
|
+
|
|
346
348
|
if is_batched:
|
|
347
349
|
for batch in ids:
|
|
348
350
|
yield list(self.dataset_rows_select(query.where(id_col.in_(batch))))
|
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
|
|
6
6
|
import datachain
|
|
7
7
|
from datachain.dataset import DatasetDependency
|
|
8
8
|
from datachain.error import DatasetNotFoundError
|
|
9
|
+
from datachain.project import Project
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from typing_extensions import Concatenate, ParamSpec
|
|
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
|
|
|
50
51
|
|
|
51
52
|
def _get_delta_chain(
|
|
52
53
|
source_ds_name: str,
|
|
54
|
+
source_ds_project: Project,
|
|
53
55
|
source_ds_version: str,
|
|
54
56
|
source_ds_latest_version: str,
|
|
55
57
|
on: Union[str, Sequence[str]],
|
|
56
58
|
compare: Optional[Union[str, Sequence[str]]] = None,
|
|
57
59
|
) -> "DataChain":
|
|
58
60
|
"""Get delta chain for processing changes between versions."""
|
|
59
|
-
source_dc = datachain.read_dataset(
|
|
61
|
+
source_dc = datachain.read_dataset(
|
|
62
|
+
source_ds_name,
|
|
63
|
+
namespace=source_ds_project.namespace.name,
|
|
64
|
+
project=source_ds_project.name,
|
|
65
|
+
version=source_ds_version,
|
|
66
|
+
)
|
|
60
67
|
source_dc_latest = datachain.read_dataset(
|
|
61
|
-
source_ds_name,
|
|
68
|
+
source_ds_name,
|
|
69
|
+
namespace=source_ds_project.namespace.name,
|
|
70
|
+
project=source_ds_project.name,
|
|
71
|
+
version=source_ds_latest_version,
|
|
62
72
|
)
|
|
63
73
|
|
|
64
74
|
# Calculate diff between source versions
|
|
@@ -67,12 +77,15 @@ def _get_delta_chain(
|
|
|
67
77
|
|
|
68
78
|
def _get_retry_chain(
|
|
69
79
|
name: str,
|
|
80
|
+
project: Project,
|
|
70
81
|
latest_version: str,
|
|
71
82
|
source_ds_name: str,
|
|
72
|
-
|
|
83
|
+
source_ds_project: Project,
|
|
84
|
+
source_ds_version: str,
|
|
73
85
|
on: Union[str, Sequence[str]],
|
|
74
86
|
right_on: Optional[Union[str, Sequence[str]]],
|
|
75
87
|
delta_retry: Optional[Union[bool, str]],
|
|
88
|
+
diff_chain: "DataChain",
|
|
76
89
|
) -> Optional["DataChain"]:
|
|
77
90
|
"""Get retry chain for processing error records and missing records."""
|
|
78
91
|
# Import here to avoid circular import
|
|
@@ -81,35 +94,49 @@ def _get_retry_chain(
|
|
|
81
94
|
retry_chain = None
|
|
82
95
|
|
|
83
96
|
# Read the latest version of the result dataset for retry logic
|
|
84
|
-
result_dataset = datachain.read_dataset(
|
|
85
|
-
|
|
86
|
-
|
|
97
|
+
result_dataset = datachain.read_dataset(
|
|
98
|
+
name,
|
|
99
|
+
namespace=project.namespace.name,
|
|
100
|
+
project=project.name,
|
|
101
|
+
version=latest_version,
|
|
102
|
+
)
|
|
103
|
+
source_dc = datachain.read_dataset(
|
|
104
|
+
source_ds_name,
|
|
105
|
+
namespace=source_ds_project.namespace.name,
|
|
106
|
+
project=source_ds_project.name,
|
|
107
|
+
version=source_ds_version,
|
|
87
108
|
)
|
|
88
109
|
|
|
89
110
|
# Handle error records if delta_retry is a string (column name)
|
|
90
111
|
if isinstance(delta_retry, str):
|
|
91
112
|
error_records = result_dataset.filter(C(delta_retry) != "")
|
|
92
|
-
error_source_records =
|
|
113
|
+
error_source_records = source_dc.merge(
|
|
93
114
|
error_records, on=on, right_on=right_on, inner=True
|
|
94
|
-
).select(*list(
|
|
115
|
+
).select(*list(source_dc.signals_schema.values))
|
|
95
116
|
retry_chain = error_source_records
|
|
96
117
|
|
|
97
118
|
# Handle missing records if delta_retry is True
|
|
98
119
|
elif delta_retry is True:
|
|
99
|
-
missing_records =
|
|
100
|
-
result_dataset, on=on, right_on=right_on
|
|
101
|
-
)
|
|
120
|
+
missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
|
|
102
121
|
retry_chain = missing_records
|
|
103
122
|
|
|
104
|
-
|
|
123
|
+
# Subtract also diff chain since some items might be picked
|
|
124
|
+
# up by `delta=True` itself (e.g. records got modified AND are missing in the
|
|
125
|
+
# result dataset at the moment)
|
|
126
|
+
return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
|
|
105
127
|
|
|
106
128
|
|
|
107
129
|
def _get_source_info(
|
|
108
130
|
name: str,
|
|
131
|
+
project: Project,
|
|
109
132
|
latest_version: str,
|
|
110
133
|
catalog,
|
|
111
134
|
) -> tuple[
|
|
112
|
-
Optional[str],
|
|
135
|
+
Optional[str],
|
|
136
|
+
Optional[Project],
|
|
137
|
+
Optional[str],
|
|
138
|
+
Optional[str],
|
|
139
|
+
Optional[list[DatasetDependency]],
|
|
113
140
|
]:
|
|
114
141
|
"""Get source dataset information and dependencies.
|
|
115
142
|
|
|
@@ -118,23 +145,34 @@ def _get_source_info(
|
|
|
118
145
|
Returns (None, None, None, None, None) if source dataset was removed.
|
|
119
146
|
"""
|
|
120
147
|
dependencies = catalog.get_dataset_dependencies(
|
|
121
|
-
name, latest_version, indirect=False
|
|
148
|
+
name, latest_version, project=project, indirect=False
|
|
122
149
|
)
|
|
123
150
|
|
|
124
151
|
dep = dependencies[0]
|
|
125
152
|
if not dep:
|
|
126
153
|
# Starting dataset was removed; fall back to normal dataset creation
|
|
127
|
-
return None, None, None, None
|
|
154
|
+
return None, None, None, None, None
|
|
128
155
|
|
|
156
|
+
source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
|
|
129
157
|
source_ds_name = dep.name
|
|
130
158
|
source_ds_version = dep.version
|
|
131
|
-
source_ds_latest_version = catalog.get_dataset(
|
|
132
|
-
|
|
133
|
-
|
|
159
|
+
source_ds_latest_version = catalog.get_dataset(
|
|
160
|
+
source_ds_name, project=source_ds_project
|
|
161
|
+
).latest_version
|
|
162
|
+
|
|
163
|
+
return (
|
|
164
|
+
source_ds_name,
|
|
165
|
+
source_ds_project,
|
|
166
|
+
source_ds_version,
|
|
167
|
+
source_ds_latest_version,
|
|
168
|
+
dependencies,
|
|
169
|
+
)
|
|
134
170
|
|
|
135
171
|
|
|
136
172
|
def delta_retry_update(
|
|
137
173
|
dc: "DataChain",
|
|
174
|
+
namespace_name: str,
|
|
175
|
+
project_name: str,
|
|
138
176
|
name: str,
|
|
139
177
|
on: Union[str, Sequence[str]],
|
|
140
178
|
right_on: Optional[Union[str, Sequence[str]]] = None,
|
|
@@ -173,11 +211,12 @@ def delta_retry_update(
|
|
|
173
211
|
"""
|
|
174
212
|
|
|
175
213
|
catalog = dc.session.catalog
|
|
214
|
+
project = catalog.metastore.get_project(project_name, namespace_name)
|
|
176
215
|
dc._query.apply_listing_pre_step()
|
|
177
216
|
|
|
178
217
|
# Check if dataset exists
|
|
179
218
|
try:
|
|
180
|
-
dataset = catalog.get_dataset(name)
|
|
219
|
+
dataset = catalog.get_dataset(name, project=project)
|
|
181
220
|
latest_version = dataset.latest_version
|
|
182
221
|
except DatasetNotFoundError:
|
|
183
222
|
# First creation of result dataset
|
|
@@ -189,19 +228,29 @@ def delta_retry_update(
|
|
|
189
228
|
retry_chain = None
|
|
190
229
|
processing_chain = None
|
|
191
230
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
231
|
+
(
|
|
232
|
+
source_ds_name,
|
|
233
|
+
source_ds_project,
|
|
234
|
+
source_ds_version,
|
|
235
|
+
source_ds_latest_version,
|
|
236
|
+
dependencies,
|
|
237
|
+
) = _get_source_info(name, project, latest_version, catalog)
|
|
195
238
|
|
|
196
239
|
# If source_ds_name is None, starting dataset was removed
|
|
197
240
|
if source_ds_name is None:
|
|
198
241
|
return None, None, True
|
|
199
242
|
|
|
243
|
+
assert source_ds_project
|
|
200
244
|
assert source_ds_version
|
|
201
245
|
assert source_ds_latest_version
|
|
202
246
|
|
|
203
247
|
diff_chain = _get_delta_chain(
|
|
204
|
-
source_ds_name,
|
|
248
|
+
source_ds_name,
|
|
249
|
+
source_ds_project,
|
|
250
|
+
source_ds_version,
|
|
251
|
+
source_ds_latest_version,
|
|
252
|
+
on,
|
|
253
|
+
compare,
|
|
205
254
|
)
|
|
206
255
|
|
|
207
256
|
# Filter out removed dep
|
|
@@ -215,12 +264,15 @@ def delta_retry_update(
|
|
|
215
264
|
if delta_retry:
|
|
216
265
|
retry_chain = _get_retry_chain(
|
|
217
266
|
name,
|
|
267
|
+
project,
|
|
218
268
|
latest_version,
|
|
219
269
|
source_ds_name,
|
|
220
|
-
|
|
270
|
+
source_ds_project,
|
|
271
|
+
source_ds_version,
|
|
221
272
|
on,
|
|
222
273
|
right_on,
|
|
223
274
|
delta_retry,
|
|
275
|
+
diff_chain,
|
|
224
276
|
)
|
|
225
277
|
|
|
226
278
|
# Combine delta and retry chains
|
|
@@ -236,7 +288,12 @@ def delta_retry_update(
|
|
|
236
288
|
if processing_chain is None or (processing_chain and processing_chain.empty):
|
|
237
289
|
return None, None, False
|
|
238
290
|
|
|
239
|
-
latest_dataset = datachain.read_dataset(
|
|
291
|
+
latest_dataset = datachain.read_dataset(
|
|
292
|
+
name,
|
|
293
|
+
namespace=project.namespace.name,
|
|
294
|
+
project=project.name,
|
|
295
|
+
version=latest_version,
|
|
296
|
+
)
|
|
240
297
|
compared_chain = latest_dataset.diff(
|
|
241
298
|
processing_chain,
|
|
242
299
|
on=right_on or on,
|
|
@@ -11,6 +11,7 @@ from collections.abc import Generator, Iterable, Iterator, Sequence
|
|
|
11
11
|
from copy import copy
|
|
12
12
|
from functools import wraps
|
|
13
13
|
from secrets import token_hex
|
|
14
|
+
from types import GeneratorType
|
|
14
15
|
from typing import (
|
|
15
16
|
TYPE_CHECKING,
|
|
16
17
|
Any,
|
|
@@ -557,8 +558,8 @@ class UDFStep(Step, ABC):
|
|
|
557
558
|
"""
|
|
558
559
|
assert self.partition_by is not None
|
|
559
560
|
|
|
560
|
-
if isinstance(self.partition_by,
|
|
561
|
-
list_partition_by = self.partition_by
|
|
561
|
+
if isinstance(self.partition_by, (list, tuple, GeneratorType)):
|
|
562
|
+
list_partition_by = list(self.partition_by)
|
|
562
563
|
else:
|
|
563
564
|
list_partition_by = [self.partition_by]
|
|
564
565
|
|
|
@@ -575,7 +576,10 @@ class UDFStep(Step, ABC):
|
|
|
575
576
|
f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
|
|
576
577
|
]
|
|
577
578
|
self.catalog.warehouse.db.execute(
|
|
578
|
-
tbl.insert().from_select(
|
|
579
|
+
tbl.insert().from_select(
|
|
580
|
+
cols,
|
|
581
|
+
query.offset(None).limit(None).with_only_columns(*cols),
|
|
582
|
+
)
|
|
579
583
|
)
|
|
580
584
|
|
|
581
585
|
return tbl
|
|
@@ -601,13 +605,10 @@ class UDFStep(Step, ABC):
|
|
|
601
605
|
if self.partition_by is not None:
|
|
602
606
|
partition_tbl = self.create_partitions_table(query)
|
|
603
607
|
temp_tables.append(partition_tbl.name)
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
.outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
|
|
609
|
-
.add_columns(*partition_columns())
|
|
610
|
-
)
|
|
608
|
+
query = query.outerjoin(
|
|
609
|
+
partition_tbl,
|
|
610
|
+
partition_tbl.c.sys__id == query.selected_columns.sys__id,
|
|
611
|
+
).add_columns(*partition_columns())
|
|
611
612
|
|
|
612
613
|
query, tables = self.process_input_query(query)
|
|
613
614
|
temp_tables.extend(t.name for t in tables)
|
|
@@ -2324,3 +2324,72 @@ def test_agg(catalog_tmpfile, parallel):
|
|
|
2324
2324
|
],
|
|
2325
2325
|
"parent",
|
|
2326
2326
|
)
|
|
2327
|
+
|
|
2328
|
+
|
|
2329
|
+
@pytest.mark.parametrize("parallel", [1, 2])
|
|
2330
|
+
@pytest.mark.parametrize(
|
|
2331
|
+
"offset,limit,files",
|
|
2332
|
+
[
|
|
2333
|
+
(None, 1000, [f"file{i:02d}" for i in range(100)]),
|
|
2334
|
+
(None, 3, ["file00", "file01", "file02"]),
|
|
2335
|
+
(0, 3, ["file00", "file01", "file02"]),
|
|
2336
|
+
(97, 1000, ["file97", "file98", "file99"]),
|
|
2337
|
+
(1, 2, ["file01", "file02"]),
|
|
2338
|
+
(50, 3, ["file50", "file51", "file52"]),
|
|
2339
|
+
(None, 0, []),
|
|
2340
|
+
(50, 0, []),
|
|
2341
|
+
],
|
|
2342
|
+
)
|
|
2343
|
+
def test_agg_offset_limit(catalog_tmpfile, parallel, offset, limit, files):
|
|
2344
|
+
def process(filename: list[str]) -> Iterator[tuple[str, int]]:
|
|
2345
|
+
yield filename[0], len(filename)
|
|
2346
|
+
|
|
2347
|
+
ds = dc.read_values(
|
|
2348
|
+
filename=[f"file{i:02d}" for i in range(100)],
|
|
2349
|
+
value=list(range(100)),
|
|
2350
|
+
session=catalog_tmpfile.session,
|
|
2351
|
+
)
|
|
2352
|
+
if offset is not None:
|
|
2353
|
+
ds = ds.offset(offset)
|
|
2354
|
+
if limit is not None:
|
|
2355
|
+
ds = ds.limit(limit)
|
|
2356
|
+
ds = (
|
|
2357
|
+
ds.settings(parallel=parallel)
|
|
2358
|
+
.agg(
|
|
2359
|
+
process,
|
|
2360
|
+
output={"filename": str, "count": int},
|
|
2361
|
+
partition_by="filename",
|
|
2362
|
+
)
|
|
2363
|
+
.save("my-ds")
|
|
2364
|
+
)
|
|
2365
|
+
|
|
2366
|
+
records = list(ds.to_records())
|
|
2367
|
+
assert len(records) == len(files)
|
|
2368
|
+
assert all(row["count"] == 1 for row in records)
|
|
2369
|
+
assert sorted(row["filename"] for row in records) == sorted(files)
|
|
2370
|
+
|
|
2371
|
+
|
|
2372
|
+
@pytest.mark.parametrize("parallel", [1, 2])
|
|
2373
|
+
@pytest.mark.parametrize("sample", [0, 1, 3, 10, 50, 100])
|
|
2374
|
+
def test_agg_sample(catalog_tmpfile, parallel, sample):
|
|
2375
|
+
def process(filename: list[str]) -> Iterator[tuple[str, int]]:
|
|
2376
|
+
yield filename[0], len(filename)
|
|
2377
|
+
|
|
2378
|
+
ds = (
|
|
2379
|
+
dc.read_values(
|
|
2380
|
+
filename=[f"file{i:02d}" for i in range(100)],
|
|
2381
|
+
session=catalog_tmpfile.session,
|
|
2382
|
+
)
|
|
2383
|
+
.sample(sample)
|
|
2384
|
+
.settings(parallel=parallel)
|
|
2385
|
+
.agg(
|
|
2386
|
+
process,
|
|
2387
|
+
output={"filename": str, "count": int},
|
|
2388
|
+
partition_by="filename",
|
|
2389
|
+
)
|
|
2390
|
+
.save("my-ds")
|
|
2391
|
+
)
|
|
2392
|
+
|
|
2393
|
+
records = list(ds.to_records())
|
|
2394
|
+
assert len(records) == sample
|
|
2395
|
+
assert all(row["count"] == 1 for row in records)
|
|
@@ -14,15 +14,26 @@ from datachain.lib.file import File, ImageFile
|
|
|
14
14
|
def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]:
|
|
15
15
|
return sorted(
|
|
16
16
|
[
|
|
17
|
-
(d.name, d.version)
|
|
17
|
+
(f"{d.namespace}.{d.project}.{d.name}", d.version)
|
|
18
18
|
for d in catalog.get_dataset_dependencies(name, version, indirect=False)
|
|
19
19
|
]
|
|
20
20
|
)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
@pytest.mark.parametrize("project", ("global.dev", ""))
|
|
24
|
+
def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path, project):
|
|
24
25
|
catalog = test_session.catalog
|
|
25
|
-
|
|
26
|
+
default_namespace_name = catalog.metastore.default_namespace_name
|
|
27
|
+
default_project_name = catalog.metastore.default_project_name
|
|
28
|
+
|
|
29
|
+
if project:
|
|
30
|
+
starting_ds_name = f"{project}.starting_ds"
|
|
31
|
+
dependency_ds_name = starting_ds_name
|
|
32
|
+
else:
|
|
33
|
+
starting_ds_name = "starting_ds"
|
|
34
|
+
dependency_ds_name = (
|
|
35
|
+
f"{default_namespace_name}.{default_project_name}.{starting_ds_name}"
|
|
36
|
+
)
|
|
26
37
|
ds_name = "delta_ds"
|
|
27
38
|
|
|
28
39
|
images = [
|
|
@@ -55,12 +66,16 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path):
|
|
|
55
66
|
create_image_dataset(starting_ds_name, images[:2])
|
|
56
67
|
# first version of delta dataset
|
|
57
68
|
create_delta_dataset(ds_name)
|
|
58
|
-
assert _get_dependencies(catalog, ds_name, "1.0.0") == [
|
|
69
|
+
assert _get_dependencies(catalog, ds_name, "1.0.0") == [
|
|
70
|
+
(dependency_ds_name, "1.0.0")
|
|
71
|
+
]
|
|
59
72
|
# second version of starting dataset
|
|
60
73
|
create_image_dataset(starting_ds_name, images[2:])
|
|
61
74
|
# second version of delta dataset
|
|
62
75
|
create_delta_dataset(ds_name)
|
|
63
|
-
assert _get_dependencies(catalog, ds_name, "1.0.1") == [
|
|
76
|
+
assert _get_dependencies(catalog, ds_name, "1.0.1") == [
|
|
77
|
+
(dependency_ds_name, "1.0.1")
|
|
78
|
+
]
|
|
64
79
|
|
|
65
80
|
assert (dc.read_dataset(ds_name, version="1.0.0").order_by("file.path")).to_values(
|
|
66
81
|
"file.path"
|