datachain 0.35.2__tar.gz → 0.36.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.35.2 → datachain-0.36.1}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/.github/workflows/release.yml +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/.github/workflows/tests.yml +4 -4
- {datachain-0.35.2 → datachain-0.36.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/PKG-INFO +3 -2
- {datachain-0.35.2 → datachain-0.36.1}/pyproject.toml +8 -3
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/catalog/catalog.py +45 -20
- datachain-0.36.1/src/datachain/catalog/dependency.py +164 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/metastore.py +80 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/schema.py +1 -2
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/sqlite.py +2 -9
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/warehouse.py +50 -33
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/diff/__init__.py +2 -6
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/audio.py +54 -53
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/datachain.py +13 -14
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/dataset.py +21 -26
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/dispatch.py +64 -42
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/queue.py +2 -1
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/PKG-INFO +3 -2
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/requires.txt +2 -1
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_datachain.py +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_datachain_merge.py +7 -18
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_retry.py +0 -1
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_udf.py +116 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_audio.py +31 -37
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_datachain.py +15 -13
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_datachain_hash.py +1 -1
- {datachain-0.35.2 → datachain-0.36.1}/.cruft.json +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.gitattributes +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/codecov.yaml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/dependabot.yml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/.gitignore +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/LICENSE +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/README.rst +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/api_hooks.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/assets/webhook_dialog.png +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/assets/webhook_list.png +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/auth/login.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/auth/logout.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/auth/team.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/auth/token.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/index.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/job/cancel.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/job/clusters.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/job/logs.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/job/ls.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/commands/job/run.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/contributing.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/examples.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/db_migrations.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/delta.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/env.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/index.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/namespaces.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/processing.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/remotes.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/guide/retry.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/index.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/overrides/main.html +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/quick-start.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/file.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/index.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/pose.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/segment.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/datachain.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/func.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/aggregate.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/array.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/conditional.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/numeric.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/path.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/random.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/string.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/functions/window.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/index.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/toolkit.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/torch.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/references/udf.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/studio/api/.gitkeep +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/studio/webhooks.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/templates/main.dot +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/templates/operation.dot +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/templates/responses.def +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/docs/tutorials.md +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/nested_datamodel.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/audio-to-text.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/mkdocs.yml +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/noxfile.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/setup.cfg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/__main__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/asyn.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cache.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/checkpoint.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/cli/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/gcs.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/http.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/local.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/config.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/dataset.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/delta.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/error.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/fs/reference.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/fs/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/array.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/base.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/func.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/path.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/random.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/string.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/func/window.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/hash_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/job.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/datasets.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/storage_pattern.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/file.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/listing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/projects.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/udf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/video.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/listing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/model/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/namespace.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/node.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/plugins.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/progress.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/project.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/py.typed +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/batch.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/params.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/session.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/query/udf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/script_meta.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/semver.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/postgresql_dialect.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/postgresql_types.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/studio.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain/utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/conftest.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/data.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/examples/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/data/lena.jpg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_array.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_path.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_random.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/functions/test_string.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_audio.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_catalog.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_client.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_data_storage.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_delta.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_file.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_hf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_image.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_listing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_ls.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_metastore.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_mutate.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_pull.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_query.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_read_database.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_read_dataset_remote.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_session.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_storage_pattern.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_studio_datetime_parsing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_to_database.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_union.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_video.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/func/test_warehouse.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_atomicity.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_import_time.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/test_telemetry.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_checkpoints.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_partition_by.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_settings.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_storage_pattern.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/model/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_batching.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_cli_datasets.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_client.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_client_http.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_config.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_func.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_hash_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_listing.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_query.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_query_steps_hash.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_semver.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_session.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.35.2 → datachain-0.36.1}/tests/utils.py +0 -0
|
@@ -29,7 +29,7 @@ jobs:
|
|
|
29
29
|
python-version: '3.10'
|
|
30
30
|
|
|
31
31
|
- name: Setup uv
|
|
32
|
-
uses: astral-sh/setup-uv@
|
|
32
|
+
uses: astral-sh/setup-uv@v7
|
|
33
33
|
with:
|
|
34
34
|
enable-cache: true
|
|
35
35
|
cache-suffix: lint
|
|
@@ -80,7 +80,7 @@ jobs:
|
|
|
80
80
|
|
|
81
81
|
- name: Setup PostgreSQL
|
|
82
82
|
if: runner.os != 'Windows'
|
|
83
|
-
uses: ikalnytskyi/action-setup-postgres@
|
|
83
|
+
uses: ikalnytskyi/action-setup-postgres@c4dda34aae1c821e3a771b68b73b13af3198a7ee # v8
|
|
84
84
|
with:
|
|
85
85
|
username: test
|
|
86
86
|
password: test
|
|
@@ -102,7 +102,7 @@ jobs:
|
|
|
102
102
|
python-version: ${{ matrix.pyv }}
|
|
103
103
|
|
|
104
104
|
- name: Setup uv
|
|
105
|
-
uses: astral-sh/setup-uv@
|
|
105
|
+
uses: astral-sh/setup-uv@v7
|
|
106
106
|
with:
|
|
107
107
|
enable-cache: true
|
|
108
108
|
cache-suffix: tests-${{ matrix.pyv }}
|
|
@@ -188,7 +188,7 @@ jobs:
|
|
|
188
188
|
python-version: ${{ matrix.pyv }}
|
|
189
189
|
|
|
190
190
|
- name: Setup uv
|
|
191
|
-
uses: astral-sh/setup-uv@
|
|
191
|
+
uses: astral-sh/setup-uv@v7
|
|
192
192
|
with:
|
|
193
193
|
enable-cache: true
|
|
194
194
|
cache-suffix: examples-${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.35.2
|
|
3
|
+
Version: 0.36.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -64,7 +64,6 @@ Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
|
64
64
|
Requires-Dist: torchvision; extra == "torch"
|
|
65
65
|
Requires-Dist: transformers>=4.36.0; extra == "torch"
|
|
66
66
|
Provides-Extra: audio
|
|
67
|
-
Requires-Dist: torchaudio; extra == "audio"
|
|
68
67
|
Requires-Dist: soundfile; extra == "audio"
|
|
69
68
|
Provides-Extra: remote
|
|
70
69
|
Requires-Dist: lz4; extra == "remote"
|
|
@@ -76,6 +75,7 @@ Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
|
76
75
|
Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
|
|
77
76
|
Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
|
|
78
77
|
Requires-Dist: fsspec>=2024.12.0; extra == "hf"
|
|
78
|
+
Requires-Dist: torch<2.9.0; extra == "hf"
|
|
79
79
|
Provides-Extra: video
|
|
80
80
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
81
81
|
Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
|
|
@@ -117,6 +117,7 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
|
117
117
|
Requires-Dist: ultralytics; extra == "examples"
|
|
118
118
|
Requires-Dist: open_clip_torch; extra == "examples"
|
|
119
119
|
Requires-Dist: openai; extra == "examples"
|
|
120
|
+
Requires-Dist: torchaudio<2.9.0; extra == "examples"
|
|
120
121
|
Dynamic: license-file
|
|
121
122
|
|
|
122
123
|
================
|
|
@@ -73,7 +73,6 @@ torch = [
|
|
|
73
73
|
"transformers>=4.36.0"
|
|
74
74
|
]
|
|
75
75
|
audio = [
|
|
76
|
-
"torchaudio",
|
|
77
76
|
"soundfile"
|
|
78
77
|
]
|
|
79
78
|
remote = [
|
|
@@ -88,7 +87,11 @@ hf = [
|
|
|
88
87
|
"datasets[vision]>=4.0.0",
|
|
89
88
|
# https://github.com/pytorch/torchcodec/issues/640
|
|
90
89
|
"datasets[audio]>=4.0.0 ; (sys_platform == 'linux' or sys_platform == 'darwin')",
|
|
91
|
-
"fsspec>=2024.12.0"
|
|
90
|
+
"fsspec>=2024.12.0",
|
|
91
|
+
# Until datasets solve the issue, run test_hf_audio test to see if this can be removed
|
|
92
|
+
# https://github.com/meta-pytorch/torchcodec/issues/912
|
|
93
|
+
# https://github.com/huggingface/transformers/pull/41610
|
|
94
|
+
"torch<2.9.0"
|
|
92
95
|
]
|
|
93
96
|
video = [
|
|
94
97
|
"ffmpeg-python",
|
|
@@ -134,7 +137,9 @@ examples = [
|
|
|
134
137
|
"huggingface_hub[hf_transfer]",
|
|
135
138
|
"ultralytics",
|
|
136
139
|
"open_clip_torch",
|
|
137
|
-
"openai"
|
|
140
|
+
"openai",
|
|
141
|
+
# Transformers still require it
|
|
142
|
+
"torchaudio<2.9.0"
|
|
138
143
|
]
|
|
139
144
|
|
|
140
145
|
[project.urls]
|
|
@@ -54,6 +54,7 @@ from datachain.sql.types import DateTime, SQLType
|
|
|
54
54
|
from datachain.utils import DataChainDir
|
|
55
55
|
|
|
56
56
|
from .datasource import DataSource
|
|
57
|
+
from .dependency import build_dependency_hierarchy, populate_nested_dependencies
|
|
57
58
|
|
|
58
59
|
if TYPE_CHECKING:
|
|
59
60
|
from datachain.data_storage import AbstractMetastore, AbstractWarehouse
|
|
@@ -1203,6 +1204,38 @@ class Catalog:
|
|
|
1203
1204
|
assert isinstance(dataset_info, dict)
|
|
1204
1205
|
return DatasetRecord.from_dict(dataset_info)
|
|
1205
1206
|
|
|
1207
|
+
def get_dataset_dependencies_by_ids(
|
|
1208
|
+
self,
|
|
1209
|
+
dataset_id: int,
|
|
1210
|
+
version_id: int,
|
|
1211
|
+
indirect: bool = True,
|
|
1212
|
+
) -> list[DatasetDependency | None]:
|
|
1213
|
+
dependency_nodes = self.metastore.get_dataset_dependency_nodes(
|
|
1214
|
+
dataset_id=dataset_id,
|
|
1215
|
+
version_id=version_id,
|
|
1216
|
+
)
|
|
1217
|
+
|
|
1218
|
+
if not dependency_nodes:
|
|
1219
|
+
return []
|
|
1220
|
+
|
|
1221
|
+
dependency_map, children_map = build_dependency_hierarchy(dependency_nodes)
|
|
1222
|
+
|
|
1223
|
+
root_key = (dataset_id, version_id)
|
|
1224
|
+
if root_key not in children_map:
|
|
1225
|
+
return []
|
|
1226
|
+
|
|
1227
|
+
root_dependency_ids = children_map[root_key]
|
|
1228
|
+
root_dependencies = [dependency_map[dep_id] for dep_id in root_dependency_ids]
|
|
1229
|
+
|
|
1230
|
+
if indirect:
|
|
1231
|
+
for dependency in root_dependencies:
|
|
1232
|
+
if dependency is not None:
|
|
1233
|
+
populate_nested_dependencies(
|
|
1234
|
+
dependency, dependency_nodes, dependency_map, children_map
|
|
1235
|
+
)
|
|
1236
|
+
|
|
1237
|
+
return root_dependencies
|
|
1238
|
+
|
|
1206
1239
|
def get_dataset_dependencies(
|
|
1207
1240
|
self,
|
|
1208
1241
|
name: str,
|
|
@@ -1216,29 +1249,21 @@ class Catalog:
|
|
|
1216
1249
|
namespace_name=namespace_name,
|
|
1217
1250
|
project_name=project_name,
|
|
1218
1251
|
)
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
)
|
|
1252
|
+
dataset_version = dataset.get_version(version)
|
|
1253
|
+
dataset_id = dataset.id
|
|
1254
|
+
dataset_version_id = dataset_version.id
|
|
1223
1255
|
|
|
1224
1256
|
if not indirect:
|
|
1225
|
-
return
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
# dependency has been removed
|
|
1230
|
-
continue
|
|
1231
|
-
if d.is_dataset:
|
|
1232
|
-
# only datasets can have dependencies
|
|
1233
|
-
d.dependencies = self.get_dataset_dependencies(
|
|
1234
|
-
d.name,
|
|
1235
|
-
d.version,
|
|
1236
|
-
namespace_name=d.namespace,
|
|
1237
|
-
project_name=d.project,
|
|
1238
|
-
indirect=indirect,
|
|
1239
|
-
)
|
|
1257
|
+
return self.metastore.get_direct_dataset_dependencies(
|
|
1258
|
+
dataset,
|
|
1259
|
+
version,
|
|
1260
|
+
)
|
|
1240
1261
|
|
|
1241
|
-
return
|
|
1262
|
+
return self.get_dataset_dependencies_by_ids(
|
|
1263
|
+
dataset_id,
|
|
1264
|
+
dataset_version_id,
|
|
1265
|
+
indirect,
|
|
1266
|
+
)
|
|
1242
1267
|
|
|
1243
1268
|
def ls_datasets(
|
|
1244
1269
|
self,
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import builtins
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import TypeVar
|
|
5
|
+
|
|
6
|
+
from datachain.dataset import DatasetDependency
|
|
7
|
+
|
|
8
|
+
DDN = TypeVar("DDN", bound="DatasetDependencyNode")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class DatasetDependencyNode:
|
|
13
|
+
namespace: str
|
|
14
|
+
project: str
|
|
15
|
+
id: int
|
|
16
|
+
dataset_id: int | None
|
|
17
|
+
dataset_version_id: int | None
|
|
18
|
+
dataset_name: str | None
|
|
19
|
+
dataset_version: str | None
|
|
20
|
+
created_at: datetime
|
|
21
|
+
source_dataset_id: int
|
|
22
|
+
source_dataset_version_id: int | None
|
|
23
|
+
depth: int
|
|
24
|
+
|
|
25
|
+
@classmethod
|
|
26
|
+
def parse(
|
|
27
|
+
cls: builtins.type[DDN],
|
|
28
|
+
namespace: str,
|
|
29
|
+
project: str,
|
|
30
|
+
id: int,
|
|
31
|
+
dataset_id: int | None,
|
|
32
|
+
dataset_version_id: int | None,
|
|
33
|
+
dataset_name: str | None,
|
|
34
|
+
dataset_version: str | None,
|
|
35
|
+
created_at: datetime,
|
|
36
|
+
source_dataset_id: int,
|
|
37
|
+
source_dataset_version_id: int | None,
|
|
38
|
+
depth: int,
|
|
39
|
+
) -> "DatasetDependencyNode | None":
|
|
40
|
+
return cls(
|
|
41
|
+
namespace,
|
|
42
|
+
project,
|
|
43
|
+
id,
|
|
44
|
+
dataset_id,
|
|
45
|
+
dataset_version_id,
|
|
46
|
+
dataset_name,
|
|
47
|
+
dataset_version,
|
|
48
|
+
created_at,
|
|
49
|
+
source_dataset_id,
|
|
50
|
+
source_dataset_version_id,
|
|
51
|
+
depth,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
def to_dependency(self) -> "DatasetDependency | None":
|
|
55
|
+
return DatasetDependency.parse(
|
|
56
|
+
namespace_name=self.namespace,
|
|
57
|
+
project_name=self.project,
|
|
58
|
+
id=self.id,
|
|
59
|
+
dataset_id=self.dataset_id,
|
|
60
|
+
dataset_version_id=self.dataset_version_id,
|
|
61
|
+
dataset_name=self.dataset_name,
|
|
62
|
+
dataset_version=self.dataset_version,
|
|
63
|
+
dataset_version_created_at=self.created_at,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_dependency_hierarchy(
|
|
68
|
+
dependency_nodes: list[DatasetDependencyNode | None],
|
|
69
|
+
) -> tuple[
|
|
70
|
+
dict[int, DatasetDependency | None], dict[tuple[int, int | None], list[int]]
|
|
71
|
+
]:
|
|
72
|
+
"""
|
|
73
|
+
Build dependency hierarchy from dependency nodes.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
dependency_nodes: List of DatasetDependencyNode objects from the database
|
|
77
|
+
|
|
78
|
+
Returns:
|
|
79
|
+
Tuple of (dependency_map, children_map) where:
|
|
80
|
+
- dependency_map: Maps dependency_id -> DatasetDependency
|
|
81
|
+
- children_map: Maps (source_dataset_id, source_version_id) ->
|
|
82
|
+
list of dependency_ids
|
|
83
|
+
"""
|
|
84
|
+
dependency_map: dict[int, DatasetDependency | None] = {}
|
|
85
|
+
children_map: dict[tuple[int, int | None], list[int]] = {}
|
|
86
|
+
|
|
87
|
+
for node in dependency_nodes:
|
|
88
|
+
if node is None:
|
|
89
|
+
continue
|
|
90
|
+
dependency = node.to_dependency()
|
|
91
|
+
parent_key = (node.source_dataset_id, node.source_dataset_version_id)
|
|
92
|
+
|
|
93
|
+
if dependency is not None:
|
|
94
|
+
dependency_map[dependency.id] = dependency
|
|
95
|
+
children_map.setdefault(parent_key, []).append(dependency.id)
|
|
96
|
+
else:
|
|
97
|
+
# Handle case where dependency creation failed (e.g., deleted dependency)
|
|
98
|
+
dependency_map[node.id] = None
|
|
99
|
+
children_map.setdefault(parent_key, []).append(node.id)
|
|
100
|
+
|
|
101
|
+
return dependency_map, children_map
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def populate_nested_dependencies(
|
|
105
|
+
dependency: DatasetDependency,
|
|
106
|
+
dependency_nodes: list[DatasetDependencyNode | None],
|
|
107
|
+
dependency_map: dict[int, DatasetDependency | None],
|
|
108
|
+
children_map: dict[tuple[int, int | None], list[int]],
|
|
109
|
+
) -> None:
|
|
110
|
+
"""
|
|
111
|
+
Recursively populate nested dependencies for a given dependency.
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
dependency: The dependency to populate nested dependencies for
|
|
115
|
+
dependency_nodes: All dependency nodes from the database
|
|
116
|
+
dependency_map: Maps dependency_id -> DatasetDependency
|
|
117
|
+
children_map: Maps (source_dataset_id, source_version_id) ->
|
|
118
|
+
list of dependency_ids
|
|
119
|
+
"""
|
|
120
|
+
# Find the target dataset and version for this dependency
|
|
121
|
+
target_dataset_id, target_version_id = find_target_dataset_version(
|
|
122
|
+
dependency, dependency_nodes
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
if target_dataset_id is None or target_version_id is None:
|
|
126
|
+
return
|
|
127
|
+
|
|
128
|
+
# Get children for this target
|
|
129
|
+
target_key = (target_dataset_id, target_version_id)
|
|
130
|
+
if target_key not in children_map:
|
|
131
|
+
dependency.dependencies = []
|
|
132
|
+
return
|
|
133
|
+
|
|
134
|
+
child_dependency_ids = children_map[target_key]
|
|
135
|
+
child_dependencies = [dependency_map[child_id] for child_id in child_dependency_ids]
|
|
136
|
+
|
|
137
|
+
dependency.dependencies = child_dependencies
|
|
138
|
+
|
|
139
|
+
# Recursively populate children
|
|
140
|
+
for child_dependency in child_dependencies:
|
|
141
|
+
if child_dependency is not None:
|
|
142
|
+
populate_nested_dependencies(
|
|
143
|
+
child_dependency, dependency_nodes, dependency_map, children_map
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def find_target_dataset_version(
|
|
148
|
+
dependency: DatasetDependency,
|
|
149
|
+
dependency_nodes: list[DatasetDependencyNode | None],
|
|
150
|
+
) -> tuple[int | None, int | None]:
|
|
151
|
+
"""
|
|
152
|
+
Find the target dataset ID and version ID for a given dependency.
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
dependency: The dependency to find target for
|
|
156
|
+
dependency_nodes: All dependency nodes from the database
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
Tuple of (target_dataset_id, target_version_id) or (None, None) if not found
|
|
160
|
+
"""
|
|
161
|
+
for node in dependency_nodes:
|
|
162
|
+
if node is not None and node.id == dependency.id:
|
|
163
|
+
return node.dataset_id, node.dataset_version_id
|
|
164
|
+
return None, None
|
|
@@ -22,10 +22,12 @@ from sqlalchemy import (
|
|
|
22
22
|
Text,
|
|
23
23
|
UniqueConstraint,
|
|
24
24
|
desc,
|
|
25
|
+
literal,
|
|
25
26
|
select,
|
|
26
27
|
)
|
|
27
28
|
from sqlalchemy.sql import func as f
|
|
28
29
|
|
|
30
|
+
from datachain.catalog.dependency import DatasetDependencyNode
|
|
29
31
|
from datachain.checkpoint import Checkpoint
|
|
30
32
|
from datachain.data_storage import JobQueryType, JobStatus
|
|
31
33
|
from datachain.data_storage.serializer import Serializable
|
|
@@ -78,6 +80,7 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
78
80
|
dataset_list_class: type[DatasetListRecord] = DatasetListRecord
|
|
79
81
|
dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
|
|
80
82
|
dependency_class: type[DatasetDependency] = DatasetDependency
|
|
83
|
+
dependency_node_class: type[DatasetDependencyNode] = DatasetDependencyNode
|
|
81
84
|
job_class: type[Job] = Job
|
|
82
85
|
checkpoint_class: type[Checkpoint] = Checkpoint
|
|
83
86
|
|
|
@@ -366,6 +369,12 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
366
369
|
) -> list[DatasetDependency | None]:
|
|
367
370
|
"""Gets direct dataset dependencies."""
|
|
368
371
|
|
|
372
|
+
@abstractmethod
|
|
373
|
+
def get_dataset_dependency_nodes(
|
|
374
|
+
self, dataset_id: int, version_id: int
|
|
375
|
+
) -> list[DatasetDependencyNode | None]:
|
|
376
|
+
"""Gets dataset dependency node from database."""
|
|
377
|
+
|
|
369
378
|
@abstractmethod
|
|
370
379
|
def remove_dataset_dependencies(
|
|
371
380
|
self, dataset: DatasetRecord, version: str | None = None
|
|
@@ -1483,6 +1492,77 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1483
1492
|
|
|
1484
1493
|
return [self.dependency_class.parse(*r) for r in self.db.execute(query)]
|
|
1485
1494
|
|
|
1495
|
+
def get_dataset_dependency_nodes(
|
|
1496
|
+
self, dataset_id: int, version_id: int
|
|
1497
|
+
) -> list[DatasetDependencyNode | None]:
|
|
1498
|
+
n = self._namespaces_select().subquery()
|
|
1499
|
+
p = self._projects
|
|
1500
|
+
d = self._datasets_select().subquery()
|
|
1501
|
+
dd = self._datasets_dependencies
|
|
1502
|
+
dv = self._datasets_versions
|
|
1503
|
+
|
|
1504
|
+
# Common dependency fields for CTE
|
|
1505
|
+
dep_fields = [
|
|
1506
|
+
dd.c.id,
|
|
1507
|
+
dd.c.source_dataset_id,
|
|
1508
|
+
dd.c.source_dataset_version_id,
|
|
1509
|
+
dd.c.dataset_id,
|
|
1510
|
+
dd.c.dataset_version_id,
|
|
1511
|
+
]
|
|
1512
|
+
|
|
1513
|
+
# Base case: direct dependencies
|
|
1514
|
+
base_query = select(
|
|
1515
|
+
*dep_fields,
|
|
1516
|
+
literal(0).label("depth"),
|
|
1517
|
+
).where(
|
|
1518
|
+
(dd.c.source_dataset_id == dataset_id)
|
|
1519
|
+
& (dd.c.source_dataset_version_id == version_id)
|
|
1520
|
+
)
|
|
1521
|
+
|
|
1522
|
+
cte = base_query.cte(name="dependency_tree", recursive=True)
|
|
1523
|
+
|
|
1524
|
+
# Recursive case: dependencies of dependencies
|
|
1525
|
+
recursive_query = select(
|
|
1526
|
+
*dep_fields,
|
|
1527
|
+
(cte.c.depth + 1).label("depth"),
|
|
1528
|
+
).select_from(
|
|
1529
|
+
cte.join(
|
|
1530
|
+
dd,
|
|
1531
|
+
(cte.c.dataset_id == dd.c.source_dataset_id)
|
|
1532
|
+
& (cte.c.dataset_version_id == dd.c.source_dataset_version_id),
|
|
1533
|
+
)
|
|
1534
|
+
)
|
|
1535
|
+
|
|
1536
|
+
cte = cte.union(recursive_query)
|
|
1537
|
+
|
|
1538
|
+
# Fetch all with full details
|
|
1539
|
+
final_query = select(
|
|
1540
|
+
n.c.name,
|
|
1541
|
+
p.c.name,
|
|
1542
|
+
cte.c.id,
|
|
1543
|
+
cte.c.dataset_id,
|
|
1544
|
+
cte.c.dataset_version_id,
|
|
1545
|
+
d.c.name,
|
|
1546
|
+
dv.c.version,
|
|
1547
|
+
dv.c.created_at,
|
|
1548
|
+
cte.c.source_dataset_id,
|
|
1549
|
+
cte.c.source_dataset_version_id,
|
|
1550
|
+
cte.c.depth,
|
|
1551
|
+
).select_from(
|
|
1552
|
+
# Use outer joins to handle cases where dependent datasets have been
|
|
1553
|
+
# physically deleted. This allows us to return dependency records with
|
|
1554
|
+
# None values instead of silently omitting them, making broken
|
|
1555
|
+
# dependencies visible to callers.
|
|
1556
|
+
cte.join(d, cte.c.dataset_id == d.c.id, isouter=True)
|
|
1557
|
+
.join(dv, cte.c.dataset_version_id == dv.c.id, isouter=True)
|
|
1558
|
+
.join(p, d.c.project_id == p.c.id, isouter=True)
|
|
1559
|
+
.join(n, p.c.namespace_id == n.c.id, isouter=True)
|
|
1560
|
+
)
|
|
1561
|
+
|
|
1562
|
+
return [
|
|
1563
|
+
self.dependency_node_class.parse(*r) for r in self.db.execute(final_query)
|
|
1564
|
+
]
|
|
1565
|
+
|
|
1486
1566
|
def remove_dataset_dependencies(
|
|
1487
1567
|
self, dataset: DatasetRecord, version: str | None = None
|
|
1488
1568
|
) -> None:
|
|
@@ -11,7 +11,6 @@ from datachain.sql.types import (
|
|
|
11
11
|
JSON,
|
|
12
12
|
Boolean,
|
|
13
13
|
DateTime,
|
|
14
|
-
Int,
|
|
15
14
|
Int64,
|
|
16
15
|
SQLType,
|
|
17
16
|
String,
|
|
@@ -269,7 +268,7 @@ class DataTable:
|
|
|
269
268
|
@classmethod
|
|
270
269
|
def sys_columns(cls):
|
|
271
270
|
return [
|
|
272
|
-
sa.Column("sys__id", Int, primary_key=True),
|
|
271
|
+
sa.Column("sys__id", UInt64, primary_key=True),
|
|
273
272
|
sa.Column(
|
|
274
273
|
"sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
|
|
275
274
|
),
|
|
@@ -868,11 +868,8 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
868
868
|
if isinstance(c, BinaryExpression):
|
|
869
869
|
right_left_join = add_left_rows_filter(c)
|
|
870
870
|
|
|
871
|
-
# Use CTE instead of subquery to force SQLite to materialize the result
|
|
872
|
-
# This breaks deep nesting and prevents parser stack overflow.
|
|
873
871
|
union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
|
|
874
|
-
|
|
875
|
-
return self._regenerate_system_columns(union_cte)
|
|
872
|
+
return sqlalchemy.select(*union_cte.c).select_from(union_cte)
|
|
876
873
|
|
|
877
874
|
def _system_row_number_expr(self):
|
|
878
875
|
return func.row_number().over()
|
|
@@ -884,11 +881,7 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
884
881
|
"""
|
|
885
882
|
Create a temporary table from a query for use in a UDF.
|
|
886
883
|
"""
|
|
887
|
-
columns = [
|
|
888
|
-
sqlalchemy.Column(c.name, c.type)
|
|
889
|
-
for c in query.selected_columns
|
|
890
|
-
if c.name != "sys__id"
|
|
891
|
-
]
|
|
884
|
+
columns = [sqlalchemy.Column(c.name, c.type) for c in query.selected_columns]
|
|
892
885
|
table = self.create_udf_table(columns)
|
|
893
886
|
|
|
894
887
|
with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
|
|
@@ -5,7 +5,7 @@ import random
|
|
|
5
5
|
import string
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Union
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Union, cast
|
|
9
9
|
from urllib.parse import urlparse
|
|
10
10
|
|
|
11
11
|
import attrs
|
|
@@ -23,7 +23,7 @@ from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
|
|
|
23
23
|
from datachain.query.batch import RowsOutput
|
|
24
24
|
from datachain.query.schema import ColumnMeta
|
|
25
25
|
from datachain.sql.functions import path as pathfunc
|
|
26
|
-
from datachain.sql.types import
|
|
26
|
+
from datachain.sql.types import SQLType
|
|
27
27
|
from datachain.utils import sql_escape_like
|
|
28
28
|
|
|
29
29
|
if TYPE_CHECKING:
|
|
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
|
|
|
32
32
|
_FromClauseArgument,
|
|
33
33
|
_OnClauseArgument,
|
|
34
34
|
)
|
|
35
|
+
from sqlalchemy.sql.selectable import FromClause
|
|
35
36
|
from sqlalchemy.types import TypeEngine
|
|
36
37
|
|
|
37
38
|
from datachain.data_storage import schema
|
|
@@ -248,45 +249,56 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
248
249
|
|
|
249
250
|
def _regenerate_system_columns(
|
|
250
251
|
self,
|
|
251
|
-
selectable: sa.Select
|
|
252
|
+
selectable: sa.Select,
|
|
252
253
|
keep_existing_columns: bool = False,
|
|
254
|
+
regenerate_columns: Iterable[str] | None = None,
|
|
253
255
|
) -> sa.Select:
|
|
254
256
|
"""
|
|
255
|
-
Return a SELECT that regenerates
|
|
257
|
+
Return a SELECT that regenerates system columns deterministically.
|
|
256
258
|
|
|
257
|
-
If keep_existing_columns is True, existing
|
|
258
|
-
|
|
259
|
-
"""
|
|
260
|
-
base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
|
|
261
|
-
|
|
262
|
-
result_columns: dict[str, sa.ColumnElement] = {}
|
|
263
|
-
for col in base.c:
|
|
264
|
-
if col.name in result_columns:
|
|
265
|
-
raise ValueError(f"Duplicate column name {col.name} in SELECT")
|
|
266
|
-
if col.name in ("sys__id", "sys__rand"):
|
|
267
|
-
if keep_existing_columns:
|
|
268
|
-
result_columns[col.name] = col
|
|
269
|
-
else:
|
|
270
|
-
result_columns[col.name] = col
|
|
259
|
+
If keep_existing_columns is True, existing system columns will be kept as-is
|
|
260
|
+
even when they are listed in ``regenerate_columns``.
|
|
271
261
|
|
|
272
|
-
|
|
262
|
+
Args:
|
|
263
|
+
selectable: Base SELECT
|
|
264
|
+
keep_existing_columns: When True, reuse existing system columns even if
|
|
265
|
+
they are part of the regeneration set.
|
|
266
|
+
regenerate_columns: Names of system columns to regenerate. Defaults to
|
|
267
|
+
{"sys__id", "sys__rand"}. Columns not listed are left untouched.
|
|
268
|
+
"""
|
|
269
|
+
system_columns = {
|
|
273
270
|
sys_col.name: sys_col.type
|
|
274
271
|
for sys_col in self.schema.dataset_row_cls.sys_columns()
|
|
275
272
|
}
|
|
273
|
+
regenerate = set(regenerate_columns or system_columns)
|
|
274
|
+
generators = {
|
|
275
|
+
"sys__id": self._system_row_number_expr,
|
|
276
|
+
"sys__rand": self._system_random_expr,
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
base = cast("FromClause", selectable.subquery())
|
|
280
|
+
|
|
281
|
+
def build(name: str) -> sa.ColumnElement:
|
|
282
|
+
expr = generators[name]()
|
|
283
|
+
return sa.cast(expr, system_columns[name]).label(name)
|
|
284
|
+
|
|
285
|
+
columns: list[sa.ColumnElement] = []
|
|
286
|
+
present: set[str] = set()
|
|
287
|
+
changed = False
|
|
288
|
+
|
|
289
|
+
for col in base.c:
|
|
290
|
+
present.add(col.name)
|
|
291
|
+
regen = col.name in regenerate and not keep_existing_columns
|
|
292
|
+
columns.append(build(col.name) if regen else col)
|
|
293
|
+
changed |= regen
|
|
294
|
+
|
|
295
|
+
for name in regenerate - present:
|
|
296
|
+
columns.append(build(name))
|
|
297
|
+
changed = True
|
|
298
|
+
|
|
299
|
+
if not changed:
|
|
300
|
+
return selectable
|
|
276
301
|
|
|
277
|
-
# Add missing system columns if needed
|
|
278
|
-
if "sys__id" not in result_columns:
|
|
279
|
-
expr = self._system_row_number_expr()
|
|
280
|
-
expr = sa.cast(expr, system_types["sys__id"])
|
|
281
|
-
result_columns["sys__id"] = expr.label("sys__id")
|
|
282
|
-
if "sys__rand" not in result_columns:
|
|
283
|
-
expr = self._system_random_expr()
|
|
284
|
-
expr = sa.cast(expr, system_types["sys__rand"])
|
|
285
|
-
result_columns["sys__rand"] = expr.label("sys__rand")
|
|
286
|
-
|
|
287
|
-
# Wrap in subquery to materialize window functions, then wrap again in SELECT
|
|
288
|
-
# This ensures window functions are computed before INSERT...FROM SELECT
|
|
289
|
-
columns = list(result_columns.values())
|
|
290
302
|
inner = sa.select(*columns).select_from(base).subquery()
|
|
291
303
|
return sa.select(*inner.c).select_from(inner)
|
|
292
304
|
|
|
@@ -950,10 +962,15 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
950
962
|
SQLite TEMPORARY tables cannot be directly used as they are process-specific,
|
|
951
963
|
and UDFs are run in other processes when run in parallel.
|
|
952
964
|
"""
|
|
965
|
+
columns = [
|
|
966
|
+
c
|
|
967
|
+
for c in columns
|
|
968
|
+
if c.name not in [col.name for col in self.dataset_row_cls.sys_columns()]
|
|
969
|
+
]
|
|
953
970
|
tbl = sa.Table(
|
|
954
971
|
name or self.udf_table_name(),
|
|
955
972
|
sa.MetaData(),
|
|
956
|
-
|
|
973
|
+
*self.dataset_row_cls.sys_columns(),
|
|
957
974
|
*columns,
|
|
958
975
|
)
|
|
959
976
|
self.db.create_table(tbl, if_not_exists=True)
|