datachain 0.24.2__tar.gz → 0.24.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/PKG-INFO +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/catalog.py +19 -2
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/metastore.py +3 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/sqlite.py +9 -6
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/dataset.py +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/datachain.py +26 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/datasets.py +1 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/listing.py +10 -3
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/namespace.py +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/project.py +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/dataset.py +5 -1
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/PKG-INFO +1 -1
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datachain.py +16 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_delta.py +7 -1
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_dataset_remote.py +49 -4
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain.py +81 -0
- {datachain-0.24.2 → datachain-0.24.4}/.cruft.json +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.gitattributes +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/codecov.yaml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/dependabot.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/release.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/tests.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.gitignore +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/.pre-commit-config.yaml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/LICENSE +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/README.rst +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/assets/datachain.svg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/login.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/logout.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/team.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/auth/token.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/index.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/cancel.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/clusters.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/logs.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/ls.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/commands/job/run.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/contributing.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/examples.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/db_migrations.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/delta.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/env.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/index.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/namespaces.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/processing.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/remotes.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/guide/retry.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/index.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/overrides/main.html +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/quick-start.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/file.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/index.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/pose.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/segment.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/datachain.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/func.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/index.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/toolkit.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/torch.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/references/udf.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/docs/tutorials.md +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/wds.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/mkdocs.yml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/noxfile.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/pyproject.toml +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/setup.cfg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/__main__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/asyn.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cache.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/datasets.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/cli/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/azure.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/gcs.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/hf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/local.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/client/s3.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/config.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/delta.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/error.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/reference.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/fs/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/array.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/base.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/conditional.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/func.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/numeric.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/path.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/random.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/string.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/func/window.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/job.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/clip.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/listings.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/records.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/storage.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/file.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/hf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/image.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/listing.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/namespaces.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/projects.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/settings.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/tar.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/text.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/udf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/video.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/bbox.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/pose.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/segment.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/model/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/node.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/progress.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/py.typed +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/batch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/metrics.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/params.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/queue.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/schema.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/session.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/udf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/query/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/remote/studio.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/script_meta.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/semver.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/types.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/sql/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/studio.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/telemetry.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain/utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/conftest.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/data.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/examples/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/examples/test_examples.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/examples/wds_data.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/data/lena.jpg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_array.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_path.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_random.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/functions/test_string.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_batching.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_catalog.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_client.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_data_storage.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_datasets.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_file.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_hf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_image.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_listing.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_ls.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_metastore.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_metrics.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_pull.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_pytorch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_query.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_database.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_read_dataset_version_specifiers.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_retry.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_session.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_toolkit.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_video.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/func/test_warehouse.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_atomicity.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_cli_e2e.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_cli_studio.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_import_time.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_query_e2e.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/test_telemetry.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_namespace.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_project.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_asyn.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_cache.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_catalog.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_config.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_dataset.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_func.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_listing.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_metastore.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_query_params.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_semver.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_serializer.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_session.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_utils.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.24.2 → datachain-0.24.4}/tests/utils.py +0 -0
|
@@ -98,7 +98,7 @@ jobs:
|
|
|
98
98
|
- name: Run tests
|
|
99
99
|
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
100
100
|
run: >
|
|
101
|
-
|
|
101
|
+
DATACHAIN_METASTORE_ARG_USERNAME=john
|
|
102
102
|
PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
|
|
103
103
|
pytest
|
|
104
104
|
--config-file=pyproject.toml -rs
|
|
@@ -1098,9 +1098,18 @@ class Catalog:
|
|
|
1098
1098
|
) -> DatasetRecord:
|
|
1099
1099
|
from datachain.lib.listing import is_listing_dataset
|
|
1100
1100
|
|
|
1101
|
+
project = project or self.metastore.default_project
|
|
1102
|
+
|
|
1101
1103
|
if is_listing_dataset(name):
|
|
1102
1104
|
project = self.metastore.listing_project
|
|
1103
|
-
|
|
1105
|
+
|
|
1106
|
+
try:
|
|
1107
|
+
return self.metastore.get_dataset(name, project.id if project else None)
|
|
1108
|
+
except DatasetNotFoundError:
|
|
1109
|
+
raise DatasetNotFoundError(
|
|
1110
|
+
f"Dataset {name} not found in namespace {project.namespace.name}"
|
|
1111
|
+
f" and project {project.name}"
|
|
1112
|
+
) from None
|
|
1104
1113
|
|
|
1105
1114
|
def get_dataset_with_remote_fallback(
|
|
1106
1115
|
self,
|
|
@@ -1111,6 +1120,14 @@ class Catalog:
|
|
|
1111
1120
|
pull_dataset: bool = False,
|
|
1112
1121
|
update: bool = False,
|
|
1113
1122
|
) -> DatasetRecord:
|
|
1123
|
+
# Intentionally ignore update flag is version is provided. Here only exact
|
|
1124
|
+
# version can be provided and update then doesn't make sense.
|
|
1125
|
+
# It corresponds to a query like this for example:
|
|
1126
|
+
#
|
|
1127
|
+
# dc.read_dataset("some.remote.dataset", version="1.0.0", update=True)
|
|
1128
|
+
if version:
|
|
1129
|
+
update = False
|
|
1130
|
+
|
|
1114
1131
|
if self.metastore.is_local_dataset(namespace_name) or not update:
|
|
1115
1132
|
try:
|
|
1116
1133
|
project = self.metastore.get_project(project_name, namespace_name)
|
|
@@ -1124,7 +1141,7 @@ class Catalog:
|
|
|
1124
1141
|
raise DatasetNotFoundError(
|
|
1125
1142
|
f"Dataset {name}"
|
|
1126
1143
|
+ (f" version {version} " if version else " ")
|
|
1127
|
-
+ "not found"
|
|
1144
|
+
+ f"not found in namespace {namespace_name} and project {project_name}"
|
|
1128
1145
|
)
|
|
1129
1146
|
|
|
1130
1147
|
if pull_dataset:
|
|
@@ -1194,14 +1194,16 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1194
1194
|
Gets a single dataset in project by dataset name.
|
|
1195
1195
|
"""
|
|
1196
1196
|
project_id = project_id or self.default_project.id
|
|
1197
|
+
|
|
1197
1198
|
d = self._datasets
|
|
1198
1199
|
query = self._base_dataset_query()
|
|
1199
1200
|
query = query.where(d.c.name == name, d.c.project_id == project_id) # type: ignore [attr-defined]
|
|
1200
1201
|
ds = self._parse_dataset(self.db.execute(query, conn=conn))
|
|
1201
1202
|
if not ds:
|
|
1202
1203
|
raise DatasetNotFoundError(
|
|
1203
|
-
f"Dataset {name} not found in project {project_id}"
|
|
1204
|
+
f"Dataset {name} not found in project with id {project_id}"
|
|
1204
1205
|
)
|
|
1206
|
+
|
|
1205
1207
|
return ds
|
|
1206
1208
|
|
|
1207
1209
|
def remove_dataset_version(
|
|
@@ -774,7 +774,15 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
774
774
|
query: Select,
|
|
775
775
|
progress_cb: Optional[Callable[[int], None]] = None,
|
|
776
776
|
) -> None:
|
|
777
|
-
|
|
777
|
+
col_id = (
|
|
778
|
+
query.selected_columns.sys__id
|
|
779
|
+
if "sys__id" in query.selected_columns
|
|
780
|
+
else None
|
|
781
|
+
)
|
|
782
|
+
|
|
783
|
+
# If there is no sys__id column, we cannot copy the table in batches,
|
|
784
|
+
# and we need to copy all rows at once. Same if there is a group by clause.
|
|
785
|
+
if col_id is None or len(query._group_by_clause) > 0:
|
|
778
786
|
select_q = query.with_only_columns(
|
|
779
787
|
*[c for c in query.selected_columns if c.name != "sys__id"]
|
|
780
788
|
)
|
|
@@ -782,12 +790,7 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
782
790
|
self.db.execute(q)
|
|
783
791
|
return
|
|
784
792
|
|
|
785
|
-
if "sys__id" in query.selected_columns:
|
|
786
|
-
col_id = query.selected_columns.sys__id
|
|
787
|
-
else:
|
|
788
|
-
col_id = sqlalchemy.column("sys__id")
|
|
789
793
|
select_ids = query.with_only_columns(col_id)
|
|
790
|
-
|
|
791
794
|
ids = self.db.execute(select_ids).fetchall()
|
|
792
795
|
|
|
793
796
|
select_q = (
|
|
@@ -21,6 +21,7 @@ from typing import (
|
|
|
21
21
|
import orjson
|
|
22
22
|
import sqlalchemy
|
|
23
23
|
from pydantic import BaseModel
|
|
24
|
+
from sqlalchemy.sql.elements import ColumnElement
|
|
24
25
|
from tqdm import tqdm
|
|
25
26
|
|
|
26
27
|
from datachain import semver
|
|
@@ -806,11 +807,35 @@ class DataChain:
|
|
|
806
807
|
chain.save("new_dataset")
|
|
807
808
|
```
|
|
808
809
|
"""
|
|
810
|
+
# Convert string partition_by parameters to Column objects
|
|
811
|
+
processed_partition_by = partition_by
|
|
812
|
+
if partition_by is not None:
|
|
813
|
+
if isinstance(partition_by, (str, Function, ColumnElement)):
|
|
814
|
+
list_partition_by = [partition_by]
|
|
815
|
+
else:
|
|
816
|
+
list_partition_by = list(partition_by)
|
|
817
|
+
|
|
818
|
+
processed_partition_columns: list[ColumnElement] = []
|
|
819
|
+
for col in list_partition_by:
|
|
820
|
+
if isinstance(col, str):
|
|
821
|
+
col_db_name = ColumnMeta.to_db_name(col)
|
|
822
|
+
col_type = self.signals_schema.get_column_type(col_db_name)
|
|
823
|
+
column = Column(col_db_name, python_to_sql(col_type))
|
|
824
|
+
processed_partition_columns.append(column)
|
|
825
|
+
elif isinstance(col, Function):
|
|
826
|
+
column = col.get_column(self.signals_schema)
|
|
827
|
+
processed_partition_columns.append(column)
|
|
828
|
+
else:
|
|
829
|
+
# Assume it's already a ColumnElement
|
|
830
|
+
processed_partition_columns.append(col)
|
|
831
|
+
|
|
832
|
+
processed_partition_by = processed_partition_columns
|
|
833
|
+
|
|
809
834
|
udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
|
|
810
835
|
return self._evolve(
|
|
811
836
|
query=self._query.generate(
|
|
812
837
|
udf_obj.to_udf_wrapper(),
|
|
813
|
-
partition_by=
|
|
838
|
+
partition_by=processed_partition_by,
|
|
814
839
|
**self._settings.to_dict(),
|
|
815
840
|
),
|
|
816
841
|
signal_schema=udf_obj.output,
|
|
@@ -65,10 +65,17 @@ class Listing:
|
|
|
65
65
|
|
|
66
66
|
@cached_property
|
|
67
67
|
def dataset(self) -> "DatasetRecord":
|
|
68
|
+
from datachain.error import DatasetNotFoundError
|
|
69
|
+
|
|
68
70
|
assert self.dataset_name
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
71
|
+
project = self.metastore.listing_project
|
|
72
|
+
try:
|
|
73
|
+
return self.metastore.get_dataset(self.dataset_name, project.id)
|
|
74
|
+
except DatasetNotFoundError:
|
|
75
|
+
raise DatasetNotFoundError(
|
|
76
|
+
f"Dataset {self.dataset_name} not found in namespace"
|
|
77
|
+
f" {project.namespace.name} and project {project.name}"
|
|
78
|
+
) from None
|
|
72
79
|
|
|
73
80
|
@cached_property
|
|
74
81
|
def dataset_rows(self):
|
|
@@ -82,7 +82,10 @@ if TYPE_CHECKING:
|
|
|
82
82
|
INSERT_BATCH_SIZE = 10000
|
|
83
83
|
|
|
84
84
|
PartitionByType = Union[
|
|
85
|
-
|
|
85
|
+
str,
|
|
86
|
+
Function,
|
|
87
|
+
ColumnElement,
|
|
88
|
+
Sequence[Union[str, Function, ColumnElement]],
|
|
86
89
|
]
|
|
87
90
|
JoinPredicateType = Union[str, ColumnClause, ColumnElement]
|
|
88
91
|
DatasetDependencyType = tuple["DatasetRecord", str]
|
|
@@ -1142,6 +1145,7 @@ class DatasetQuery:
|
|
|
1142
1145
|
project_name=project_name,
|
|
1143
1146
|
version=version,
|
|
1144
1147
|
pull_dataset=True,
|
|
1148
|
+
update=update,
|
|
1145
1149
|
)
|
|
1146
1150
|
)
|
|
1147
1151
|
|
|
@@ -236,6 +236,22 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type):
|
|
|
236
236
|
assert dependencies[0].name == dep_name
|
|
237
237
|
|
|
238
238
|
|
|
239
|
+
def test_persist_after_mutate(test_session):
|
|
240
|
+
chain = (
|
|
241
|
+
dc.read_values(fib=[1, 1, 2, 3, 5, 8, 13, 21], session=test_session)
|
|
242
|
+
.map(mod3=lambda fib: fib % 3, output=int)
|
|
243
|
+
.group_by(
|
|
244
|
+
cnt=dc.func.count(),
|
|
245
|
+
partition_by="mod3",
|
|
246
|
+
)
|
|
247
|
+
.mutate(x=1)
|
|
248
|
+
.persist()
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
assert chain.count() == 3
|
|
252
|
+
assert set(chain.to_values("mod3")) == {0, 1, 2}
|
|
253
|
+
|
|
254
|
+
|
|
239
255
|
def test_persist_not_affects_dependencies(tmp_dir, test_session):
|
|
240
256
|
for i in range(4):
|
|
241
257
|
(tmp_dir / f"file{i}.txt").write_text(f"file{i}")
|
|
@@ -248,6 +248,9 @@ def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys):
|
|
|
248
248
|
|
|
249
249
|
|
|
250
250
|
def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
|
|
251
|
+
catalog = test_session.catalog
|
|
252
|
+
default_namespace_name = catalog.metastore.default_namespace_name
|
|
253
|
+
default_project_name = catalog.metastore.default_project_name
|
|
251
254
|
ds_name = "delta_ds"
|
|
252
255
|
path = tmp_dir.as_uri()
|
|
253
256
|
tmp_dir = tmp_dir / "images"
|
|
@@ -296,7 +299,10 @@ def test_delta_update_no_diff(test_session, tmp_dir, tmp_path):
|
|
|
296
299
|
with pytest.raises(DatasetNotFoundError) as exc_info:
|
|
297
300
|
dc.read_dataset(ds_name, version="1.0.1")
|
|
298
301
|
|
|
299
|
-
assert str(exc_info.value) ==
|
|
302
|
+
assert str(exc_info.value) == (
|
|
303
|
+
f"Dataset {ds_name} version 1.0.1 not found in namespace "
|
|
304
|
+
f"{default_namespace_name} and project {default_project_name}"
|
|
305
|
+
)
|
|
300
306
|
|
|
301
307
|
|
|
302
308
|
@pytest.fixture
|
|
@@ -362,8 +362,16 @@ def test_read_dataset_remote_update_flag(
|
|
|
362
362
|
assert dc.datasets().to_values("version") == ["1.0.0"]
|
|
363
363
|
assert ds1.to_values("version")[0] == "1.0.0"
|
|
364
364
|
|
|
365
|
+
# Read without update and version returns a cached version
|
|
366
|
+
ds1 = dc.read_dataset(
|
|
367
|
+
f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
|
|
368
|
+
session=test_session,
|
|
369
|
+
)
|
|
370
|
+
assert dc.datasets().to_values("version") == ["1.0.0"]
|
|
371
|
+
assert ds1.to_values("version")[0] == "1.0.0"
|
|
372
|
+
|
|
365
373
|
# Second read with update=True with the exact version
|
|
366
|
-
# returns the same
|
|
374
|
+
# returns the same dataset version
|
|
367
375
|
ds2 = dc.read_dataset(
|
|
368
376
|
f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
|
|
369
377
|
version="1.0.0",
|
|
@@ -385,9 +393,7 @@ def test_read_dataset_remote_update_flag(
|
|
|
385
393
|
assert dc.datasets().to_values("version") == ["1.0.0"]
|
|
386
394
|
assert ds3.to_values("version")[0] == "1.0.0"
|
|
387
395
|
|
|
388
|
-
# Finally, read with update=
|
|
389
|
-
# that allows for newer version still bring the same version
|
|
390
|
-
# as the one already downloaded
|
|
396
|
+
# Finally, read with update=True brings the latest version
|
|
391
397
|
ds4 = dc.read_dataset(
|
|
392
398
|
f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
|
|
393
399
|
version=">=1.0.0",
|
|
@@ -399,6 +405,45 @@ def test_read_dataset_remote_update_flag(
|
|
|
399
405
|
assert dc.datasets().to_values("version") == ["1.0.0", "2.0.0"]
|
|
400
406
|
|
|
401
407
|
|
|
408
|
+
@skip_if_not_sqlite
|
|
409
|
+
def test_read_dataset_remote_update_flag_no_version(
|
|
410
|
+
studio_token,
|
|
411
|
+
test_session,
|
|
412
|
+
remote_dataset_multi_version,
|
|
413
|
+
mock_dataset_info_endpoint,
|
|
414
|
+
mock_export_endpoint_with_urls,
|
|
415
|
+
mock_export_status_completed,
|
|
416
|
+
mock_s3_parquet_download,
|
|
417
|
+
mock_dataset_rows_fetcher_status_check,
|
|
418
|
+
requests_mock,
|
|
419
|
+
):
|
|
420
|
+
"""Test read_dataset with update=True flag to force remote check."""
|
|
421
|
+
|
|
422
|
+
# Mock the Studio API responses
|
|
423
|
+
mock_dataset_info_endpoint(remote_dataset_multi_version)
|
|
424
|
+
mock_s3_parquet_download()
|
|
425
|
+
|
|
426
|
+
# First read - downloads version 1.0.0
|
|
427
|
+
ds1 = dc.read_dataset(
|
|
428
|
+
f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
|
|
429
|
+
version="1.0.0",
|
|
430
|
+
session=test_session,
|
|
431
|
+
)
|
|
432
|
+
assert dc.datasets().to_values("version") == ["1.0.0"]
|
|
433
|
+
assert ds1.to_values("version")[0] == "1.0.0"
|
|
434
|
+
|
|
435
|
+
# Read with update=True w/o version specifier also
|
|
436
|
+
# checks the most recent remote version and brings it
|
|
437
|
+
ds4 = dc.read_dataset(
|
|
438
|
+
f"{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs",
|
|
439
|
+
update=True,
|
|
440
|
+
session=test_session,
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
assert ds4.to_values("version")[0] == "2.0.0"
|
|
444
|
+
assert dc.datasets().to_values("version") == ["1.0.0", "2.0.0"]
|
|
445
|
+
|
|
446
|
+
|
|
402
447
|
@skip_if_not_sqlite
|
|
403
448
|
def test_read_dataset_remote_version_specifiers(
|
|
404
449
|
studio_token,
|
|
@@ -16,6 +16,7 @@ from pydantic import BaseModel
|
|
|
16
16
|
|
|
17
17
|
import datachain as dc
|
|
18
18
|
from datachain import Column
|
|
19
|
+
from datachain.data_storage import AbstractMetastore
|
|
19
20
|
from datachain.error import (
|
|
20
21
|
DatasetInvalidVersionError,
|
|
21
22
|
DatasetNotFoundError,
|
|
@@ -3428,6 +3429,29 @@ def test_save_to_non_default_namespace_and_project(
|
|
|
3428
3429
|
dc.read_dataset(name="fibonacci")
|
|
3429
3430
|
|
|
3430
3431
|
|
|
3432
|
+
def test_dataset_not_found_in_default_project(test_session):
|
|
3433
|
+
metastore = test_session.catalog.metastore
|
|
3434
|
+
with pytest.raises(DatasetNotFoundError) as excinfo:
|
|
3435
|
+
with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
|
|
3436
|
+
dc.read_dataset("fibonacci")
|
|
3437
|
+
assert str(excinfo.value) == (
|
|
3438
|
+
f"Dataset fibonacci not found in namespace {metastore.default_namespace_name}"
|
|
3439
|
+
f" and project {metastore.default_project_name}"
|
|
3440
|
+
)
|
|
3441
|
+
|
|
3442
|
+
|
|
3443
|
+
@pytest.mark.parametrize("project_created", (True, False))
|
|
3444
|
+
def test_dataset_not_found_in_non_default_project(test_session, project_created):
|
|
3445
|
+
if project_created:
|
|
3446
|
+
dc.create_project("dev", "numbers")
|
|
3447
|
+
with pytest.raises(DatasetNotFoundError) as excinfo:
|
|
3448
|
+
with patch.object(AbstractMetastore, "is_local_dataset", return_value=True):
|
|
3449
|
+
dc.read_dataset("dev.numbers.fibonacci")
|
|
3450
|
+
assert str(excinfo.value) == (
|
|
3451
|
+
"Dataset fibonacci not found in namespace dev and project numbers"
|
|
3452
|
+
)
|
|
3453
|
+
|
|
3454
|
+
|
|
3431
3455
|
@pytest.mark.parametrize("use_settings", (True, False))
|
|
3432
3456
|
@pytest.mark.parametrize("project_created_upfront", (True, False))
|
|
3433
3457
|
def test_save_specify_only_non_default_project(
|
|
@@ -3571,3 +3595,60 @@ def test_save_create_project_not_allowed(test_session, allow_create_project):
|
|
|
3571
3595
|
dc.read_values(fib=[1, 1, 2, 3, 5, 8], session=test_session).save(
|
|
3572
3596
|
"dev.numbers.fibonacci"
|
|
3573
3597
|
)
|
|
3598
|
+
|
|
3599
|
+
|
|
3600
|
+
def test_agg_partition_by_string_notation(test_session):
|
|
3601
|
+
"""Test that agg method supports string notation for partition_by."""
|
|
3602
|
+
|
|
3603
|
+
class _ImageGroup(BaseModel):
|
|
3604
|
+
name: str
|
|
3605
|
+
size: int
|
|
3606
|
+
|
|
3607
|
+
def func(key, val) -> Iterator[tuple[File, _ImageGroup]]:
|
|
3608
|
+
n = "-".join(key)
|
|
3609
|
+
v = sum(val)
|
|
3610
|
+
yield File(path=n), _ImageGroup(name=n, size=v)
|
|
3611
|
+
|
|
3612
|
+
keys = ["n1", "n2", "n1"]
|
|
3613
|
+
values = [1, 5, 9]
|
|
3614
|
+
|
|
3615
|
+
# Test using string notation (NEW functionality)
|
|
3616
|
+
ds = dc.read_values(key=keys, val=values, session=test_session).agg(
|
|
3617
|
+
x=func,
|
|
3618
|
+
partition_by="key", # String notation instead of C("key")
|
|
3619
|
+
)
|
|
3620
|
+
|
|
3621
|
+
assert ds.order_by("x_1.name").to_values("x_1.name") == ["n1-n1", "n2"]
|
|
3622
|
+
assert ds.order_by("x_1.size").to_values("x_1.size") == [5, 10]
|
|
3623
|
+
|
|
3624
|
+
|
|
3625
|
+
def test_agg_partition_by_string_sequence(test_session):
|
|
3626
|
+
"""Test that agg method supports sequence of strings for partition_by."""
|
|
3627
|
+
|
|
3628
|
+
class _ImageGroup(BaseModel):
|
|
3629
|
+
name: str
|
|
3630
|
+
size: int
|
|
3631
|
+
|
|
3632
|
+
def func(key1, key2, val) -> Iterator[tuple[File, _ImageGroup]]:
|
|
3633
|
+
n = f"{key1[0]}-{key2[0]}"
|
|
3634
|
+
v = sum(val)
|
|
3635
|
+
yield File(path=n), _ImageGroup(name=n, size=v)
|
|
3636
|
+
|
|
3637
|
+
key1_values = ["a", "a", "b"]
|
|
3638
|
+
key2_values = ["x", "y", "x"]
|
|
3639
|
+
values = [1, 5, 9]
|
|
3640
|
+
|
|
3641
|
+
# Test using sequence of strings (NEW functionality)
|
|
3642
|
+
ds = dc.read_values(
|
|
3643
|
+
key1=key1_values, key2=key2_values, val=values, session=test_session
|
|
3644
|
+
).agg(
|
|
3645
|
+
x=func,
|
|
3646
|
+
partition_by=["key1", "key2"], # Sequence of strings
|
|
3647
|
+
)
|
|
3648
|
+
|
|
3649
|
+
result_names = ds.order_by("x_1.name").to_values("x_1.name")
|
|
3650
|
+
result_sizes = ds.order_by("x_1.size").to_values("x_1.size")
|
|
3651
|
+
|
|
3652
|
+
# Should have 3 partitions: (a,x), (a,y), (b,x)
|
|
3653
|
+
assert len(result_names) == 3
|
|
3654
|
+
assert len(result_sizes) == 3
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|