datachain 0.19.2__tar.gz → 0.20.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.19.2 → datachain-0.20.0}/.github/workflows/tests-studio.yml +1 -0
- {datachain-0.19.2 → datachain-0.20.0}/PKG-INFO +2 -2
- datachain-0.20.0/docs/guide/env.md +18 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/guide/index.md +1 -0
- {datachain-0.19.2 → datachain-0.20.0}/mkdocs.yml +1 -0
- {datachain-0.19.2 → datachain-0.20.0}/pyproject.toml +1 -1
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/__init__.py +3 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/catalog/catalog.py +180 -65
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/__init__.py +0 -7
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/datasets.py +43 -28
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/parser/__init__.py +1 -35
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/metastore.py +390 -37
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/sqlite.py +139 -7
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/warehouse.py +26 -7
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/dataset.py +125 -12
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/delta.py +9 -5
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/error.py +36 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dataset_info.py +4 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/datachain.py +86 -7
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/datasets.py +62 -12
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/listings.py +3 -3
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/records.py +1 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/storage.py +14 -2
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/listing.py +3 -1
- datachain-0.20.0/src/datachain/lib/namespaces.py +73 -0
- datachain-0.20.0/src/datachain/lib/projects.py +86 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/settings.py +10 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/listing.py +3 -1
- datachain-0.20.0/src/datachain/namespace.py +65 -0
- datachain-0.20.0/src/datachain/project.py +78 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/dataset.py +71 -46
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/session.py +1 -1
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/remote/studio.py +61 -26
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/studio.py +23 -6
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/PKG-INFO +2 -2
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/SOURCES.txt +7 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.19.2 → datachain-0.20.0}/tests/conftest.py +81 -4
- {datachain-0.19.2 → datachain-0.20.0}/tests/examples/test_examples.py +2 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_batching.py +5 -5
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_dataset_query.py +20 -2
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_datasets.py +113 -81
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_metastore.py +30 -10
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_pull.py +69 -18
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_atomicity.py +4 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_cli_e2e.py +43 -10
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_cli_studio.py +40 -29
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_datachain.py +55 -3
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_listing_info.py +9 -2
- datachain-0.20.0/tests/unit/lib/test_namespace.py +87 -0
- datachain-0.20.0/tests/unit/lib/test_project.py +184 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_dataset.py +25 -1
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_listing.py +20 -4
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_metastore.py +35 -3
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_session.py +32 -9
- {datachain-0.19.2 → datachain-0.20.0}/tests/utils.py +1 -1
- {datachain-0.19.2 → datachain-0.20.0}/.cruft.json +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.gitattributes +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/codecov.yaml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/dependabot.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/workflows/release.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/workflows/tests.yml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.gitignore +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/.pre-commit-config.yaml +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/LICENSE +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/README.rst +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/auth/login.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/auth/logout.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/auth/team.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/auth/token.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/index.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/job/cancel.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/job/clusters.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/job/logs.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/job/ls.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/commands/job/run.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/contributing.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/examples.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/guide/delta.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/guide/processing.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/guide/remotes.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/guide/retry.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/index.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/overrides/main.html +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/quick-start.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/arrowrow.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/bbox.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/file.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/imagefile.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/index.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/pose.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/segment.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/tarvfile.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/textfile.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/data-types/videofile.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/datachain.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/func.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/index.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/toolkit.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/torch.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/references/udf.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/docs/tutorials.md +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/incremental_processing/delta.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/incremental_processing/retry.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/incremental_processing/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/noxfile.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/setup.cfg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/__main__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/asyn.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cache.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/du.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/index.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/ls.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/misc.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/query.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/commands/show.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/parser/job.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/parser/studio.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/parser/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/cli/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/local.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/config.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/diff/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/fs/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/fs/reference.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/fs/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/array.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/base.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/func.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/path.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/random.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/string.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/func/window.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/job.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/csv.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/database.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/hf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/json.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/pandas.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/parquet.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/dc/values.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/file.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/video.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/model/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/node.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/progress.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/py.typed +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/params.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/udf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/query/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/script_meta.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/semver.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain/utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/data.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/examples/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/data/Big_Buck_Bunny_360_10s_1MB.mp4 +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/data/lena.jpg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/fake-service-account-credentials.json +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_aggregate.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_array.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_conditional.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_numeric.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_path.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_random.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/functions/test_string.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/data/running-mask0.png +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/data/running-mask1.png +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/data/running.jpg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/data/ships.jpg +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/model/test_yolo.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_client.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_cloud_transfer.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_data_storage.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_datachain.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_datachain_merge.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_delta.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_file.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_hf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_hidden_field.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_image.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_listing.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_ls.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_query.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_read_database.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_retry.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_session.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_toolkit.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_video.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/func/test_warehouse.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_import_time.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/test_telemetry.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_python_to_sql.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_udf.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/model/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/model/test_bbox.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/model/test_pose.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/model/test_segment.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/model/test_utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_client.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_client_gcs.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_config.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_func.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_pytorch.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_query.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_script_meta.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_semver.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.19.2 → datachain-0.20.0}/tests/unit/test_warehouse.py +0 -0
|
@@ -98,6 +98,7 @@ jobs:
|
|
|
98
98
|
- name: Run tests
|
|
99
99
|
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
|
|
100
100
|
run: >
|
|
101
|
+
DATACHAIN_METASTORE_ARG_PROJECT=john
|
|
101
102
|
PYTHONPATH="$(pwd)/..:${PYTHONPATH}"
|
|
102
103
|
pytest
|
|
103
104
|
--config-file=pyproject.toml -rs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.19.2
|
|
3
|
+
Version: 0.20.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -94,7 +94,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
94
94
|
Requires-Dist: ultralytics; extra == "tests"
|
|
95
95
|
Provides-Extra: dev
|
|
96
96
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
97
|
-
Requires-Dist: mypy==1.16.0; extra == "dev"
|
|
97
|
+
Requires-Dist: mypy==1.16.1; extra == "dev"
|
|
98
98
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
99
99
|
Requires-Dist: types-pytz; extra == "dev"
|
|
100
100
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Environment Variables
|
|
2
|
+
|
|
3
|
+
List of environment variables used to configure DataChain behavior.
|
|
4
|
+
|
|
5
|
+
### Core Configuration
|
|
6
|
+
|
|
7
|
+
- `DATACHAIN_ROOT_DIR` – Specifies the root directory where DataChain will create the `.datachain` folder to store its internal data. (default: the current working directory).
|
|
8
|
+
- `DATACHAIN_SYSTEM_CONFIG_DIR` – Overrides the system-wide configuration directory (default depends on the platform).
|
|
9
|
+
- `DATACHAIN_GLOBAL_CONFIG_DIR` – Overrides the user's global configuration directory (default depends on the platform).
|
|
10
|
+
- `DATACHAIN_NO_ANALYTICS` – Disables telemetry.
|
|
11
|
+
|
|
12
|
+
### Studio Integration
|
|
13
|
+
|
|
14
|
+
- `DATACHAIN_STUDIO_URL` – Custom Studio URL.
|
|
15
|
+
- `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio.
|
|
16
|
+
- `DATACHAIN_STUDIO_TEAM` – Studio team name.
|
|
17
|
+
|
|
18
|
+
Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
|
|
@@ -10,3 +10,4 @@ Welcome to the DataChain User Guide! This section provides comprehensive documen
|
|
|
10
10
|
- [Data Processing Overview](./processing.md) - Discover DataChain's specialized data processing features.
|
|
11
11
|
- [Delta Processing](./delta.md) - Incremental data processing to efficiently handle large datasets that change over time.
|
|
12
12
|
- [Error Handling and Retries](./retry.md) - Learn how to handle processing errors and selectively reprocess problematic records.
|
|
13
|
+
- [Environment Variables](./env.md) - Configure DataChain's behavior using environment variables.
|
|
@@ -105,6 +105,7 @@ nav:
|
|
|
105
105
|
- Overview: guide/processing.md
|
|
106
106
|
- Delta Processing: guide/delta.md
|
|
107
107
|
- Errors Handling and Retries: guide/retry.md
|
|
108
|
+
- Environment Variables: guide/env.md
|
|
108
109
|
- 🤝 Contributing: contributing.md
|
|
109
110
|
|
|
110
111
|
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from datachain.lib import namespaces, projects
|
|
1
2
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
2
3
|
from datachain.lib.dc import (
|
|
3
4
|
C,
|
|
@@ -67,7 +68,9 @@ __all__ = [
|
|
|
67
68
|
"is_chain_type",
|
|
68
69
|
"listings",
|
|
69
70
|
"metrics",
|
|
71
|
+
"namespaces",
|
|
70
72
|
"param",
|
|
73
|
+
"projects",
|
|
71
74
|
"read_csv",
|
|
72
75
|
"read_database",
|
|
73
76
|
"read_dataset",
|
|
@@ -41,6 +41,7 @@ from datachain.dataset import (
|
|
|
41
41
|
DatasetStatus,
|
|
42
42
|
StorageURI,
|
|
43
43
|
create_dataset_uri,
|
|
44
|
+
parse_dataset_name,
|
|
44
45
|
parse_dataset_uri,
|
|
45
46
|
)
|
|
46
47
|
from datachain.error import (
|
|
@@ -48,12 +49,14 @@ from datachain.error import (
|
|
|
48
49
|
DatasetInvalidVersionError,
|
|
49
50
|
DatasetNotFoundError,
|
|
50
51
|
DatasetVersionNotFoundError,
|
|
52
|
+
ProjectNotFoundError,
|
|
51
53
|
QueryScriptCancelError,
|
|
52
54
|
QueryScriptRunError,
|
|
53
55
|
)
|
|
54
56
|
from datachain.lib.listing import get_listing
|
|
55
57
|
from datachain.node import DirType, Node, NodeWithPath
|
|
56
58
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
59
|
+
from datachain.project import Project
|
|
57
60
|
from datachain.sql.types import DateTime, SQLType
|
|
58
61
|
from datachain.utils import DataChainDir
|
|
59
62
|
|
|
@@ -155,9 +158,9 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
155
158
|
self,
|
|
156
159
|
metastore: "AbstractMetastore",
|
|
157
160
|
warehouse: "AbstractWarehouse",
|
|
158
|
-
|
|
161
|
+
remote_ds: DatasetRecord,
|
|
159
162
|
remote_ds_version: str,
|
|
160
|
-
|
|
163
|
+
local_ds: DatasetRecord,
|
|
161
164
|
local_ds_version: str,
|
|
162
165
|
schema: dict[str, Union[SQLType, type[SQLType]]],
|
|
163
166
|
max_threads: int = PULL_DATASET_MAX_THREADS,
|
|
@@ -169,9 +172,9 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
169
172
|
self._check_dependencies()
|
|
170
173
|
self.metastore = metastore
|
|
171
174
|
self.warehouse = warehouse
|
|
172
|
-
self.
|
|
175
|
+
self.remote_ds = remote_ds
|
|
173
176
|
self.remote_ds_version = remote_ds_version
|
|
174
|
-
self.
|
|
177
|
+
self.local_ds = local_ds
|
|
175
178
|
self.local_ds_version = local_ds_version
|
|
176
179
|
self.schema = schema
|
|
177
180
|
self.last_status_check: Optional[float] = None
|
|
@@ -207,7 +210,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
207
210
|
Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
|
|
208
211
|
"""
|
|
209
212
|
export_status_response = self.studio_client.dataset_export_status(
|
|
210
|
-
self.
|
|
213
|
+
self.remote_ds, self.remote_ds_version
|
|
211
214
|
)
|
|
212
215
|
if not export_status_response.ok:
|
|
213
216
|
raise DataChainError(export_status_response.message)
|
|
@@ -254,9 +257,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
254
257
|
import pandas as pd
|
|
255
258
|
|
|
256
259
|
# metastore and warehouse are not thread safe
|
|
257
|
-
with self.
|
|
258
|
-
local_ds = metastore.get_dataset(self.local_ds_name)
|
|
259
|
-
|
|
260
|
+
with self.warehouse.clone() as warehouse:
|
|
260
261
|
urls = list(urls)
|
|
261
262
|
|
|
262
263
|
for url in urls:
|
|
@@ -269,7 +270,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
269
270
|
df = self.fix_columns(df)
|
|
270
271
|
|
|
271
272
|
inserted = warehouse.insert_dataset_rows(
|
|
272
|
-
df, local_ds, self.local_ds_version
|
|
273
|
+
df, self.local_ds, self.local_ds_version
|
|
273
274
|
)
|
|
274
275
|
self.increase_counter(inserted) # type: ignore [arg-type]
|
|
275
276
|
# sometimes progress bar doesn't get updated so manually updating it
|
|
@@ -675,7 +676,11 @@ class Catalog:
|
|
|
675
676
|
listing: Optional[Listing]
|
|
676
677
|
if src.startswith("ds://"):
|
|
677
678
|
ds_name, ds_version = parse_dataset_uri(src)
|
|
678
|
-
|
|
679
|
+
ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
|
|
680
|
+
assert ds_namespace
|
|
681
|
+
assert ds_project
|
|
682
|
+
project = self.metastore.get_project(ds_project, ds_namespace)
|
|
683
|
+
dataset = self.get_dataset(ds_name, project)
|
|
679
684
|
if not ds_version:
|
|
680
685
|
ds_version = dataset.latest_version
|
|
681
686
|
dataset_sources = self.warehouse.get_dataset_sources(
|
|
@@ -695,7 +700,11 @@ class Catalog:
|
|
|
695
700
|
dataset_name=dataset_name,
|
|
696
701
|
)
|
|
697
702
|
rows = DatasetQuery(
|
|
698
|
-
name=dataset.name,
|
|
703
|
+
name=dataset.name,
|
|
704
|
+
namespace_name=dataset.project.namespace.name,
|
|
705
|
+
project_name=dataset.project.name,
|
|
706
|
+
version=ds_version,
|
|
707
|
+
catalog=self,
|
|
699
708
|
).to_db_records()
|
|
700
709
|
indexed_sources.append(
|
|
701
710
|
(
|
|
@@ -769,6 +778,7 @@ class Catalog:
|
|
|
769
778
|
def create_dataset(
|
|
770
779
|
self,
|
|
771
780
|
name: str,
|
|
781
|
+
project: Optional[Project] = None,
|
|
772
782
|
version: Optional[str] = None,
|
|
773
783
|
*,
|
|
774
784
|
columns: Sequence[Column],
|
|
@@ -788,6 +798,7 @@ class Catalog:
|
|
|
788
798
|
If version is None, then next unused version is created.
|
|
789
799
|
If version is given, then it must be an unused version.
|
|
790
800
|
"""
|
|
801
|
+
DatasetRecord.validate_name(name)
|
|
791
802
|
assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
|
|
792
803
|
if not listing and Client.is_data_source_uri(name):
|
|
793
804
|
raise RuntimeError(
|
|
@@ -795,7 +806,7 @@ class Catalog:
|
|
|
795
806
|
)
|
|
796
807
|
default_version = DEFAULT_DATASET_VERSION
|
|
797
808
|
try:
|
|
798
|
-
dataset = self.get_dataset(name)
|
|
809
|
+
dataset = self.get_dataset(name, project)
|
|
799
810
|
default_version = dataset.next_version_patch
|
|
800
811
|
if update_version == "major":
|
|
801
812
|
default_version = dataset.next_version_major
|
|
@@ -820,6 +831,7 @@ class Catalog:
|
|
|
820
831
|
}
|
|
821
832
|
dataset = self.metastore.create_dataset(
|
|
822
833
|
name,
|
|
834
|
+
project.id if project else None,
|
|
823
835
|
feature_schema=feature_schema,
|
|
824
836
|
query_script=query_script,
|
|
825
837
|
schema=schema,
|
|
@@ -892,7 +904,7 @@ class Catalog:
|
|
|
892
904
|
)
|
|
893
905
|
|
|
894
906
|
if create_rows_table:
|
|
895
|
-
table_name = self.warehouse.dataset_table_name(dataset
|
|
907
|
+
table_name = self.warehouse.dataset_table_name(dataset, version)
|
|
896
908
|
self.warehouse.create_dataset_rows_table(table_name, columns=columns)
|
|
897
909
|
self.update_dataset_version_with_warehouse_info(dataset, version)
|
|
898
910
|
|
|
@@ -923,7 +935,13 @@ class Catalog:
|
|
|
923
935
|
|
|
924
936
|
if not dataset_version.preview:
|
|
925
937
|
values["preview"] = (
|
|
926
|
-
DatasetQuery(
|
|
938
|
+
DatasetQuery(
|
|
939
|
+
name=dataset.name,
|
|
940
|
+
namespace_name=dataset.project.namespace.name,
|
|
941
|
+
project_name=dataset.project.name,
|
|
942
|
+
version=version,
|
|
943
|
+
catalog=self,
|
|
944
|
+
)
|
|
927
945
|
.limit(20)
|
|
928
946
|
.to_db_records()
|
|
929
947
|
)
|
|
@@ -949,6 +967,7 @@ class Catalog:
|
|
|
949
967
|
# updating name must result in updating dataset table names as well
|
|
950
968
|
for version in [v.version for v in dataset.versions]:
|
|
951
969
|
self.warehouse.rename_dataset_table(
|
|
970
|
+
dataset,
|
|
952
971
|
old_name,
|
|
953
972
|
new_name,
|
|
954
973
|
old_version=version,
|
|
@@ -986,6 +1005,7 @@ class Catalog:
|
|
|
986
1005
|
self,
|
|
987
1006
|
name: str,
|
|
988
1007
|
sources: list[str],
|
|
1008
|
+
project: Optional[Project] = None,
|
|
989
1009
|
client_config=None,
|
|
990
1010
|
recursive=False,
|
|
991
1011
|
) -> DatasetRecord:
|
|
@@ -994,6 +1014,8 @@ class Catalog:
|
|
|
994
1014
|
|
|
995
1015
|
from datachain import read_dataset, read_storage
|
|
996
1016
|
|
|
1017
|
+
project = project or self.metastore.default_project
|
|
1018
|
+
|
|
997
1019
|
chains = []
|
|
998
1020
|
for source in sources:
|
|
999
1021
|
if source.startswith(DATASET_PREFIX):
|
|
@@ -1006,10 +1028,11 @@ class Catalog:
|
|
|
1006
1028
|
# create union of all dataset queries created from sources
|
|
1007
1029
|
dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
|
|
1008
1030
|
try:
|
|
1031
|
+
dc = dc.settings(project=project.name, namespace=project.namespace.name)
|
|
1009
1032
|
dc.save(name)
|
|
1010
1033
|
except Exception as e: # noqa: BLE001
|
|
1011
1034
|
try:
|
|
1012
|
-
ds = self.get_dataset(name)
|
|
1035
|
+
ds = self.get_dataset(name, project)
|
|
1013
1036
|
self.metastore.update_dataset_status(
|
|
1014
1037
|
ds,
|
|
1015
1038
|
DatasetStatus.FAILED,
|
|
@@ -1026,7 +1049,7 @@ class Catalog:
|
|
|
1026
1049
|
except DatasetNotFoundError:
|
|
1027
1050
|
raise e from None
|
|
1028
1051
|
|
|
1029
|
-
ds = self.get_dataset(name)
|
|
1052
|
+
ds = self.get_dataset(name, project)
|
|
1030
1053
|
|
|
1031
1054
|
self.update_dataset_version_with_warehouse_info(
|
|
1032
1055
|
ds,
|
|
@@ -1034,49 +1057,67 @@ class Catalog:
|
|
|
1034
1057
|
sources="\n".join(sources),
|
|
1035
1058
|
)
|
|
1036
1059
|
|
|
1037
|
-
return self.get_dataset(name)
|
|
1060
|
+
return self.get_dataset(name, project)
|
|
1038
1061
|
|
|
1039
|
-
def get_dataset(
|
|
1040
|
-
|
|
1062
|
+
def get_dataset(
|
|
1063
|
+
self, name: str, project: Optional[Project] = None
|
|
1064
|
+
) -> DatasetRecord:
|
|
1065
|
+
from datachain.lib.listing import is_listing_dataset
|
|
1066
|
+
|
|
1067
|
+
if is_listing_dataset(name):
|
|
1068
|
+
project = self.metastore.listing_project
|
|
1069
|
+
return self.metastore.get_dataset(name, project.id if project else None)
|
|
1041
1070
|
|
|
1042
1071
|
def get_dataset_with_remote_fallback(
|
|
1043
|
-
self,
|
|
1072
|
+
self,
|
|
1073
|
+
name: str,
|
|
1074
|
+
namespace_name: str,
|
|
1075
|
+
project_name: str,
|
|
1076
|
+
version: Optional[str] = None,
|
|
1044
1077
|
) -> DatasetRecord:
|
|
1045
1078
|
try:
|
|
1046
|
-
|
|
1079
|
+
project = self.metastore.get_project(project_name, namespace_name)
|
|
1080
|
+
ds = self.get_dataset(name, project)
|
|
1047
1081
|
if version and not ds.has_version(version):
|
|
1048
1082
|
raise DatasetVersionNotFoundError(
|
|
1049
1083
|
f"Dataset {name} does not have version {version}"
|
|
1050
1084
|
)
|
|
1051
1085
|
return ds
|
|
1052
1086
|
|
|
1053
|
-
except (
|
|
1087
|
+
except (
|
|
1088
|
+
ProjectNotFoundError,
|
|
1089
|
+
DatasetNotFoundError,
|
|
1090
|
+
DatasetVersionNotFoundError,
|
|
1091
|
+
):
|
|
1054
1092
|
print("Dataset not found in local catalog, trying to get from studio")
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
remote_ds_uri += f"@v{version}"
|
|
1093
|
+
remote_ds_uri = create_dataset_uri(
|
|
1094
|
+
name, namespace_name, project_name, version
|
|
1095
|
+
)
|
|
1059
1096
|
|
|
1060
1097
|
self.pull_dataset(
|
|
1061
1098
|
remote_ds_uri=remote_ds_uri,
|
|
1062
1099
|
local_ds_name=name,
|
|
1063
1100
|
local_ds_version=version,
|
|
1064
1101
|
)
|
|
1065
|
-
return self.get_dataset(
|
|
1102
|
+
return self.get_dataset(
|
|
1103
|
+
name, self.metastore.get_project(project_name, namespace_name)
|
|
1104
|
+
)
|
|
1066
1105
|
|
|
1067
1106
|
def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
|
|
1068
1107
|
"""Returns dataset that contains version with specific uuid"""
|
|
1069
1108
|
for dataset in self.ls_datasets():
|
|
1070
1109
|
if dataset.has_version_with_uuid(uuid):
|
|
1071
|
-
return self.get_dataset(dataset.name)
|
|
1110
|
+
return self.get_dataset(dataset.name, dataset.project)
|
|
1072
1111
|
raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
|
|
1073
1112
|
|
|
1074
|
-
def get_remote_dataset(
|
|
1113
|
+
def get_remote_dataset(
|
|
1114
|
+
self, namespace: str, project: str, name: str
|
|
1115
|
+
) -> DatasetRecord:
|
|
1075
1116
|
from datachain.remote.studio import StudioClient
|
|
1076
1117
|
|
|
1077
1118
|
studio_client = StudioClient()
|
|
1078
1119
|
|
|
1079
|
-
info_response = studio_client.dataset_info(name)
|
|
1120
|
+
info_response = studio_client.dataset_info(namespace, project, name)
|
|
1080
1121
|
if not info_response.ok:
|
|
1081
1122
|
raise DataChainError(info_response.message)
|
|
1082
1123
|
|
|
@@ -1085,9 +1126,9 @@ class Catalog:
|
|
|
1085
1126
|
return DatasetRecord.from_dict(dataset_info)
|
|
1086
1127
|
|
|
1087
1128
|
def get_dataset_dependencies(
|
|
1088
|
-
self, name: str, version: str, indirect=False
|
|
1129
|
+
self, name: str, version: str, project: Optional[Project] = None, indirect=False
|
|
1089
1130
|
) -> list[Optional[DatasetDependency]]:
|
|
1090
|
-
dataset = self.get_dataset(name)
|
|
1131
|
+
dataset = self.get_dataset(name, project)
|
|
1091
1132
|
|
|
1092
1133
|
direct_dependencies = self.metastore.get_direct_dataset_dependencies(
|
|
1093
1134
|
dataset, version
|
|
@@ -1101,9 +1142,10 @@ class Catalog:
|
|
|
1101
1142
|
# dependency has been removed
|
|
1102
1143
|
continue
|
|
1103
1144
|
if d.is_dataset:
|
|
1145
|
+
project = self.metastore.get_project(d.project, d.namespace)
|
|
1104
1146
|
# only datasets can have dependencies
|
|
1105
1147
|
d.dependencies = self.get_dataset_dependencies(
|
|
1106
|
-
d.name, d.version, indirect=indirect
|
|
1148
|
+
d.name, d.version, project, indirect=indirect
|
|
1107
1149
|
)
|
|
1108
1150
|
|
|
1109
1151
|
return direct_dependencies
|
|
@@ -1113,9 +1155,12 @@ class Catalog:
|
|
|
1113
1155
|
prefix: Optional[str] = None,
|
|
1114
1156
|
include_listing: bool = False,
|
|
1115
1157
|
studio: bool = False,
|
|
1158
|
+
project: Optional[Project] = None,
|
|
1116
1159
|
) -> Iterator[DatasetListRecord]:
|
|
1117
1160
|
from datachain.remote.studio import StudioClient
|
|
1118
1161
|
|
|
1162
|
+
project_id = project.id if project else None
|
|
1163
|
+
|
|
1119
1164
|
if studio:
|
|
1120
1165
|
client = StudioClient()
|
|
1121
1166
|
response = client.ls_datasets(prefix=prefix)
|
|
@@ -1130,9 +1175,11 @@ class Catalog:
|
|
|
1130
1175
|
if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
|
|
1131
1176
|
)
|
|
1132
1177
|
elif prefix:
|
|
1133
|
-
datasets = self.metastore.list_datasets_by_prefix(
|
|
1178
|
+
datasets = self.metastore.list_datasets_by_prefix(
|
|
1179
|
+
prefix, project_id=project_id
|
|
1180
|
+
)
|
|
1134
1181
|
else:
|
|
1135
|
-
datasets = self.metastore.list_datasets()
|
|
1182
|
+
datasets = self.metastore.list_datasets(project_id=project_id)
|
|
1136
1183
|
|
|
1137
1184
|
for d in datasets:
|
|
1138
1185
|
if not d.is_bucket_listing or include_listing:
|
|
@@ -1144,11 +1191,15 @@ class Catalog:
|
|
|
1144
1191
|
include_listing: bool = False,
|
|
1145
1192
|
with_job: bool = True,
|
|
1146
1193
|
studio: bool = False,
|
|
1194
|
+
project: Optional[Project] = None,
|
|
1147
1195
|
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
|
|
1148
1196
|
"""Iterate over all dataset versions with related jobs."""
|
|
1149
1197
|
datasets = list(
|
|
1150
1198
|
self.ls_datasets(
|
|
1151
|
-
prefix=prefix,
|
|
1199
|
+
prefix=prefix,
|
|
1200
|
+
include_listing=include_listing,
|
|
1201
|
+
studio=studio,
|
|
1202
|
+
project=project,
|
|
1152
1203
|
)
|
|
1153
1204
|
)
|
|
1154
1205
|
|
|
@@ -1184,6 +1235,7 @@ class Catalog:
|
|
|
1184
1235
|
prefix=prefix,
|
|
1185
1236
|
include_listing=True,
|
|
1186
1237
|
with_job=False,
|
|
1238
|
+
project=self.metastore.listing_project,
|
|
1187
1239
|
)
|
|
1188
1240
|
|
|
1189
1241
|
return [
|
|
@@ -1193,13 +1245,21 @@ class Catalog:
|
|
|
1193
1245
|
]
|
|
1194
1246
|
|
|
1195
1247
|
def ls_dataset_rows(
|
|
1196
|
-
self,
|
|
1248
|
+
self,
|
|
1249
|
+
dataset: DatasetRecord,
|
|
1250
|
+
version: str,
|
|
1251
|
+
offset=None,
|
|
1252
|
+
limit=None,
|
|
1197
1253
|
) -> list[dict]:
|
|
1198
1254
|
from datachain.query.dataset import DatasetQuery
|
|
1199
1255
|
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1256
|
+
q = DatasetQuery(
|
|
1257
|
+
name=dataset.name,
|
|
1258
|
+
namespace_name=dataset.project.namespace.name,
|
|
1259
|
+
project_name=dataset.project.name,
|
|
1260
|
+
version=version,
|
|
1261
|
+
catalog=self,
|
|
1262
|
+
)
|
|
1203
1263
|
if limit:
|
|
1204
1264
|
q = q.limit(limit)
|
|
1205
1265
|
if offset:
|
|
@@ -1232,35 +1292,29 @@ class Catalog:
|
|
|
1232
1292
|
bucket_uri: str,
|
|
1233
1293
|
name: str,
|
|
1234
1294
|
version: str,
|
|
1295
|
+
project: Optional[Project] = None,
|
|
1235
1296
|
client_config=None,
|
|
1236
1297
|
) -> list[str]:
|
|
1237
|
-
dataset = self.get_dataset(name)
|
|
1298
|
+
dataset = self.get_dataset(name, project)
|
|
1238
1299
|
|
|
1239
1300
|
return self.warehouse.export_dataset_table(
|
|
1240
1301
|
bucket_uri, dataset, version, client_config
|
|
1241
1302
|
)
|
|
1242
1303
|
|
|
1243
|
-
def dataset_table_export_file_names(
|
|
1244
|
-
|
|
1304
|
+
def dataset_table_export_file_names(
|
|
1305
|
+
self, name: str, version: str, project: Optional[Project] = None
|
|
1306
|
+
) -> list[str]:
|
|
1307
|
+
dataset = self.get_dataset(name, project)
|
|
1245
1308
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1246
1309
|
|
|
1247
1310
|
def remove_dataset(
|
|
1248
1311
|
self,
|
|
1249
1312
|
name: str,
|
|
1313
|
+
project: Optional[Project] = None,
|
|
1250
1314
|
version: Optional[str] = None,
|
|
1251
1315
|
force: Optional[bool] = False,
|
|
1252
|
-
studio: Optional[bool] = False,
|
|
1253
1316
|
):
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
if studio:
|
|
1257
|
-
client = StudioClient()
|
|
1258
|
-
response = client.rm_dataset(name, version=version, force=force)
|
|
1259
|
-
if not response.ok:
|
|
1260
|
-
raise DataChainError(response.message)
|
|
1261
|
-
return
|
|
1262
|
-
|
|
1263
|
-
dataset = self.get_dataset(name)
|
|
1317
|
+
dataset = self.get_dataset(name, project)
|
|
1264
1318
|
if not version and not force:
|
|
1265
1319
|
raise ValueError(f"Missing dataset version from input for dataset {name}")
|
|
1266
1320
|
if version and not dataset.has_version(version):
|
|
@@ -1282,19 +1336,21 @@ class Catalog:
|
|
|
1282
1336
|
def edit_dataset(
|
|
1283
1337
|
self,
|
|
1284
1338
|
name: str,
|
|
1339
|
+
project: Optional[Project] = None,
|
|
1285
1340
|
new_name: Optional[str] = None,
|
|
1286
1341
|
description: Optional[str] = None,
|
|
1287
1342
|
attrs: Optional[list[str]] = None,
|
|
1288
1343
|
) -> DatasetRecord:
|
|
1289
1344
|
update_data = {}
|
|
1290
1345
|
if new_name:
|
|
1346
|
+
DatasetRecord.validate_name(new_name)
|
|
1291
1347
|
update_data["name"] = new_name
|
|
1292
1348
|
if description is not None:
|
|
1293
1349
|
update_data["description"] = description
|
|
1294
1350
|
if attrs is not None:
|
|
1295
1351
|
update_data["attrs"] = attrs # type: ignore[assignment]
|
|
1296
1352
|
|
|
1297
|
-
dataset = self.get_dataset(name)
|
|
1353
|
+
dataset = self.get_dataset(name, project)
|
|
1298
1354
|
return self.update_dataset(dataset, **update_data)
|
|
1299
1355
|
|
|
1300
1356
|
def ls(
|
|
@@ -1351,7 +1407,29 @@ class Catalog:
|
|
|
1351
1407
|
except Exception as e:
|
|
1352
1408
|
raise DataChainError("Error when parsing dataset uri") from e
|
|
1353
1409
|
|
|
1354
|
-
|
|
1410
|
+
remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
|
|
1411
|
+
remote_ds_name
|
|
1412
|
+
)
|
|
1413
|
+
if not remote_namespace or not remote_project:
|
|
1414
|
+
raise DataChainError(
|
|
1415
|
+
f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
|
|
1416
|
+
f" or project missing"
|
|
1417
|
+
)
|
|
1418
|
+
|
|
1419
|
+
if local_ds_name:
|
|
1420
|
+
local_namespace, local_project, local_ds_name = parse_dataset_name(
|
|
1421
|
+
local_ds_name
|
|
1422
|
+
)
|
|
1423
|
+
if local_namespace and local_namespace != remote_namespace:
|
|
1424
|
+
raise DataChainError(
|
|
1425
|
+
"Local namespace must be the same to remote namespace"
|
|
1426
|
+
)
|
|
1427
|
+
if local_project and local_project != remote_project:
|
|
1428
|
+
raise DataChainError("Local project must be the same to remote project")
|
|
1429
|
+
|
|
1430
|
+
remote_ds = self.get_remote_dataset(
|
|
1431
|
+
remote_namespace, remote_project, remote_ds_name
|
|
1432
|
+
)
|
|
1355
1433
|
|
|
1356
1434
|
try:
|
|
1357
1435
|
# if version is not specified in uri, take the latest one
|
|
@@ -1359,7 +1437,12 @@ class Catalog:
|
|
|
1359
1437
|
version = remote_ds.latest_version
|
|
1360
1438
|
print(f"Version not specified, pulling the latest one (v{version})")
|
|
1361
1439
|
# updating dataset uri with latest version
|
|
1362
|
-
remote_ds_uri = create_dataset_uri(
|
|
1440
|
+
remote_ds_uri = create_dataset_uri(
|
|
1441
|
+
remote_ds.name,
|
|
1442
|
+
remote_ds.project.namespace.name,
|
|
1443
|
+
remote_ds.project.name,
|
|
1444
|
+
version,
|
|
1445
|
+
)
|
|
1363
1446
|
remote_ds_version = remote_ds.get_version(version)
|
|
1364
1447
|
except (DatasetVersionNotFoundError, StopIteration) as exc:
|
|
1365
1448
|
raise DataChainError(
|
|
@@ -1368,7 +1451,13 @@ class Catalog:
|
|
|
1368
1451
|
|
|
1369
1452
|
local_ds_name = local_ds_name or remote_ds.name
|
|
1370
1453
|
local_ds_version = local_ds_version or remote_ds_version.version
|
|
1371
|
-
|
|
1454
|
+
|
|
1455
|
+
local_ds_uri = create_dataset_uri(
|
|
1456
|
+
local_ds_name,
|
|
1457
|
+
remote_ds.project.namespace.name,
|
|
1458
|
+
remote_ds.project.name,
|
|
1459
|
+
local_ds_version,
|
|
1460
|
+
)
|
|
1372
1461
|
|
|
1373
1462
|
try:
|
|
1374
1463
|
# try to find existing dataset with the same uuid to avoid pulling again
|
|
@@ -1377,7 +1466,10 @@ class Catalog:
|
|
|
1377
1466
|
remote_ds_version.uuid
|
|
1378
1467
|
)
|
|
1379
1468
|
existing_ds_uri = create_dataset_uri(
|
|
1380
|
-
existing_ds.name,
|
|
1469
|
+
existing_ds.name,
|
|
1470
|
+
existing_ds.project.namespace.name,
|
|
1471
|
+
existing_ds.project.name,
|
|
1472
|
+
existing_ds_version.version,
|
|
1381
1473
|
)
|
|
1382
1474
|
if existing_ds_uri == remote_ds_uri:
|
|
1383
1475
|
print(f"Local copy of dataset {remote_ds_uri} already present")
|
|
@@ -1391,8 +1483,26 @@ class Catalog:
|
|
|
1391
1483
|
except DatasetNotFoundError:
|
|
1392
1484
|
pass
|
|
1393
1485
|
|
|
1486
|
+
# Create namespace and project if doesn't exist
|
|
1487
|
+
print(
|
|
1488
|
+
f"Creating namespace {remote_ds.project.namespace.name} and project"
|
|
1489
|
+
f" {remote_ds.project.name}"
|
|
1490
|
+
)
|
|
1491
|
+
|
|
1492
|
+
namespace = self.metastore.create_namespace(
|
|
1493
|
+
remote_ds.project.namespace.name,
|
|
1494
|
+
description=remote_ds.project.namespace.description,
|
|
1495
|
+
uuid=remote_ds.project.namespace.uuid,
|
|
1496
|
+
)
|
|
1497
|
+
project = self.metastore.create_project(
|
|
1498
|
+
remote_ds.project.name,
|
|
1499
|
+
namespace.name,
|
|
1500
|
+
description=remote_ds.project.description,
|
|
1501
|
+
uuid=remote_ds.project.uuid,
|
|
1502
|
+
)
|
|
1503
|
+
|
|
1394
1504
|
try:
|
|
1395
|
-
local_dataset = self.get_dataset(local_ds_name)
|
|
1505
|
+
local_dataset = self.get_dataset(local_ds_name, project=project)
|
|
1396
1506
|
if local_dataset and local_dataset.has_version(local_ds_version):
|
|
1397
1507
|
raise DataChainError(
|
|
1398
1508
|
f"Local dataset {local_ds_uri} already exists with different uuid,"
|
|
@@ -1414,6 +1524,7 @@ class Catalog:
|
|
|
1414
1524
|
|
|
1415
1525
|
local_ds = self.create_dataset(
|
|
1416
1526
|
local_ds_name,
|
|
1527
|
+
project,
|
|
1417
1528
|
local_ds_version,
|
|
1418
1529
|
query_script=remote_ds_version.query_script,
|
|
1419
1530
|
create_rows=True,
|
|
@@ -1426,7 +1537,7 @@ class Catalog:
|
|
|
1426
1537
|
# asking remote to export dataset rows table to s3 and to return signed
|
|
1427
1538
|
# urls of exported parts, which are in parquet format
|
|
1428
1539
|
export_response = studio_client.export_dataset_table(
|
|
1429
|
-
|
|
1540
|
+
remote_ds, remote_ds_version.version
|
|
1430
1541
|
)
|
|
1431
1542
|
if not export_response.ok:
|
|
1432
1543
|
raise DataChainError(export_response.message)
|
|
@@ -1457,9 +1568,9 @@ class Catalog:
|
|
|
1457
1568
|
rows_fetcher = DatasetRowsFetcher(
|
|
1458
1569
|
metastore,
|
|
1459
1570
|
warehouse,
|
|
1460
|
-
|
|
1571
|
+
remote_ds,
|
|
1461
1572
|
remote_ds_version.version,
|
|
1462
|
-
|
|
1573
|
+
local_ds,
|
|
1463
1574
|
local_ds_version,
|
|
1464
1575
|
schema,
|
|
1465
1576
|
progress_bar=dataset_save_progress_bar,
|
|
@@ -1469,7 +1580,7 @@ class Catalog:
|
|
|
1469
1580
|
iter(batch(signed_urls)), dataset_save_progress_bar
|
|
1470
1581
|
)
|
|
1471
1582
|
except:
|
|
1472
|
-
self.remove_dataset(local_ds_name, local_ds_version)
|
|
1583
|
+
self.remove_dataset(local_ds_name, project, local_ds_version)
|
|
1473
1584
|
raise
|
|
1474
1585
|
|
|
1475
1586
|
local_ds = self.metastore.update_dataset_status(
|
|
@@ -1526,7 +1637,11 @@ class Catalog:
|
|
|
1526
1637
|
)
|
|
1527
1638
|
|
|
1528
1639
|
self.create_dataset_from_sources(
|
|
1529
|
-
output,
|
|
1640
|
+
output,
|
|
1641
|
+
sources,
|
|
1642
|
+
self.metastore.default_project,
|
|
1643
|
+
client_config=client_config,
|
|
1644
|
+
recursive=recursive,
|
|
1530
1645
|
)
|
|
1531
1646
|
|
|
1532
1647
|
def query(
|