datachain 0.7.2__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.2/src/datachain.egg-info → datachain-0.7.3}/PKG-INFO +2 -2
- {datachain-0.7.2 → datachain-0.7.3}/README.rst +1 -1
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/catalog/catalog.py +4 -3
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/cli.py +108 -8
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/metastore.py +63 -11
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/dataset.py +142 -14
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/dataset_info.py +7 -3
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/remote/studio.py +40 -8
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/studio.py +29 -0
- {datachain-0.7.2 → datachain-0.7.3/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_catalog.py +21 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/test_cli_studio.py +119 -0
- {datachain-0.7.2 → datachain-0.7.3}/.cruft.json +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.gitattributes +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/codecov.yaml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/dependabot.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/workflows/release.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/workflows/tests.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.gitignore +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/.pre-commit-config.yaml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/CONTRIBUTING.rst +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/LICENSE +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/index.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/overrides/main.html +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/datachain.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/datatype.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/file.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/index.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/sql.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/torch.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/docs/references/udf.md +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/mkdocs.yml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/noxfile.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/pyproject.toml +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/setup.cfg +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/__main__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/asyn.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/cache.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/local.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/config.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/error.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/array.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/base.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/func.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/path.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/random.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/string.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/func/window.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/job.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/dc.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/listing.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/node.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/progress.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/py.typed +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/dataset.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/params.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/query/session.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain/utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/conftest.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/data.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/examples/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_client.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_datachain.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_listing.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_ls.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_pull.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_query.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/test_atomicity.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/test_telemetry.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_client.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_config.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_func.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_query.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_session.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.2 → datachain-0.7.3}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.2
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -139,7 +139,7 @@ Key Features
|
|
|
139
139
|
============
|
|
140
140
|
|
|
141
141
|
📂 **Multimodal Dataset Versioning.**
|
|
142
|
-
- Version unstructured data without redundant data copies, by
|
|
142
|
+
- Version unstructured data without redundant data copies, by supporting
|
|
143
143
|
references to S3, GCP, Azure, and local file systems.
|
|
144
144
|
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
145
145
|
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
@@ -37,7 +37,7 @@ Key Features
|
|
|
37
37
|
============
|
|
38
38
|
|
|
39
39
|
📂 **Multimodal Dataset Versioning.**
|
|
40
|
-
- Version unstructured data without redundant data copies, by
|
|
40
|
+
- Version unstructured data without redundant data copies, by supporting
|
|
41
41
|
references to S3, GCP, Azure, and local file systems.
|
|
42
42
|
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
43
43
|
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
@@ -38,6 +38,7 @@ from datachain.dataset import (
|
|
|
38
38
|
DATASET_PREFIX,
|
|
39
39
|
QUERY_DATASET_PREFIX,
|
|
40
40
|
DatasetDependency,
|
|
41
|
+
DatasetListRecord,
|
|
41
42
|
DatasetRecord,
|
|
42
43
|
DatasetStats,
|
|
43
44
|
DatasetStatus,
|
|
@@ -72,7 +73,7 @@ if TYPE_CHECKING:
|
|
|
72
73
|
AbstractMetastore,
|
|
73
74
|
AbstractWarehouse,
|
|
74
75
|
)
|
|
75
|
-
from datachain.dataset import
|
|
76
|
+
from datachain.dataset import DatasetListVersion
|
|
76
77
|
from datachain.job import Job
|
|
77
78
|
from datachain.lib.file import File
|
|
78
79
|
from datachain.listing import Listing
|
|
@@ -1135,7 +1136,7 @@ class Catalog:
|
|
|
1135
1136
|
|
|
1136
1137
|
return direct_dependencies
|
|
1137
1138
|
|
|
1138
|
-
def ls_datasets(self, include_listing: bool = False) -> Iterator[
|
|
1139
|
+
def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
|
|
1139
1140
|
datasets = self.metastore.list_datasets()
|
|
1140
1141
|
for d in datasets:
|
|
1141
1142
|
if not d.is_bucket_listing or include_listing:
|
|
@@ -1144,7 +1145,7 @@ class Catalog:
|
|
|
1144
1145
|
def list_datasets_versions(
|
|
1145
1146
|
self,
|
|
1146
1147
|
include_listing: bool = False,
|
|
1147
|
-
) -> Iterator[tuple[
|
|
1148
|
+
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
|
|
1148
1149
|
"""Iterate over all dataset versions with related jobs."""
|
|
1149
1150
|
datasets = list(self.ls_datasets(include_listing=include_listing))
|
|
1150
1151
|
|
|
@@ -18,7 +18,12 @@ from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyVa
|
|
|
18
18
|
from datachain.config import Config
|
|
19
19
|
from datachain.error import DataChainError
|
|
20
20
|
from datachain.lib.dc import DataChain
|
|
21
|
-
from datachain.studio import
|
|
21
|
+
from datachain.studio import (
|
|
22
|
+
edit_studio_dataset,
|
|
23
|
+
list_datasets,
|
|
24
|
+
process_studio_cli_args,
|
|
25
|
+
remove_studio_dataset,
|
|
26
|
+
)
|
|
22
27
|
from datachain.telemetry import telemetry
|
|
23
28
|
|
|
24
29
|
if TYPE_CHECKING:
|
|
@@ -403,21 +408,44 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
403
408
|
parse_edit_dataset.add_argument(
|
|
404
409
|
"--new-name",
|
|
405
410
|
action="store",
|
|
406
|
-
default="",
|
|
407
411
|
help="Dataset new name",
|
|
408
412
|
)
|
|
409
413
|
parse_edit_dataset.add_argument(
|
|
410
414
|
"--description",
|
|
411
415
|
action="store",
|
|
412
|
-
default="",
|
|
413
416
|
help="Dataset description",
|
|
414
417
|
)
|
|
415
418
|
parse_edit_dataset.add_argument(
|
|
416
419
|
"--labels",
|
|
417
|
-
default=[],
|
|
418
420
|
nargs="+",
|
|
419
421
|
help="Dataset labels",
|
|
420
422
|
)
|
|
423
|
+
parse_edit_dataset.add_argument(
|
|
424
|
+
"--studio",
|
|
425
|
+
action="store_true",
|
|
426
|
+
default=False,
|
|
427
|
+
help="Edit dataset from Studio",
|
|
428
|
+
)
|
|
429
|
+
parse_edit_dataset.add_argument(
|
|
430
|
+
"-L",
|
|
431
|
+
"--local",
|
|
432
|
+
action="store_true",
|
|
433
|
+
default=False,
|
|
434
|
+
help="Edit local dataset only",
|
|
435
|
+
)
|
|
436
|
+
parse_edit_dataset.add_argument(
|
|
437
|
+
"-a",
|
|
438
|
+
"--all",
|
|
439
|
+
action="store_true",
|
|
440
|
+
default=True,
|
|
441
|
+
help="Edit both datasets from studio and local",
|
|
442
|
+
)
|
|
443
|
+
parse_edit_dataset.add_argument(
|
|
444
|
+
"--team",
|
|
445
|
+
action="store",
|
|
446
|
+
default=None,
|
|
447
|
+
help="The team to edit a dataset. By default, it will use team from config.",
|
|
448
|
+
)
|
|
421
449
|
|
|
422
450
|
datasets_parser = subp.add_parser(
|
|
423
451
|
"datasets", parents=[parent_parser], description="List datasets"
|
|
@@ -466,6 +494,32 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
466
494
|
action=BooleanOptionalAction,
|
|
467
495
|
help="Force delete registered dataset with all of it's versions",
|
|
468
496
|
)
|
|
497
|
+
rm_dataset_parser.add_argument(
|
|
498
|
+
"--studio",
|
|
499
|
+
action="store_true",
|
|
500
|
+
default=False,
|
|
501
|
+
help="Remove dataset from Studio",
|
|
502
|
+
)
|
|
503
|
+
rm_dataset_parser.add_argument(
|
|
504
|
+
"-L",
|
|
505
|
+
"--local",
|
|
506
|
+
action="store_true",
|
|
507
|
+
default=False,
|
|
508
|
+
help="Remove local datasets only",
|
|
509
|
+
)
|
|
510
|
+
rm_dataset_parser.add_argument(
|
|
511
|
+
"-a",
|
|
512
|
+
"--all",
|
|
513
|
+
action="store_true",
|
|
514
|
+
default=True,
|
|
515
|
+
help="Remove both local and studio",
|
|
516
|
+
)
|
|
517
|
+
rm_dataset_parser.add_argument(
|
|
518
|
+
"--team",
|
|
519
|
+
action="store",
|
|
520
|
+
default=None,
|
|
521
|
+
help="The team to delete a dataset. By default, it will use team from config.",
|
|
522
|
+
)
|
|
469
523
|
|
|
470
524
|
dataset_stats_parser = subp.add_parser(
|
|
471
525
|
"dataset-stats",
|
|
@@ -909,8 +963,40 @@ def rm_dataset(
|
|
|
909
963
|
name: str,
|
|
910
964
|
version: Optional[int] = None,
|
|
911
965
|
force: Optional[bool] = False,
|
|
966
|
+
studio: bool = False,
|
|
967
|
+
local: bool = False,
|
|
968
|
+
all: bool = True,
|
|
969
|
+
team: Optional[str] = None,
|
|
970
|
+
):
|
|
971
|
+
token = Config().read().get("studio", {}).get("token")
|
|
972
|
+
all, local, studio = _determine_flavors(studio, local, all, token)
|
|
973
|
+
|
|
974
|
+
if all or local:
|
|
975
|
+
catalog.remove_dataset(name, version=version, force=force)
|
|
976
|
+
|
|
977
|
+
if (all or studio) and token:
|
|
978
|
+
remove_studio_dataset(team, name, version, force)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def edit_dataset(
|
|
982
|
+
catalog: "Catalog",
|
|
983
|
+
name: str,
|
|
984
|
+
new_name: Optional[str] = None,
|
|
985
|
+
description: Optional[str] = None,
|
|
986
|
+
labels: Optional[list[str]] = None,
|
|
987
|
+
studio: bool = False,
|
|
988
|
+
local: bool = False,
|
|
989
|
+
all: bool = True,
|
|
990
|
+
team: Optional[str] = None,
|
|
912
991
|
):
|
|
913
|
-
|
|
992
|
+
token = Config().read().get("studio", {}).get("token")
|
|
993
|
+
all, local, studio = _determine_flavors(studio, local, all, token)
|
|
994
|
+
|
|
995
|
+
if all or local:
|
|
996
|
+
catalog.edit_dataset(name, new_name, description, labels)
|
|
997
|
+
|
|
998
|
+
if (all or studio) and token:
|
|
999
|
+
edit_studio_dataset(team, name, new_name, description, labels)
|
|
914
1000
|
|
|
915
1001
|
|
|
916
1002
|
def dataset_stats(
|
|
@@ -1127,11 +1213,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1127
1213
|
edatachain_file=args.edatachain_file,
|
|
1128
1214
|
)
|
|
1129
1215
|
elif args.command == "edit-dataset":
|
|
1130
|
-
|
|
1216
|
+
edit_dataset(
|
|
1217
|
+
catalog,
|
|
1131
1218
|
args.name,
|
|
1132
|
-
description=args.description,
|
|
1133
1219
|
new_name=args.new_name,
|
|
1220
|
+
description=args.description,
|
|
1134
1221
|
labels=args.labels,
|
|
1222
|
+
studio=args.studio,
|
|
1223
|
+
local=args.local,
|
|
1224
|
+
all=args.all,
|
|
1225
|
+
team=args.team,
|
|
1135
1226
|
)
|
|
1136
1227
|
elif args.command == "ls":
|
|
1137
1228
|
ls(
|
|
@@ -1164,7 +1255,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1164
1255
|
schema=args.schema,
|
|
1165
1256
|
)
|
|
1166
1257
|
elif args.command == "rm-dataset":
|
|
1167
|
-
rm_dataset(
|
|
1258
|
+
rm_dataset(
|
|
1259
|
+
catalog,
|
|
1260
|
+
args.name,
|
|
1261
|
+
version=args.version,
|
|
1262
|
+
force=args.force,
|
|
1263
|
+
studio=args.studio,
|
|
1264
|
+
local=args.local,
|
|
1265
|
+
all=args.all,
|
|
1266
|
+
team=args.team,
|
|
1267
|
+
)
|
|
1168
1268
|
elif args.command == "dataset-stats":
|
|
1169
1269
|
dataset_stats(
|
|
1170
1270
|
catalog,
|
|
@@ -27,6 +27,8 @@ from datachain.data_storage import JobQueryType, JobStatus
|
|
|
27
27
|
from datachain.data_storage.serializer import Serializable
|
|
28
28
|
from datachain.dataset import (
|
|
29
29
|
DatasetDependency,
|
|
30
|
+
DatasetListRecord,
|
|
31
|
+
DatasetListVersion,
|
|
30
32
|
DatasetRecord,
|
|
31
33
|
DatasetStatus,
|
|
32
34
|
DatasetVersion,
|
|
@@ -59,6 +61,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
59
61
|
|
|
60
62
|
schema: "schema.Schema"
|
|
61
63
|
dataset_class: type[DatasetRecord] = DatasetRecord
|
|
64
|
+
dataset_list_class: type[DatasetListRecord] = DatasetListRecord
|
|
65
|
+
dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
|
|
62
66
|
dependency_class: type[DatasetDependency] = DatasetDependency
|
|
63
67
|
job_class: type[Job] = Job
|
|
64
68
|
|
|
@@ -166,11 +170,11 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
166
170
|
"""
|
|
167
171
|
|
|
168
172
|
@abstractmethod
|
|
169
|
-
def list_datasets(self) -> Iterator[
|
|
173
|
+
def list_datasets(self) -> Iterator[DatasetListRecord]:
|
|
170
174
|
"""Lists all datasets."""
|
|
171
175
|
|
|
172
176
|
@abstractmethod
|
|
173
|
-
def list_datasets_by_prefix(self, prefix: str) -> Iterator["
|
|
177
|
+
def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
|
|
174
178
|
"""Lists all datasets which names start with prefix."""
|
|
175
179
|
|
|
176
180
|
@abstractmethod
|
|
@@ -348,6 +352,14 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
348
352
|
if c.name # type: ignore [attr-defined]
|
|
349
353
|
]
|
|
350
354
|
|
|
355
|
+
@cached_property
def _dataset_list_fields(self) -> list[str]:
    """
    Names of the datasets table columns that are also dataclass fields of
    ``dataset_list_class``, in the order ``_datasets_columns()`` returns them.
    """
    column_names = [
        column.name  # type: ignore [attr-defined]
        for column in self._datasets_columns()
    ]
    wanted = self.dataset_list_class.__dataclass_fields__  # type: ignore [attr-defined]
    return [column_name for column_name in column_names if column_name in wanted]
|
|
362
|
+
|
|
351
363
|
@classmethod
|
|
352
364
|
def _datasets_versions_columns(cls) -> list["SchemaItem"]:
|
|
353
365
|
"""Datasets versions table columns."""
|
|
@@ -390,6 +402,15 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
390
402
|
if c.name # type: ignore [attr-defined]
|
|
391
403
|
]
|
|
392
404
|
|
|
405
|
+
@cached_property
def _dataset_list_version_fields(self) -> list[str]:
    """
    Names of the datasets versions table columns that are also dataclass
    fields of ``dataset_list_version_class``, in the order
    ``_datasets_versions_columns()`` returns them.
    """
    column_names = [
        column.name  # type: ignore [attr-defined]
        for column in self._datasets_versions_columns()
    ]
    wanted = self.dataset_list_version_class.__dataclass_fields__
    return [column_name for column_name in column_names if column_name in wanted]
|
|
413
|
+
|
|
393
414
|
@classmethod
|
|
394
415
|
def _datasets_dependencies_columns(cls) -> list["SchemaItem"]:
|
|
395
416
|
"""Datasets dependencies table columns."""
|
|
@@ -671,7 +692,25 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
671
692
|
if dataset:
|
|
672
693
|
yield dataset
|
|
673
694
|
|
|
674
|
-
def
|
|
695
|
+
def _parse_list_dataset(self, rows) -> Optional[DatasetListRecord]:
    """
    Build a single dataset list record out of ``rows``.

    Every row is unpacked into ``dataset_list_class.parse`` and the parsed
    records are folded left to right with ``merge_versions``. Returns ``None``
    when ``rows`` produces no records.
    """
    parsed = [self.dataset_list_class.parse(*row) for row in rows]
    if not parsed:
        return None

    merged, *remaining = parsed
    for record in remaining:
        merged = merged.merge_versions(record)
    return merged
|
|
700
|
+
|
|
701
|
+
def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
    """
    Yield one dataset list record per distinct dataset id found in ``rows``.

    The first element of every row is used as the dataset id. Rows sharing an
    id are handed together to ``_parse_list_dataset``; falsy results are
    skipped. Records are yielded in the order their id first appears.

    Rows are grouped with a dict rather than ``itertools.groupby``:
    ``groupby`` only merges *adjacent* rows, so a result set that is not
    ordered by dataset id would yield the same dataset several times.
    Note that this consumes ``rows`` completely before the first record
    is yielded.
    """
    rows_by_dataset: dict = {}
    for row in rows:
        rows_by_dataset.setdefault(row[0], []).append(row)

    for dataset_rows in rows_by_dataset.values():
        dataset = self._parse_list_dataset(dataset_rows)
        if dataset:
            yield dataset
|
|
707
|
+
|
|
708
|
+
def _get_dataset_query(
|
|
709
|
+
self,
|
|
710
|
+
dataset_fields: list[str],
|
|
711
|
+
dataset_version_fields: list[str],
|
|
712
|
+
isouter: bool = True,
|
|
713
|
+
):
|
|
675
714
|
if not (
|
|
676
715
|
self.db.has_table(self._datasets.name)
|
|
677
716
|
and self.db.has_table(self._datasets_versions.name)
|
|
@@ -680,23 +719,36 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
680
719
|
|
|
681
720
|
d = self._datasets
|
|
682
721
|
dv = self._datasets_versions
|
|
722
|
+
|
|
683
723
|
query = self._datasets_select(
|
|
684
|
-
*(getattr(d.c, f) for f in
|
|
685
|
-
*(getattr(dv.c, f) for f in
|
|
724
|
+
*(getattr(d.c, f) for f in dataset_fields),
|
|
725
|
+
*(getattr(dv.c, f) for f in dataset_version_fields),
|
|
686
726
|
)
|
|
687
|
-
j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=
|
|
727
|
+
j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
|
|
688
728
|
return query.select_from(j)
|
|
689
729
|
|
|
690
|
-
def
|
|
730
|
+
def _base_dataset_query(self):
    """
    Query over the columns named by ``_dataset_fields`` and
    ``_dataset_version_fields``, built by ``_get_dataset_query`` with its
    default ``isouter`` setting.
    """
    fields = self._dataset_fields
    version_fields = self._dataset_version_fields
    return self._get_dataset_query(fields, version_fields)
|
|
734
|
+
|
|
735
|
+
def _base_list_datasets_query(self):
    """
    Query over the columns named by ``_dataset_list_fields`` and
    ``_dataset_list_version_fields``, built by ``_get_dataset_query`` with
    ``isouter=False``.
    """
    fields = self._dataset_list_fields
    version_fields = self._dataset_list_version_fields
    return self._get_dataset_query(fields, version_fields, isouter=False)
|
|
739
|
+
|
|
740
|
+
def list_datasets(self) -> Iterator["DatasetListRecord"]:
|
|
691
741
|
"""Lists all datasets."""
|
|
692
|
-
yield from self.
|
|
742
|
+
yield from self._parse_dataset_list(
|
|
743
|
+
self.db.execute(self._base_list_datasets_query())
|
|
744
|
+
)
|
|
693
745
|
|
|
694
746
|
def list_datasets_by_prefix(
|
|
695
747
|
self, prefix: str, conn=None
|
|
696
|
-
) -> Iterator["
|
|
697
|
-
query = self.
|
|
748
|
+
) -> Iterator["DatasetListRecord"]:
|
|
749
|
+
query = self._base_list_datasets_query()
|
|
698
750
|
query = query.where(self._datasets.c.name.startswith(prefix))
|
|
699
|
-
yield from self.
|
|
751
|
+
yield from self._parse_dataset_list(self.db.execute(query))
|
|
700
752
|
|
|
701
753
|
def get_dataset(self, name: str, conn=None) -> DatasetRecord:
|
|
702
754
|
"""Gets a single dataset by name"""
|
|
@@ -15,7 +15,9 @@ from datachain.error import DatasetVersionNotFoundError
|
|
|
15
15
|
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
|
|
16
16
|
|
|
17
17
|
T = TypeVar("T", bound="DatasetRecord")
|
|
18
|
+
LT = TypeVar("LT", bound="DatasetListRecord")
|
|
18
19
|
V = TypeVar("V", bound="DatasetVersion")
|
|
20
|
+
LV = TypeVar("LV", bound="DatasetListVersion")
|
|
19
21
|
DD = TypeVar("DD", bound="DatasetDependency")
|
|
20
22
|
|
|
21
23
|
DATASET_PREFIX = "ds://"
|
|
@@ -264,6 +266,59 @@ class DatasetVersion:
|
|
|
264
266
|
return cls(**kwargs)
|
|
265
267
|
|
|
266
268
|
|
|
269
|
+
@dataclass
class DatasetListVersion:
    """
    Version entry carried by the dataset list records.

    Equality is the dataclass-generated field comparison; hashing is
    overridden to depend only on ``dataset_id`` and ``version``.
    """

    id: int
    uuid: str
    dataset_id: int
    version: int
    status: int
    created_at: datetime
    finished_at: Optional[datetime]
    error_message: str
    error_stack: str
    num_objects: Optional[int]
    size: Optional[int]
    # the two trailing fields are optional when constructing a version
    query_script: str = ""
    job_id: Optional[str] = None

    @classmethod
    def parse(
        cls: type[LV],
        id: int,
        uuid: str,
        dataset_id: int,
        version: int,
        status: int,
        created_at: datetime,
        finished_at: Optional[datetime],
        error_message: str,
        error_stack: str,
        num_objects: Optional[int],
        size: Optional[int],
        query_script: str = "",
        job_id: Optional[str] = None,
    ):
        """Create an instance of ``cls`` from the given values, passed through unchanged."""
        return cls(
            id=id,
            uuid=uuid,
            dataset_id=dataset_id,
            version=version,
            status=status,
            created_at=created_at,
            finished_at=finished_at,
            error_message=error_message,
            error_stack=error_stack,
            num_objects=num_objects,
            size=size,
            query_script=query_script,
            job_id=job_id,
        )

    def __hash__(self):
        """Hash of the ``"<dataset_id>_<version>"`` string."""
        identity = f"{self.dataset_id}_{self.version}"
        return hash(identity)
|
|
320
|
+
|
|
321
|
+
|
|
267
322
|
@dataclass
|
|
268
323
|
class DatasetRecord:
|
|
269
324
|
id: int
|
|
@@ -447,20 +502,6 @@ class DatasetRecord:
|
|
|
447
502
|
identifier = self.identifier(version)
|
|
448
503
|
return f"{DATASET_PREFIX}{identifier}"
|
|
449
504
|
|
|
450
|
-
@property
|
|
451
|
-
def is_bucket_listing(self) -> bool:
|
|
452
|
-
"""
|
|
453
|
-
For bucket listing we implicitly create underlying dataset to hold data. This
|
|
454
|
-
method is checking if this is one of those datasets.
|
|
455
|
-
"""
|
|
456
|
-
from datachain.client import Client
|
|
457
|
-
|
|
458
|
-
# TODO refactor and maybe remove method in
|
|
459
|
-
# https://github.com/iterative/datachain/issues/318
|
|
460
|
-
return Client.is_data_source_uri(self.name) or self.name.startswith(
|
|
461
|
-
LISTING_PREFIX
|
|
462
|
-
)
|
|
463
|
-
|
|
464
505
|
@property
|
|
465
506
|
def versions_values(self) -> list[int]:
|
|
466
507
|
"""
|
|
@@ -499,5 +540,92 @@ class DatasetRecord:
|
|
|
499
540
|
return cls(**kwargs, versions=versions)
|
|
500
541
|
|
|
501
542
|
|
|
543
|
+
@dataclass
class DatasetListRecord:
    """
    Dataset entry together with its list of ``DatasetListVersion`` items.
    """

    id: int
    name: str
    description: Optional[str]
    labels: list[str]
    versions: list[DatasetListVersion]
    created_at: Optional[datetime] = None

    @classmethod
    def parse(  # noqa: PLR0913
        cls: type[LT],
        id: int,
        name: str,
        description: Optional[str],
        labels: str,
        created_at: datetime,
        version_id: int,
        version_uuid: str,
        version_dataset_id: int,
        version: int,
        version_status: int,
        version_created_at: datetime,
        version_finished_at: Optional[datetime],
        version_error_message: str,
        version_error_stack: str,
        version_num_objects: Optional[int],
        version_size: Optional[int],
        version_query_script: Optional[str],
        version_job_id: Optional[str] = None,
    ) -> "DatasetListRecord":
        """
        Build a record holding exactly one version from flat values.

        ``labels`` is decoded with ``json.loads``; a falsy value gives an
        empty list. The ``version*`` arguments are forwarded to
        ``DatasetListVersion.parse``.
        """
        if labels:
            labels_lst: list[str] = json.loads(labels)
        else:
            labels_lst = []

        dataset_version = DatasetListVersion.parse(
            id=version_id,
            uuid=version_uuid,
            dataset_id=version_dataset_id,
            version=version,
            status=version_status,
            created_at=version_created_at,
            finished_at=version_finished_at,
            error_message=version_error_message,
            error_stack=version_error_stack,
            num_objects=version_num_objects,
            size=version_size,
            query_script=version_query_script,  # type: ignore[arg-type]
            job_id=version_job_id,
        )

        return cls(
            id=id,
            name=name,
            description=description,
            labels=labels_lst,
            versions=[dataset_version],
            created_at=created_at,
        )

    def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
        """
        Add the versions of ``other`` to this record, in place, and return self.

        Duplicates are dropped through a set and the result is sorted by
        ``version``. Raises ``RuntimeError`` when the two records have
        different ids.
        """
        if self.id != other.id:
            raise RuntimeError("Cannot merge versions of datasets with different ids")

        if other.versions:
            current = self.versions or []
            unique_versions = set(current + other.versions)
            self.versions = sorted(unique_versions, key=lambda v: v.version)
        # when other has no versions there is nothing to merge
        return self

    @property
    def is_bucket_listing(self) -> bool:
        """
        Whether this dataset is one of the datasets created implicitly to
        hold the data of a bucket listing.
        """
        from datachain.client import Client

        # TODO refactor and maybe remove method in
        # https://github.com/iterative/datachain/issues/318
        is_source_uri = Client.is_data_source_uri(self.name)
        if is_source_uri:
            return is_source_uri
        return self.name.startswith(LISTING_PREFIX)
|
|
628
|
+
|
|
629
|
+
|
|
502
630
|
class RowDict(dict):
|
|
503
631
|
pass
|
|
@@ -5,7 +5,11 @@ from uuid import uuid4
|
|
|
5
5
|
|
|
6
6
|
from pydantic import Field, field_validator
|
|
7
7
|
|
|
8
|
-
from datachain.dataset import
|
|
8
|
+
from datachain.dataset import (
|
|
9
|
+
DatasetListRecord,
|
|
10
|
+
DatasetListVersion,
|
|
11
|
+
DatasetStatus,
|
|
12
|
+
)
|
|
9
13
|
from datachain.job import Job
|
|
10
14
|
from datachain.lib.data_model import DataModel
|
|
11
15
|
from datachain.utils import TIME_ZERO
|
|
@@ -57,8 +61,8 @@ class DatasetInfo(DataModel):
|
|
|
57
61
|
@classmethod
|
|
58
62
|
def from_models(
|
|
59
63
|
cls,
|
|
60
|
-
dataset:
|
|
61
|
-
version:
|
|
64
|
+
dataset: DatasetListRecord,
|
|
65
|
+
version: DatasetListVersion,
|
|
62
66
|
job: Optional[Job],
|
|
63
67
|
) -> "Self":
|
|
64
68
|
return cls(
|
|
@@ -178,17 +178,9 @@ class StudioClient:
|
|
|
178
178
|
data = {}
|
|
179
179
|
|
|
180
180
|
if not ok:
|
|
181
|
-
logger.error(
|
|
182
|
-
"Got bad response from Studio, content is %s",
|
|
183
|
-
response.content.decode("utf-8"),
|
|
184
|
-
)
|
|
185
181
|
if response.status_code == 403:
|
|
186
182
|
message = f"Not authorized for the team {self.team}"
|
|
187
183
|
else:
|
|
188
|
-
logger.error(
|
|
189
|
-
"Got bad response from Studio, content is %s",
|
|
190
|
-
response.content.decode("utf-8"),
|
|
191
|
-
)
|
|
192
184
|
message = data.get("message", "")
|
|
193
185
|
else:
|
|
194
186
|
message = ""
|
|
@@ -230,6 +222,46 @@ class StudioClient:
|
|
|
230
222
|
def ls_datasets(self) -> Response[LsData]:
|
|
231
223
|
return self._send_request("datachain/ls-datasets", {})
|
|
232
224
|
|
|
225
|
+
def edit_dataset(
    self,
    name: str,
    new_name: Optional[str] = None,
    description: Optional[str] = None,
    labels: Optional[list[str]] = None,
) -> Response[DatasetInfoData]:
    """
    Send a ``datachain/edit-dataset`` request for the dataset ``name``.

    ``new_name``, ``description`` and ``labels`` are added to the request
    body only when they are not ``None``.
    """
    optional_fields = {
        "new_name": new_name,
        "description": description,
        "labels": labels,
    }
    body: dict[str, object] = {"dataset_name": name}
    body.update(
        {key: value for key, value in optional_fields.items() if value is not None}
    )
    return self._send_request("datachain/edit-dataset", body)
|
|
249
|
+
|
|
250
|
+
def rm_dataset(
    self,
    name: str,
    version: Optional[int] = None,
    force: Optional[bool] = False,
) -> Response[DatasetInfoData]:
    """
    Send a ``datachain/rm-dataset`` request carrying the dataset name,
    the version and the force flag exactly as given.
    """
    payload = {
        "dataset_name": name,
        "version": version,
        "force": force,
    }
    return self._send_request("datachain/rm-dataset", payload)
|
|
264
|
+
|
|
233
265
|
def dataset_info(self, name: str) -> Response[DatasetInfoData]:
|
|
234
266
|
def _parse_dataset_info(dataset_info):
|
|
235
267
|
_parse_dates(dataset_info, ["created_at", "finished_at"])
|
|
@@ -130,6 +130,35 @@ def list_datasets(team: Optional[str] = None):
|
|
|
130
130
|
yield (name, version)
|
|
131
131
|
|
|
132
132
|
|
|
133
|
+
def edit_studio_dataset(
    team_name: Optional[str],
    name: str,
    new_name: Optional[str] = None,
    description: Optional[str] = None,
    labels: Optional[list[str]] = None,
):
    """
    Edit a dataset through a ``StudioClient`` created for ``team_name``.

    When the response is not ok, its message is passed to
    ``raise_remote_error``; afterwards a confirmation line is printed.
    """
    studio = StudioClient(team=team_name)
    result = studio.edit_dataset(name, new_name, description, labels)
    if not result.ok:
        raise_remote_error(result.message)

    print(f"Dataset {name} updated")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def remove_studio_dataset(
    team_name: Optional[str],
    name: str,
    version: Optional[int] = None,
    force: Optional[bool] = False,
):
    """
    Remove a dataset through a ``StudioClient`` created for ``team_name``.

    When the response is not ok, its message is passed to
    ``raise_remote_error``; afterwards a confirmation line is printed.
    """
    studio = StudioClient(team=team_name)
    result = studio.rm_dataset(name, version, force)
    if not result.ok:
        raise_remote_error(result.message)

    print(f"Dataset {name} removed")
|
|
160
|
+
|
|
161
|
+
|
|
133
162
|
def save_config(hostname, token):
|
|
134
163
|
config = Config(ConfigLevel.GLOBAL)
|
|
135
164
|
with config.edit() as conf:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -139,7 +139,7 @@ Key Features
|
|
|
139
139
|
============
|
|
140
140
|
|
|
141
141
|
📂 **Multimodal Dataset Versioning.**
|
|
142
|
-
- Version unstructured data without redundant data copies, by
|
|
142
|
+
- Version unstructured data without redundant data copies, by supporting
|
|
143
143
|
references to S3, GCP, Azure, and local file systems.
|
|
144
144
|
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
145
145
|
- Unite files and metadata together into persistent, versioned, columnar datasets.
|