datachain 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.1 → datachain-0.7.3}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/.github/workflows/release.yml +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/.github/workflows/tests.yml +3 -3
- {datachain-0.7.1 → datachain-0.7.3}/.pre-commit-config.yaml +1 -1
- {datachain-0.7.1/src/datachain.egg-info → datachain-0.7.3}/PKG-INFO +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/README.rst +1 -1
- datachain-0.7.3/docs/references/sql.md +18 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/openimage-detect.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/common_sql_functions.py +4 -5
- {datachain-0.7.1 → datachain-0.7.3}/examples/multimodal/clip_inference.py +3 -4
- {datachain-0.7.1 → datachain-0.7.3}/examples/multimodal/wds.py +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/examples/multimodal/wds_filtered.py +6 -10
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/__init__.py +0 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/catalog/catalog.py +12 -9
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/cli.py +109 -9
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/fsspec.py +9 -9
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/metastore.py +63 -11
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/schema.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/sqlite.py +5 -4
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/warehouse.py +18 -18
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/dataset.py +142 -14
- datachain-0.7.3/src/datachain/func/__init__.py +49 -0
- {datachain-0.7.1/src/datachain/lib → datachain-0.7.3/src/datachain}/func/aggregate.py +13 -11
- datachain-0.7.3/src/datachain/func/array.py +176 -0
- datachain-0.7.3/src/datachain/func/base.py +23 -0
- datachain-0.7.3/src/datachain/func/conditional.py +81 -0
- datachain-0.7.3/src/datachain/func/func.py +384 -0
- datachain-0.7.3/src/datachain/func/path.py +110 -0
- datachain-0.7.3/src/datachain/func/random.py +23 -0
- datachain-0.7.3/src/datachain/func/string.py +154 -0
- datachain-0.7.3/src/datachain/func/window.py +49 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/arrow.py +24 -12
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/data_model.py +25 -9
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/dataset_info.py +9 -5
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/dc.py +94 -56
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/hf.py +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/signal_schema.py +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/utils.py +1 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/webdataset_laion.py +5 -5
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/bbox.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/pose.py +5 -5
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/segment.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/nodes_fetcher.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/dataset.py +57 -34
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/remote/studio.py +40 -8
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/__init__.py +0 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/selectable.py +11 -5
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/sqlite/base.py +11 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/studio.py +29 -0
- {datachain-0.7.1 → datachain-0.7.3/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain.egg-info/SOURCES.txt +11 -3
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_catalog.py +21 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_datachain.py +37 -6
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_datasets.py +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_pull.py +1 -1
- {datachain-0.7.1 → datachain-0.7.3}/tests/test_cli_studio.py +119 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/test_query_e2e.py +30 -40
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_arrow.py +34 -6
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_datachain.py +37 -22
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_hf.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_sql_to_python.py +0 -3
- datachain-0.7.3/tests/unit/sql/sqlite/__init__.py +0 -0
- datachain-0.7.3/tests/unit/sql/test_array.py +73 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/test_conditional.py +25 -10
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/test_path.py +10 -9
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/test_random.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/test_string.py +2 -2
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_database_engine.py +15 -4
- datachain-0.7.3/tests/unit/test_func.py +256 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_session.py +2 -1
- datachain-0.7.1/docs/references/sql.md +0 -18
- datachain-0.7.1/src/datachain/lib/func/__init__.py +0 -32
- datachain-0.7.1/src/datachain/lib/func/func.py +0 -152
- datachain-0.7.1/src/datachain/sql/functions/__init__.py +0 -26
- datachain-0.7.1/tests/unit/sql/test_array.py +0 -20
- {datachain-0.7.1 → datachain-0.7.3}/.cruft.json +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.gitattributes +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/codecov.yaml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/dependabot.yml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/.gitignore +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/CONTRIBUTING.rst +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/LICENSE +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/index.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/overrides/main.html +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/datachain.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/datatype.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/file.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/index.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/torch.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/docs/references/udf.md +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/mkdocs.yml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/noxfile.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/pyproject.toml +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/setup.cfg +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/__main__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/asyn.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/cache.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/local.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/config.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/error.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/job.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/file.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/listing.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/node.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/progress.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/py.typed +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/params.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/query/session.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.1/tests/benchmarks → datachain-0.7.3/src/datachain/sql/functions}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain/utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain.egg-info/requires.txt +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/__init__.py +0 -0
- {datachain-0.7.1/tests/examples → datachain-0.7.3/tests/benchmarks}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/conftest.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/data.py +0 -0
- {datachain-0.7.1/tests/func → datachain-0.7.3/tests/examples}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/examples/test_examples.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.1/tests/unit → datachain-0.7.3/tests/func}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_client.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_listing.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_ls.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_query.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/func/test_toolkit.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/test_atomicity.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/test_telemetry.py +0 -0
- {datachain-0.7.1/tests/unit/lib → datachain-0.7.3/tests/unit}/__init__.py +0 -0
- {datachain-0.7.1/tests/unit/sql → datachain-0.7.3/tests/unit/lib}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.1/tests/unit/sql/sqlite → datachain-0.7.3/tests/unit/sql}/__init__.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_client.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_config.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_query.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.1 → datachain-0.7.3}/tests/utils.py +0 -0
|
@@ -28,7 +28,7 @@ jobs:
|
|
|
28
28
|
python-version: '3.9'
|
|
29
29
|
|
|
30
30
|
- name: Setup uv
|
|
31
|
-
uses: astral-sh/setup-uv@
|
|
31
|
+
uses: astral-sh/setup-uv@v4
|
|
32
32
|
with:
|
|
33
33
|
enable-cache: true
|
|
34
34
|
cache-suffix: lint
|
|
@@ -82,7 +82,7 @@ jobs:
|
|
|
82
82
|
python-version: ${{ matrix.pyv }}
|
|
83
83
|
|
|
84
84
|
- name: Setup uv
|
|
85
|
-
uses: astral-sh/setup-uv@
|
|
85
|
+
uses: astral-sh/setup-uv@v4
|
|
86
86
|
with:
|
|
87
87
|
enable-cache: true
|
|
88
88
|
cache-suffix: tests-${{ matrix.pyv }}
|
|
@@ -142,7 +142,7 @@ jobs:
|
|
|
142
142
|
python-version: ${{ matrix.pyv }}
|
|
143
143
|
|
|
144
144
|
- name: Setup uv
|
|
145
|
-
uses: astral-sh/setup-uv@
|
|
145
|
+
uses: astral-sh/setup-uv@v4
|
|
146
146
|
with:
|
|
147
147
|
enable-cache: true
|
|
148
148
|
cache-suffix: examples-${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -139,7 +139,7 @@ Key Features
|
|
|
139
139
|
============
|
|
140
140
|
|
|
141
141
|
📂 **Multimodal Dataset Versioning.**
|
|
142
|
-
- Version unstructured data without redundant data copies, by
|
|
142
|
+
- Version unstructured data without redundant data copies, by supporting
|
|
143
143
|
references to S3, GCP, Azure, and local file systems.
|
|
144
144
|
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
145
145
|
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
@@ -37,7 +37,7 @@ Key Features
|
|
|
37
37
|
============
|
|
38
38
|
|
|
39
39
|
📂 **Multimodal Dataset Versioning.**
|
|
40
|
-
- Version unstructured data without redundant data copies, by
|
|
40
|
+
- Version unstructured data without redundant data copies, by supporting
|
|
41
41
|
references to S3, GCP, Azure, and local file systems.
|
|
42
42
|
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
43
43
|
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# SQL
|
|
2
|
+
|
|
3
|
+
Use SQL functions to operate on the underlying database storing the chain data. Useful
|
|
4
|
+
for operations like [`DataChain.filter`](datachain.md#datachain.lib.dc.DataChain.filter)
|
|
5
|
+
and [`DataChain.mutate`](datachain.md#datachain.lib.dc.DataChain.mutate). Import
|
|
6
|
+
these functions from `datachain.func`.
|
|
7
|
+
|
|
8
|
+
::: datachain.func.avg
|
|
9
|
+
::: datachain.func.count
|
|
10
|
+
::: datachain.func.greatest
|
|
11
|
+
::: datachain.func.least
|
|
12
|
+
::: datachain.func.max
|
|
13
|
+
::: datachain.func.min
|
|
14
|
+
::: datachain.func.rand
|
|
15
|
+
::: datachain.func.sum
|
|
16
|
+
::: datachain.func.array
|
|
17
|
+
::: datachain.func.path
|
|
18
|
+
::: datachain.func.string
|
|
@@ -3,7 +3,7 @@ import json
|
|
|
3
3
|
from PIL import Image
|
|
4
4
|
|
|
5
5
|
from datachain import C, DataChain, File, model
|
|
6
|
-
from datachain.
|
|
6
|
+
from datachain.func import path
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def openimage_detect(args):
|
|
@@ -48,7 +48,7 @@ source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
|
|
|
48
48
|
.filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
|
|
49
49
|
.agg(
|
|
50
50
|
openimage_detect,
|
|
51
|
-
partition_by=path.file_stem(
|
|
51
|
+
partition_by=path.file_stem("file.path"),
|
|
52
52
|
params=["file"],
|
|
53
53
|
output={"file": File, "bbox": model.BBox},
|
|
54
54
|
)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from datachain import C, DataChain
|
|
2
|
-
from datachain.
|
|
3
|
-
from datachain.sql.functions import array, greatest, least, path, string
|
|
2
|
+
from datachain.func import array, greatest, least, path, string
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
def num_chars_udf(file):
|
|
@@ -18,7 +17,7 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
|
18
17
|
(
|
|
19
18
|
dc.mutate(
|
|
20
19
|
length=string.length(path.name(C("file.path"))),
|
|
21
|
-
parts=string.split(path.name(C("file.path")),
|
|
20
|
+
parts=string.split(path.name(C("file.path")), "."),
|
|
22
21
|
)
|
|
23
22
|
.select("file.path", "length", "parts")
|
|
24
23
|
.show(5)
|
|
@@ -35,8 +34,8 @@ dc.map(num_chars_udf, params=["file"], output={"num_chars": list[str]}).select(
|
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
chain = dc.mutate(
|
|
38
|
-
a=array.length(string.split(
|
|
39
|
-
b=array.length(string.split(path.name(
|
|
37
|
+
a=array.length(string.split("file.path", "/")),
|
|
38
|
+
b=array.length(string.split(path.name("file.path"), "0")),
|
|
40
39
|
)
|
|
41
40
|
|
|
42
41
|
(
|
|
@@ -3,8 +3,7 @@ import torch
|
|
|
3
3
|
from torch.nn.functional import cosine_similarity
|
|
4
4
|
from torch.utils.data import DataLoader
|
|
5
5
|
|
|
6
|
-
from datachain import C, DataChain
|
|
7
|
-
from datachain.sql.functions import path
|
|
6
|
+
from datachain import C, DataChain, func
|
|
8
7
|
|
|
9
8
|
source = "gs://datachain-demo/50k-laion-files/000000/00000000*"
|
|
10
9
|
|
|
@@ -18,8 +17,8 @@ def create_dataset():
|
|
|
18
17
|
)
|
|
19
18
|
return imgs.merge(
|
|
20
19
|
captions,
|
|
21
|
-
on=path.file_stem(imgs.c("file.path")),
|
|
22
|
-
right_on=path.file_stem(captions.c("file.path")),
|
|
20
|
+
on=func.path.file_stem(imgs.c("file.path")),
|
|
21
|
+
right_on=func.path.file_stem(captions.c("file.path")),
|
|
23
22
|
)
|
|
24
23
|
|
|
25
24
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
3
|
from datachain import DataChain
|
|
4
|
+
from datachain.func import path
|
|
4
5
|
from datachain.lib.webdataset import process_webdataset
|
|
5
6
|
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
|
|
6
|
-
from datachain.sql.functions import path
|
|
7
7
|
|
|
8
8
|
IMAGE_TARS = os.getenv(
|
|
9
9
|
"IMAGE_TARS", "gs://datachain-demo/datacomp-small/shards/000000[0-5]*.tar"
|
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
import datachain.error
|
|
2
|
-
from datachain import C, DataChain
|
|
2
|
+
from datachain import C, DataChain, func
|
|
3
3
|
from datachain.lib.webdataset import process_webdataset
|
|
4
4
|
from datachain.lib.webdataset_laion import WDSLaion
|
|
5
|
-
from datachain.sql import literal
|
|
6
|
-
from datachain.sql.functions import array, greatest, least, string
|
|
7
5
|
|
|
8
6
|
name = "wds"
|
|
9
7
|
try:
|
|
@@ -20,14 +18,12 @@ except datachain.error.DatasetNotFoundError:
|
|
|
20
18
|
wds.print_schema()
|
|
21
19
|
|
|
22
20
|
filtered = (
|
|
23
|
-
wds.filter(string.length(
|
|
24
|
-
.filter(array.length(string.split(
|
|
21
|
+
wds.filter(func.string.length("laion.txt") > 5)
|
|
22
|
+
.filter(func.array.length(func.string.split("laion.txt", " ")) > 2)
|
|
23
|
+
.filter(func.least("laion.json.original_width", "laion.json.original_height") > 200)
|
|
25
24
|
.filter(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
.filter(
|
|
29
|
-
greatest(C("laion.json.original_width"), C("laion.json.original_height"))
|
|
30
|
-
/ least(C("laion.json.original_width"), C("laion.json.original_height"))
|
|
25
|
+
func.greatest("laion.json.original_width", "laion.json.original_height")
|
|
26
|
+
/ func.least("laion.json.original_width", "laion.json.original_height")
|
|
31
27
|
< 3.0
|
|
32
28
|
)
|
|
33
29
|
.save()
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from datachain.lib import func
|
|
2
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
3
2
|
from datachain.lib.dc import C, Column, DataChain, Sys
|
|
4
3
|
from datachain.lib.file import (
|
|
@@ -35,7 +34,6 @@ __all__ = [
|
|
|
35
34
|
"Sys",
|
|
36
35
|
"TarVFile",
|
|
37
36
|
"TextFile",
|
|
38
|
-
"func",
|
|
39
37
|
"is_chain_type",
|
|
40
38
|
"metrics",
|
|
41
39
|
"param",
|
|
@@ -38,6 +38,7 @@ from datachain.dataset import (
|
|
|
38
38
|
DATASET_PREFIX,
|
|
39
39
|
QUERY_DATASET_PREFIX,
|
|
40
40
|
DatasetDependency,
|
|
41
|
+
DatasetListRecord,
|
|
41
42
|
DatasetRecord,
|
|
42
43
|
DatasetStats,
|
|
43
44
|
DatasetStatus,
|
|
@@ -54,7 +55,6 @@ from datachain.error import (
|
|
|
54
55
|
QueryScriptCancelError,
|
|
55
56
|
QueryScriptRunError,
|
|
56
57
|
)
|
|
57
|
-
from datachain.listing import Listing
|
|
58
58
|
from datachain.node import DirType, Node, NodeWithPath
|
|
59
59
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
60
60
|
from datachain.remote.studio import StudioClient
|
|
@@ -73,9 +73,10 @@ if TYPE_CHECKING:
|
|
|
73
73
|
AbstractMetastore,
|
|
74
74
|
AbstractWarehouse,
|
|
75
75
|
)
|
|
76
|
-
from datachain.dataset import
|
|
76
|
+
from datachain.dataset import DatasetListVersion
|
|
77
77
|
from datachain.job import Job
|
|
78
78
|
from datachain.lib.file import File
|
|
79
|
+
from datachain.listing import Listing
|
|
79
80
|
|
|
80
81
|
logger = logging.getLogger("datachain")
|
|
81
82
|
|
|
@@ -236,7 +237,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
236
237
|
class NodeGroup:
|
|
237
238
|
"""Class for a group of nodes from the same source"""
|
|
238
239
|
|
|
239
|
-
listing: Listing
|
|
240
|
+
listing: "Listing"
|
|
240
241
|
sources: list[DataSource]
|
|
241
242
|
|
|
242
243
|
# The source path within the bucket
|
|
@@ -591,8 +592,9 @@ class Catalog:
|
|
|
591
592
|
client_config=None,
|
|
592
593
|
object_name="file",
|
|
593
594
|
skip_indexing=False,
|
|
594
|
-
) -> tuple[Listing, str]:
|
|
595
|
+
) -> tuple["Listing", str]:
|
|
595
596
|
from datachain.lib.dc import DataChain
|
|
597
|
+
from datachain.listing import Listing
|
|
596
598
|
|
|
597
599
|
DataChain.from_storage(
|
|
598
600
|
source, session=self.session, update=update, object_name=object_name
|
|
@@ -660,7 +662,8 @@ class Catalog:
|
|
|
660
662
|
no_glob: bool = False,
|
|
661
663
|
client_config=None,
|
|
662
664
|
) -> list[NodeGroup]:
|
|
663
|
-
from datachain.
|
|
665
|
+
from datachain.listing import Listing
|
|
666
|
+
from datachain.query.dataset import DatasetQuery
|
|
664
667
|
|
|
665
668
|
def _row_to_node(d: dict[str, Any]) -> Node:
|
|
666
669
|
del d["file__source"]
|
|
@@ -876,7 +879,7 @@ class Catalog:
|
|
|
876
879
|
def update_dataset_version_with_warehouse_info(
|
|
877
880
|
self, dataset: DatasetRecord, version: int, rows_dropped=False, **kwargs
|
|
878
881
|
) -> None:
|
|
879
|
-
from datachain.query import DatasetQuery
|
|
882
|
+
from datachain.query.dataset import DatasetQuery
|
|
880
883
|
|
|
881
884
|
dataset_version = dataset.get_version(version)
|
|
882
885
|
|
|
@@ -1133,7 +1136,7 @@ class Catalog:
|
|
|
1133
1136
|
|
|
1134
1137
|
return direct_dependencies
|
|
1135
1138
|
|
|
1136
|
-
def ls_datasets(self, include_listing: bool = False) -> Iterator[
|
|
1139
|
+
def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
|
|
1137
1140
|
datasets = self.metastore.list_datasets()
|
|
1138
1141
|
for d in datasets:
|
|
1139
1142
|
if not d.is_bucket_listing or include_listing:
|
|
@@ -1142,7 +1145,7 @@ class Catalog:
|
|
|
1142
1145
|
def list_datasets_versions(
|
|
1143
1146
|
self,
|
|
1144
1147
|
include_listing: bool = False,
|
|
1145
|
-
) -> Iterator[tuple[
|
|
1148
|
+
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
|
|
1146
1149
|
"""Iterate over all dataset versions with related jobs."""
|
|
1147
1150
|
datasets = list(self.ls_datasets(include_listing=include_listing))
|
|
1148
1151
|
|
|
@@ -1177,7 +1180,7 @@ class Catalog:
|
|
|
1177
1180
|
def ls_dataset_rows(
|
|
1178
1181
|
self, name: str, version: int, offset=None, limit=None
|
|
1179
1182
|
) -> list[dict]:
|
|
1180
|
-
from datachain.query import DatasetQuery
|
|
1183
|
+
from datachain.query.dataset import DatasetQuery
|
|
1181
1184
|
|
|
1182
1185
|
dataset = self.get_dataset(name)
|
|
1183
1186
|
|
|
@@ -18,7 +18,12 @@ from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyVa
|
|
|
18
18
|
from datachain.config import Config
|
|
19
19
|
from datachain.error import DataChainError
|
|
20
20
|
from datachain.lib.dc import DataChain
|
|
21
|
-
from datachain.studio import
|
|
21
|
+
from datachain.studio import (
|
|
22
|
+
edit_studio_dataset,
|
|
23
|
+
list_datasets,
|
|
24
|
+
process_studio_cli_args,
|
|
25
|
+
remove_studio_dataset,
|
|
26
|
+
)
|
|
22
27
|
from datachain.telemetry import telemetry
|
|
23
28
|
|
|
24
29
|
if TYPE_CHECKING:
|
|
@@ -403,21 +408,44 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
403
408
|
parse_edit_dataset.add_argument(
|
|
404
409
|
"--new-name",
|
|
405
410
|
action="store",
|
|
406
|
-
default="",
|
|
407
411
|
help="Dataset new name",
|
|
408
412
|
)
|
|
409
413
|
parse_edit_dataset.add_argument(
|
|
410
414
|
"--description",
|
|
411
415
|
action="store",
|
|
412
|
-
default="",
|
|
413
416
|
help="Dataset description",
|
|
414
417
|
)
|
|
415
418
|
parse_edit_dataset.add_argument(
|
|
416
419
|
"--labels",
|
|
417
|
-
default=[],
|
|
418
420
|
nargs="+",
|
|
419
421
|
help="Dataset labels",
|
|
420
422
|
)
|
|
423
|
+
parse_edit_dataset.add_argument(
|
|
424
|
+
"--studio",
|
|
425
|
+
action="store_true",
|
|
426
|
+
default=False,
|
|
427
|
+
help="Edit dataset from Studio",
|
|
428
|
+
)
|
|
429
|
+
parse_edit_dataset.add_argument(
|
|
430
|
+
"-L",
|
|
431
|
+
"--local",
|
|
432
|
+
action="store_true",
|
|
433
|
+
default=False,
|
|
434
|
+
help="Edit local dataset only",
|
|
435
|
+
)
|
|
436
|
+
parse_edit_dataset.add_argument(
|
|
437
|
+
"-a",
|
|
438
|
+
"--all",
|
|
439
|
+
action="store_true",
|
|
440
|
+
default=True,
|
|
441
|
+
help="Edit both datasets from studio and local",
|
|
442
|
+
)
|
|
443
|
+
parse_edit_dataset.add_argument(
|
|
444
|
+
"--team",
|
|
445
|
+
action="store",
|
|
446
|
+
default=None,
|
|
447
|
+
help="The team to edit a dataset. By default, it will use team from config.",
|
|
448
|
+
)
|
|
421
449
|
|
|
422
450
|
datasets_parser = subp.add_parser(
|
|
423
451
|
"datasets", parents=[parent_parser], description="List datasets"
|
|
@@ -466,6 +494,32 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
466
494
|
action=BooleanOptionalAction,
|
|
467
495
|
help="Force delete registered dataset with all of it's versions",
|
|
468
496
|
)
|
|
497
|
+
rm_dataset_parser.add_argument(
|
|
498
|
+
"--studio",
|
|
499
|
+
action="store_true",
|
|
500
|
+
default=False,
|
|
501
|
+
help="Remove dataset from Studio",
|
|
502
|
+
)
|
|
503
|
+
rm_dataset_parser.add_argument(
|
|
504
|
+
"-L",
|
|
505
|
+
"--local",
|
|
506
|
+
action="store_true",
|
|
507
|
+
default=False,
|
|
508
|
+
help="Remove local datasets only",
|
|
509
|
+
)
|
|
510
|
+
rm_dataset_parser.add_argument(
|
|
511
|
+
"-a",
|
|
512
|
+
"--all",
|
|
513
|
+
action="store_true",
|
|
514
|
+
default=True,
|
|
515
|
+
help="Remove both local and studio",
|
|
516
|
+
)
|
|
517
|
+
rm_dataset_parser.add_argument(
|
|
518
|
+
"--team",
|
|
519
|
+
action="store",
|
|
520
|
+
default=None,
|
|
521
|
+
help="The team to delete a dataset. By default, it will use team from config.",
|
|
522
|
+
)
|
|
469
523
|
|
|
470
524
|
dataset_stats_parser = subp.add_parser(
|
|
471
525
|
"dataset-stats",
|
|
@@ -909,8 +963,40 @@ def rm_dataset(
|
|
|
909
963
|
name: str,
|
|
910
964
|
version: Optional[int] = None,
|
|
911
965
|
force: Optional[bool] = False,
|
|
966
|
+
studio: bool = False,
|
|
967
|
+
local: bool = False,
|
|
968
|
+
all: bool = True,
|
|
969
|
+
team: Optional[str] = None,
|
|
970
|
+
):
|
|
971
|
+
token = Config().read().get("studio", {}).get("token")
|
|
972
|
+
all, local, studio = _determine_flavors(studio, local, all, token)
|
|
973
|
+
|
|
974
|
+
if all or local:
|
|
975
|
+
catalog.remove_dataset(name, version=version, force=force)
|
|
976
|
+
|
|
977
|
+
if (all or studio) and token:
|
|
978
|
+
remove_studio_dataset(team, name, version, force)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def edit_dataset(
|
|
982
|
+
catalog: "Catalog",
|
|
983
|
+
name: str,
|
|
984
|
+
new_name: Optional[str] = None,
|
|
985
|
+
description: Optional[str] = None,
|
|
986
|
+
labels: Optional[list[str]] = None,
|
|
987
|
+
studio: bool = False,
|
|
988
|
+
local: bool = False,
|
|
989
|
+
all: bool = True,
|
|
990
|
+
team: Optional[str] = None,
|
|
912
991
|
):
|
|
913
|
-
|
|
992
|
+
token = Config().read().get("studio", {}).get("token")
|
|
993
|
+
all, local, studio = _determine_flavors(studio, local, all, token)
|
|
994
|
+
|
|
995
|
+
if all or local:
|
|
996
|
+
catalog.edit_dataset(name, new_name, description, labels)
|
|
997
|
+
|
|
998
|
+
if (all or studio) and token:
|
|
999
|
+
edit_studio_dataset(team, name, new_name, description, labels)
|
|
914
1000
|
|
|
915
1001
|
|
|
916
1002
|
def dataset_stats(
|
|
@@ -957,7 +1043,7 @@ def show(
|
|
|
957
1043
|
schema: bool = False,
|
|
958
1044
|
) -> None:
|
|
959
1045
|
from datachain.lib.dc import DataChain
|
|
960
|
-
from datachain.query import DatasetQuery
|
|
1046
|
+
from datachain.query.dataset import DatasetQuery
|
|
961
1047
|
from datachain.utils import show_records
|
|
962
1048
|
|
|
963
1049
|
dataset = catalog.get_dataset(name)
|
|
@@ -1127,11 +1213,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1127
1213
|
edatachain_file=args.edatachain_file,
|
|
1128
1214
|
)
|
|
1129
1215
|
elif args.command == "edit-dataset":
|
|
1130
|
-
|
|
1216
|
+
edit_dataset(
|
|
1217
|
+
catalog,
|
|
1131
1218
|
args.name,
|
|
1132
|
-
description=args.description,
|
|
1133
1219
|
new_name=args.new_name,
|
|
1220
|
+
description=args.description,
|
|
1134
1221
|
labels=args.labels,
|
|
1222
|
+
studio=args.studio,
|
|
1223
|
+
local=args.local,
|
|
1224
|
+
all=args.all,
|
|
1225
|
+
team=args.team,
|
|
1135
1226
|
)
|
|
1136
1227
|
elif args.command == "ls":
|
|
1137
1228
|
ls(
|
|
@@ -1164,7 +1255,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1164
1255
|
schema=args.schema,
|
|
1165
1256
|
)
|
|
1166
1257
|
elif args.command == "rm-dataset":
|
|
1167
|
-
rm_dataset(
|
|
1258
|
+
rm_dataset(
|
|
1259
|
+
catalog,
|
|
1260
|
+
args.name,
|
|
1261
|
+
version=args.version,
|
|
1262
|
+
force=args.force,
|
|
1263
|
+
studio=args.studio,
|
|
1264
|
+
local=args.local,
|
|
1265
|
+
all=args.all,
|
|
1266
|
+
team=args.team,
|
|
1267
|
+
)
|
|
1168
1268
|
elif args.command == "dataset-stats":
|
|
1169
1269
|
dataset_stats(
|
|
1170
1270
|
catalog,
|
|
@@ -28,7 +28,6 @@ from tqdm import tqdm
|
|
|
28
28
|
from datachain.cache import DataChainCache
|
|
29
29
|
from datachain.client.fileslice import FileWrapper
|
|
30
30
|
from datachain.error import ClientError as DataChainClientError
|
|
31
|
-
from datachain.lib.file import File
|
|
32
31
|
from datachain.nodes_fetcher import NodesFetcher
|
|
33
32
|
from datachain.nodes_thread_pool import NodeChunk
|
|
34
33
|
|
|
@@ -36,6 +35,7 @@ if TYPE_CHECKING:
|
|
|
36
35
|
from fsspec.spec import AbstractFileSystem
|
|
37
36
|
|
|
38
37
|
from datachain.dataset import StorageURI
|
|
38
|
+
from datachain.lib.file import File
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
logger = logging.getLogger("datachain")
|
|
@@ -45,7 +45,7 @@ DELIMITER = "/" # Path delimiter.
|
|
|
45
45
|
|
|
46
46
|
DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
|
|
47
47
|
|
|
48
|
-
ResultQueue = asyncio.Queue[Optional[Sequence[File]]]
|
|
48
|
+
ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def _is_win_local_path(uri: str) -> bool:
|
|
@@ -212,7 +212,7 @@ class Client(ABC):
|
|
|
212
212
|
|
|
213
213
|
async def scandir(
|
|
214
214
|
self, start_prefix: str, method: str = "default"
|
|
215
|
-
) -> AsyncIterator[Sequence[File]]:
|
|
215
|
+
) -> AsyncIterator[Sequence["File"]]:
|
|
216
216
|
try:
|
|
217
217
|
impl = getattr(self, f"_fetch_{method}")
|
|
218
218
|
except AttributeError:
|
|
@@ -317,7 +317,7 @@ class Client(ABC):
|
|
|
317
317
|
return f"{self.PREFIX}{self.name}/{rel_path}"
|
|
318
318
|
|
|
319
319
|
@abstractmethod
|
|
320
|
-
def info_to_file(self, v: dict[str, Any], parent: str) -> File: ...
|
|
320
|
+
def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
|
|
321
321
|
|
|
322
322
|
def fetch_nodes(
|
|
323
323
|
self,
|
|
@@ -354,7 +354,7 @@ class Client(ABC):
|
|
|
354
354
|
copy2(src, dst)
|
|
355
355
|
|
|
356
356
|
def open_object(
|
|
357
|
-
self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
|
|
357
|
+
self, file: "File", use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
|
|
358
358
|
) -> BinaryIO:
|
|
359
359
|
"""Open a file, including files in tar archives."""
|
|
360
360
|
if use_cache and (cache_path := self.cache.get_path(file)):
|
|
@@ -362,19 +362,19 @@ class Client(ABC):
|
|
|
362
362
|
assert not file.location
|
|
363
363
|
return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb) # type: ignore[return-value]
|
|
364
364
|
|
|
365
|
-
def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
365
|
+
def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
366
366
|
sync(get_loop(), functools.partial(self._download, file, callback=callback))
|
|
367
367
|
|
|
368
|
-
async def _download(self, file: File, *, callback: "Callback" = None) -> None:
|
|
368
|
+
async def _download(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
369
369
|
if self.cache.contains(file):
|
|
370
370
|
# Already in cache, so there's nothing to do.
|
|
371
371
|
return
|
|
372
372
|
await self._put_in_cache(file, callback=callback)
|
|
373
373
|
|
|
374
|
-
def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
374
|
+
def put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
375
375
|
sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
|
|
376
376
|
|
|
377
|
-
async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
377
|
+
async def _put_in_cache(self, file: "File", *, callback: "Callback" = None) -> None:
|
|
378
378
|
assert not file.location
|
|
379
379
|
if file.etag:
|
|
380
380
|
etag = await self.get_current_etag(file)
|