datachain 0.7.8__tar.gz → 0.7.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of datachain as potentially problematic (details are available on the release page).
- {datachain-0.7.8/src/datachain.egg-info → datachain-0.7.9}/PKG-INFO +2 -2
- {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/hf-dataset-llm-eval.py +6 -3
- {datachain-0.7.8 → datachain-0.7.9}/pyproject.toml +1 -1
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cli.py +9 -3
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/metastore.py +3 -2
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/dc.py +1 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/pytorch.py +54 -37
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/remote/studio.py +44 -25
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/studio.py +2 -2
- {datachain-0.7.8 → datachain-0.7.9/src/datachain.egg-info}/PKG-INFO +2 -2
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/requires.txt +1 -1
- {datachain-0.7.8 → datachain-0.7.9}/tests/conftest.py +1 -1
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_catalog.py +32 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_ls.py +2 -2
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_pull.py +13 -13
- {datachain-0.7.8 → datachain-0.7.9}/tests/test_cli_studio.py +4 -2
- {datachain-0.7.8 → datachain-0.7.9}/.cruft.json +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.gitattributes +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/codecov.yaml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/dependabot.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/release.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/tests.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.gitignore +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/.pre-commit-config.yaml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/CONTRIBUTING.rst +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/LICENSE +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/README.rst +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/index.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/overrides/main.html +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/datachain.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/datatype.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/file.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/index.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/sql.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/torch.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/docs/references/udf.md +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/mkdocs.yml +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/noxfile.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/setup.cfg +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/__main__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/asyn.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cache.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/local.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/config.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/dataset.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/error.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/array.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/base.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/func.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/numeric.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/path.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/random.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/string.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/func/window.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/job.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/file.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/listing.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/node.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/progress.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/py.typed +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/dataset.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/params.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/query/session.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain/utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/SOURCES.txt +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/data.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/examples/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/examples/test_examples.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_client.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_datachain.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_listing.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_query.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/func/test_toolkit.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/test_atomicity.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/test_telemetry.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_client.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_config.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_func.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_session.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.8 → datachain-0.7.9}/tests/utils.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.8
+Version: 0.7.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0

@@ -98,7 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.…; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
 
 ================
 |logo| DataChain
examples/llm_and_nlp/hf-dataset-llm-eval.py

@@ -15,9 +15,11 @@ class DialogEval(DataModel):
 
 # DataChain function to evaluate dialog.
 # DataChain is using types for inputs, results to automatically infer schema.
-def eval_dialog(…
-    client…
-…
+def eval_dialog(
+    client: InferenceClient,
+    user_input: str,
+    bot_response: str,
+) -> DialogEval:
     completion = client.chat_completion(
         messages=[
             {

@@ -44,6 +46,7 @@ def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
         "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
     )
     .settings(parallel=10)
+    .setup(client=lambda: InferenceClient("meta-llama/Llama-3.1-70B-Instruct"))
     .map(response=eval_dialog)
     .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
 )
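The example change above moves client creation out of the per-row UDF and into a `.setup()` hook: the factory runs once per worker and its result is injected into the UDF through the matching parameter name. Below is a minimal sketch of the same pattern; `score` and the `object()` stand-in are illustrative placeholders, not part of datachain.

from datachain import DataChain

def score(client, user_input: str, bot_response: str) -> int:
    # `client` is supplied by .setup(); the other arguments come from chain columns.
    # Placeholder scoring logic standing in for a real model call.
    return len(bot_response)

chain = (
    DataChain.from_values(
        user_input=["How much is my plan?"],
        bot_response=["Your plan is $10/month."],
    )
    .settings(parallel=2)
    # The factory runs once per worker; its result is reused for every row.
    .setup(client=lambda: object())  # stand-in for InferenceClient("...")
    .map(score=score)
)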
src/datachain/cli.py

@@ -16,7 +16,7 @@ from tabulate import tabulate
 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.config import Config
-from datachain.error import DataChainError
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.lib.dc import DataChain
 from datachain.studio import (
     edit_studio_dataset,

@@ -1056,7 +1056,10 @@ def rm_dataset(
     all, local, studio = _determine_flavors(studio, local, all, token)
 
     if all or local:
-        …
+        try:
+            catalog.remove_dataset(name, version=version, force=force)
+        except DatasetNotFoundError:
+            print("Dataset not found in local", file=sys.stderr)
 
     if (all or studio) and token:
         remove_studio_dataset(team, name, version, force)

@@ -1077,7 +1080,10 @@ def edit_dataset(
     all, local, studio = _determine_flavors(studio, local, all, token)
 
     if all or local:
-        …
+        try:
+            catalog.edit_dataset(name, new_name, description, labels)
+        except DatasetNotFoundError:
+            print("Dataset not found in local", file=sys.stderr)
 
     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)
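With this change, `datasets rm` and `datasets edit` warn instead of aborting when the dataset only exists in Studio and not in the local catalog. A hedged sketch of driving the same code path through the CLI entry point, mirroring the argv used in tests/test_cli_studio.py and assuming main() lives in datachain.cli:

from datachain.cli import main

# Removing a Studio-only dataset no longer raises DatasetNotFoundError locally;
# the local miss is reported on stderr and the Studio removal still runs.
exit_code = main(["datasets", "rm", "name", "--team", "team_name", "--studio"])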
src/datachain/data_storage/metastore.py

@@ -725,9 +725,10 @@ class AbstractDBMetastore(AbstractMetastore):
 
     def list_datasets(self) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
-        …
-            self.…
+        query = self._base_list_datasets_query().order_by(
+            self._datasets.c.name, self._datasets_versions.c.version
         )
+        yield from self._parse_dataset_list(self.db.execute(query))
 
     def list_datasets_by_prefix(
         self, prefix: str, conn=None
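The rewritten list_datasets() adds an explicit ORDER BY on (name, version), which the new test_ls_datasets_ordered further down depends on. One practical consequence, sketched here with an illustrative helper name, is that consumers can group the stream without re-sorting it first:

from itertools import groupby

def versions_by_name(metastore):
    # Valid only because list_datasets() now yields records ordered by (name, version);
    # record.name and record.versions[i].version follow the fields the test asserts on.
    for name, records in groupby(metastore.list_datasets(), key=lambda d: d.name):
        yield name, [v.version for d in records for v in d.versions]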
src/datachain/lib/pytorch.py

@@ -10,8 +10,10 @@ from torchvision.transforms import v2
 from tqdm import tqdm
 
 from datachain import Session
+from datachain.asyn import AsyncMapper
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
+from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 
 if TYPE_CHECKING:

@@ -30,6 +32,8 @@ def label_to_int(value: str, classes: list) -> int:
 
 
 class PytorchDataset(IterableDataset):
+    prefetch: int = 2
+
     def __init__(
         self,
         name: str,

@@ -39,6 +43,7 @@ class PytorchDataset(IterableDataset):
         tokenizer: Optional[Callable] = None,
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
         num_samples: int = 0,
+        dc_settings: Optional[Settings] = None,
     ):
         """
         Pytorch IterableDataset that streams DataChain datasets.

@@ -66,6 +71,11 @@ class PytorchDataset(IterableDataset):
         catalog = get_catalog()
         self._init_catalog(catalog)
 
+        dc_settings = dc_settings or Settings()
+        self.cache = dc_settings.cache
+        if (prefetch := dc_settings.prefetch) is not None:
+            self.prefetch = prefetch
+
     def _init_catalog(self, catalog: "Catalog"):
         # For compatibility with multiprocessing,
         # we can only store params in __init__(), as Catalog isn't picklable

@@ -82,51 +92,58 @@ class PytorchDataset(IterableDataset):
         wh = wh_cls(*wh_args, **wh_kwargs)
         return Catalog(ms, wh, **self._catalog_params)
 
-    def …
-    …
-    …
-        session = Session.get(catalog=self.catalog)
-        total_rank, total_workers = self.get_rank_and_workers()
+    def _rows_iter(self, total_rank: int, total_workers: int):
+        catalog = self._get_catalog()
+        session = Session("PyTorch", catalog=catalog)
         ds = DataChain.from_dataset(
             name=self.name, version=self.version, session=session
-        )
+        ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
 
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
+        yield from ds.collect()
+
+    def __iter__(self) -> Iterator[Any]:
+        total_rank, total_workers = self.get_rank_and_workers()
+        rows = self._rows_iter(total_rank, total_workers)
+        if self.prefetch > 0:
+            from datachain.lib.udf import _prefetch_input
+
+            rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
+
         desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-        with tqdm(desc=desc, unit=" rows") as …
-        … (removed lines 100-128 are not recoverable from the source rendering)
-            pbar.update(1)
+        with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
+            yield from map(self._process_row, rows_it)
+
+    def _process_row(self, row_features):
+        row = []
+        for fr in row_features:
+            if hasattr(fr, "read"):
+                row.append(fr.read())  # type: ignore[unreachable]
+            else:
+                row.append(fr)
+        # Apply transforms
+        if self.transform:
+            try:
+                if isinstance(self.transform, v2.Transform):
+                    row = self.transform(row)
+                for i, val in enumerate(row):
+                    if isinstance(val, Image.Image):
+                        row[i] = self.transform(val)
+            except ValueError:
+                logger.warning("Skipping transform due to unsupported data types.")
+                self.transform = None
+        if self.tokenizer:
+            for i, val in enumerate(row):
+                if isinstance(val, str) or (
+                    isinstance(val, list) and isinstance(val[0], str)
+                ):
+                    row[i] = convert_text(
+                        val, self.tokenizer, self.tokenizer_kwargs
+                    ).squeeze(0)  # type: ignore[union-attr]
+        return row
 
     @staticmethod
     def get_rank_and_workers() -> tuple[int, int]:
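The loader refactor above separates row fetching (_rows_iter) from row processing (_process_row) and adds optional row prefetching via AsyncMapper, driven by the new dc_settings argument (cache and prefetch). A hedged usage sketch follows, assuming the chain-level settings() values are forwarded to the loader through to_pytorch(); the dataset name is hypothetical.

from torch.utils.data import DataLoader
from torchvision.transforms import v2

from datachain import DataChain

# Hypothetical dataset name; with prefetch=4 the loader keeps up to four rows
# in flight per worker, and cache=True reuses locally cached file contents.
ds = (
    DataChain.from_dataset("fashion-product-images")
    .settings(cache=True, prefetch=4)
    .to_pytorch(transform=v2.ToTensor())
)
loader = DataLoader(ds, batch_size=16, num_workers=2)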
src/datachain/remote/studio.py

@@ -119,18 +119,27 @@ class StudioClient:
             "\tpip install 'datachain[remote]'"
         ) from None
 
-    def _send_request_msgpack(…
+    def _send_request_msgpack(
+        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+    ) -> Response[Any]:
         import msgpack
         import requests
 
-        …
-        …
-        …
+        kwargs = (
+            {"params": {**data, "team_name": self.team}}
+            if method == "GET"
+            else {"json": {**data, "team_name": self.team}}
+        )
+
+        response = requests.request(
+            method=method,  # type: ignore[arg-type]
+            url=f"{self.url}/{route}",
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
             },
             timeout=self.timeout,
+            **kwargs,  # type: ignore[arg-type]
         )
         ok = response.ok
         if not ok:

@@ -148,7 +157,9 @@ class StudioClient:
         return Response(response_data, ok, message)
 
     @retry_with_backoff(retries=5)
-    def _send_request(…
+    def _send_request(
+        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+    ) -> Response[Any]:
         """
         Function that communicate Studio API.
         It will raise an exception, and try to retry, if 5xx status code is

@@ -157,14 +168,21 @@ class StudioClient:
         """
         import requests
 
-        …
-        …
-        …
+        kwargs = (
+            {"params": {**data, "team_name": self.team}}
+            if method == "GET"
+            else {"json": {**data, "team_name": self.team}}
+        )
+
+        response = requests.request(
+            method=method,  # type: ignore[arg-type]
+            url=f"{self.url}/{route}",
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
             },
             timeout=self.timeout,
+            **kwargs,  # type: ignore[arg-type]
         )
         try:
             response.raise_for_status()

@@ -222,7 +240,7 @@ class StudioClient:
             yield path, response
 
     def ls_datasets(self) -> Response[LsData]:
-        return self._send_request("datachain/…
+        return self._send_request("datachain/datasets", {}, method="GET")
 
     def edit_dataset(
         self,

@@ -232,20 +250,14 @@ class StudioClient:
         labels: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
+            "new_name": new_name,
             "dataset_name": name,
+            "description": description,
+            "labels": labels,
         }
 
-        if new_name is not None:
-            body["new_name"] = new_name
-
-        if description is not None:
-            body["description"] = description
-
-        if labels is not None:
-            body["labels"] = labels  # type: ignore[assignment]
-
         return self._send_request(
-            "datachain/…
+            "datachain/datasets",
             body,
         )

@@ -256,12 +268,13 @@ class StudioClient:
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
-            "datachain/…
+            "datachain/datasets",
             {
                 "dataset_name": name,
                 "version": version,
                 "force": force,
             },
+            method="DELETE",
         )
 
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:

@@ -272,7 +285,9 @@ class StudioClient:
 
             return dataset_info
 
-        response = self._send_request(…
+        response = self._send_request(
+            "datachain/datasets/info", {"dataset_name": name}, method="GET"
+        )
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response

@@ -282,14 +297,16 @@ class StudioClient:
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
-            "datachain/…
+            "datachain/datasets/rows",
             {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
+            method="GET",
         )
 
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
-            "datachain/…
+            "datachain/datasets/stats",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
         )
         if response.ok:
             response.data = DatasetStats(**response.data)

@@ -299,16 +316,18 @@ class StudioClient:
         self, name: str, version: int
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
-            "datachain/…
+            "datachain/datasets/export",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
         )
 
     def dataset_export_status(
         self, name: str, version: int
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
-            "datachain/…
+            "datachain/datasets/export-status",
             {"dataset_name": name, "dataset_version": version},
+            method="GET",
         )
 
     def upload_file(self, file_name: str, content: bytes) -> Response[FileUploadData]:
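The client refactor above replaces hard-coded POSTs with requests.request and an HTTP verb chosen per endpoint (GET for reads, DELETE for removal), carrying the payload as query parameters for GET and as a JSON body otherwise. A minimal standalone sketch of that dispatch, with illustrative names rather than the StudioClient API:

import requests

def send(url: str, data: dict, token: str, team: str, method: str = "POST"):
    # GET payloads travel as query parameters; other verbs send a JSON body.
    payload = {**data, "team_name": team}
    kwargs = {"params": payload} if method == "GET" else {"json": payload}
    return requests.request(
        method=method,
        url=url,
        headers={"Authorization": f"token {token}"},
        timeout=30,
        **kwargs,
    )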
src/datachain/studio.py

@@ -155,7 +155,7 @@ def edit_studio_dataset(
     if not response.ok:
         raise_remote_error(response.message)
 
-    print(f"Dataset {name} updated")
+    print(f"Dataset '{name}' updated in Studio")
 
 
 def remove_studio_dataset(

@@ -169,7 +169,7 @@ def remove_studio_dataset(
     if not response.ok:
         raise_remote_error(response.message)
 
-    print(f"Dataset {name} removed")
+    print(f"Dataset '{name}' removed from Studio")
 
 
 def save_config(hostname, token):
src/datachain.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.8
+Version: 0.7.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0

@@ -98,7 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.…; extra == "examples"
+Requires-Dist: ultralytics==8.3.37; extra == "examples"
 
 ================
 |logo| DataChain
tests/func/test_catalog.py

@@ -772,6 +772,38 @@ def test_dataset_stats(test_session):
     assert dataset_version2.size == 18
 
 
+def test_ls_datasets_ordered(test_session):
+    ids = [1, 2, 3]
+    values = tuple(zip(["a", "b", "c"], ids))
+
+    assert not list(test_session.catalog.ls_datasets())
+
+    dc = DataChain.from_values(
+        ids=ids,
+        file=[File(path=name, size=size) for name, size in values],
+        session=test_session,
+    )
+    dc.save("cats")
+    dc.save("dogs")
+    dc.save("cats")
+    dc.save("cats")
+    dc.save("cats")
+    datasets = list(test_session.catalog.ls_datasets())
+
+    assert [
+        (d.name, v.version)
+        for d in datasets
+        for v in d.versions
+        if not d.name.startswith("session_")
+    ] == [
+        ("cats", 1),
+        ("cats", 2),
+        ("cats", 3),
+        ("cats", 4),
+        ("dogs", 1),
+    ]
+
+
 def test_ls_datasets_no_json(test_session):
     ids = [1, 2, 3]
     values = tuple(zip(["a", "b", "c"], [1, 2, 3]))
tests/func/test_ls.py

@@ -193,7 +193,7 @@ class MockResponse:
         self.ok = ok
 
 
-def mock_post(url, data=None, json=None, **kwargs):
+def mock_post(method, url, data=None, json=None, **kwargs):
     source = json["source"]
     path = re.sub(r"\w+://[^/]+/?", "", source).rstrip("/")
     data = [

@@ -238,7 +238,7 @@ dog3
 def test_ls_remote_sources(cloud_type, capsys, monkeypatch, studio_config):
     src = f"{cloud_type}://bucket"
     with monkeypatch.context() as m:
-        m.setattr("requests.…
+        m.setattr("requests.request", mock_post)
         ls([src, f"{src}/dogs/others", f"{src}/dogs"], studio=True)
         captured = capsys.readouterr()
         assert captured.out == ls_remote_sources_output.format(src=src)
tests/func/test_pull.py

@@ -150,28 +150,28 @@ def remote_dataset_chunk_url():
 
 @pytest.fixture
 def remote_dataset_info(requests_mock, remote_dataset):
-    requests_mock.…
+    requests_mock.get(f"{STUDIO_URL}/api/datachain/datasets/info", json=remote_dataset)
 
 
 @pytest.fixture
 def remote_dataset_stats(requests_mock):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/stats",
         json={"num_objects": 5, "size": 1000},
     )
 
 
 @pytest.fixture
 def dataset_export(requests_mock, remote_dataset_chunk_url):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export", json=[remote_dataset_chunk_url]
     )
 
 
 @pytest.fixture
 def dataset_export_status(requests_mock):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export-status",
         json={"status": "completed"},
     )
 

@@ -303,8 +303,8 @@ def test_pull_dataset_not_found_in_remote(
     requests_mock,
     cloud_test_catalog,
 ):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/info",
         status_code=404,
         json={"message": "Dataset not found"},
     )

@@ -322,8 +322,8 @@ def test_pull_dataset_error_on_fetching_stats(
     cloud_test_catalog,
     remote_dataset_info,
 ):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/stats",
        status_code=400,
        json={"message": "Internal error"},
    )

@@ -345,8 +345,8 @@ def test_pull_dataset_exporting_dataset_failed_in_remote(
     dataset_export,
     export_status,
 ):
-    requests_mock.…
-        f"{STUDIO_URL}/api/datachain/…
+    requests_mock.get(
+        f"{STUDIO_URL}/api/datachain/datasets/export-status",
         json={"status": export_status},
     )
tests/test_cli_studio.py

@@ -169,7 +169,7 @@ def test_studio_datasets(capsys, studio_datasets, mocker):
 
 def test_studio_edit_dataset(capsys, mocker):
     with requests_mock.mock() as m:
-        m.post(f"{STUDIO_URL}/api/datachain/…
+        m.post(f"{STUDIO_URL}/api/datachain/datasets", json={})
 
         # Studio token is required
         assert (

@@ -217,6 +217,8 @@ def test_studio_edit_dataset(capsys, mocker):
             "dataset_name": "name",
             "new_name": "new-name",
             "team_name": "team_name",
+            "description": None,
+            "labels": None,
         }
 
         # With all arguments

@@ -251,7 +253,7 @@ def test_studio_edit_dataset(capsys, mocker):
 
 def test_studio_rm_dataset(capsys, mocker):
     with requests_mock.mock() as m:
-        m.…
+        m.delete(f"{STUDIO_URL}/api/datachain/datasets", json={})
 
         # Studio token is required
         assert main(["datasets", "rm", "name", "--team", "team_name", "--studio"]) == 1